Example #1
    def __init__(self, workbench, data_path = "/home/moritz/DataBases/genomes/RefSeq/", clean = False):
        Database.__init__(self,workbench = workbench, data_path = data_path)

        if not os.path.exists(self.metadata_file) or clean:
            ftp = FTP(ncbi)

            print "Getting metadata from ncbi"

            FNULL = open(os.devnull, 'w')
            ftp.login()
            ftp.cwd('genomes/refseq/bacteria/')
            info = StringIO.StringIO()
            ftp.retrbinary("RETR " + "assembly_summary.txt", info.write)
            info.seek(0)
            self.metadata = DataFrame.from_csv(info, sep="\t", header=1)
            ftp.close()
            self.metadata['assembly_level'] = self.metadata['assembly_level'].apply(lambda x: x.replace(" ","_"))
            self.metadata = self.metadata.transpose().to_dict()

            DataFrame.from_dict(self.metadata).to_csv(self.metadata_file)

        else:
            print "Loading metadata"
            self.metadata = DataFrame.from_csv(self.metadata_file).to_dict()

        print "Loading genomes"
        for k,v in tqdm(self.metadata.items()):
            genome_path = pjoin(self.data_path, v['assembly_level'].replace(" ","_"), k)
            genome_file = pjoin(genome_path, k + ".fna")
            self.genomes += [Genome(k, genome_path, ref=genome_file, manual_metadata = v, taxDb = self.taxDb, workbench = self.workbench)]
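A quick aside on the dict round trip this example relies on, as a minimal sketch with hypothetical accessions rather than real RefSeq output: df.transpose().to_dict() yields {row_label: {column: value}}, and DataFrame.from_dict rebuilds a frame from it with the row labels as columns, which is exactly what gets written to and re-read from metadata_file.

from pandas import DataFrame

df = DataFrame({"assembly_level": ["Complete_Genome", "Contig"]},
               index=["GCF_000001", "GCF_000002"])
metadata = df.transpose().to_dict()
# {'GCF_000001': {'assembly_level': 'Complete_Genome'}, ...}
rebuilt = DataFrame.from_dict(metadata)  # accessions come back as columns
assert rebuilt.loc["assembly_level", "GCF_000001"] == "Complete_Genome"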
Example #2
def mash_matrix(gs, file, clean = False, proc=4):
    if os.path.exists(file) and not clean:
        pre_mat = DataFrame.from_csv(file)
        done = [g for g in gs if g.name in pre_mat.index]
        to_do = [g for g in gs if not g.name in pre_mat.index]
        if len(to_do) == 0:
            out_mat = pre_mat
        else:
            mat_small = DataFrame.from_dict({g : g.mash_compare_many(done, proc) for g in tqdm(to_do)})
            mat_small.index = Index([m.name for m in mat_small.index])
            mat_small.columns = Index([m.name for m in mat_small.columns])
            mat_small = mat_small.transpose()

            mat_big = DataFrame.from_dict({g : g.mash_compare_many(to_do + done, proc) for g in tqdm(to_do)})
            mat_big.index = Index([m.name for m in mat_big.index])
            mat_big.columns = Index([m.name for m in mat_big.columns])

            out_mat = concat([mat_big, concat([mat_small, pre_mat[mat_small.columns]], axis=0).loc[mat_big.index]], axis=1)
            out_mat = out_mat[out_mat.index]
            out_mat.to_csv(file)
    else:
        out_mat = DataFrame.from_dict({g : g.mash_compare_many(gs, proc) for g in tqdm(gs)})
        out_mat.index = Index([m.name for m in out_mat.index])
        out_mat.columns = Index([m.name for m in out_mat.columns])

        out_mat.to_csv(file)

    return out_mat.apply(lambda x : [ast.literal_eval(xx) if isinstance(xx,basestring) else xx for xx in x])
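The block assembly above is easier to see on a toy case. A sketch (hypothetical genome names g1 and g2) of how the cached matrix pre (old x old), the new-vs-old block small, and the all-vs-new block big are stitched into one square matrix:

from pandas import DataFrame, concat

pre = DataFrame([[0.0]], index=['g1'], columns=['g1'])               # cached distances
big = DataFrame([[0.1], [0.0]], index=['g1', 'g2'], columns=['g2'])  # all vs new
small = DataFrame([[0.1]], index=['g2'], columns=['g1'])             # new vs old
full = concat([big, concat([small, pre[small.columns]], axis=0).loc[big.index]], axis=1)
full = full[full.index]  # reorder columns to match the row order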
Example #3
def testGBCLoss(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testDataLoss = ["deviance", "exponential"]
    kfold = 5
    itog_val = {}
    for i in testDataLoss:
        scores = cross_validation.cross_val_score(
            GradientBoostingClassifier(
                loss=i,
                n_estimators=8,
                learning_rate=1,
                max_depth=3,
                min_samples_split=4,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0,
                subsample=1,
                max_features="auto",
                random_state=3200,
            ),
            train,
            target,
            cv=kfold,
        )
        itog_val[i] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
Example #4
def get_document_mapping():
    data = json.load(open('/Users/pcravich/repo/personal-agents/search/nlctaglist.json'))
    labels = list(map(lambda x: x['labels'], data))
    df = DataFrame.from_dict(labels[0], orient='index').transpose()
    for i in range(1, len(labels)):
        df = df.append(DataFrame.from_dict(labels[i], orient='index').transpose(), ignore_index=True)
    df['url'] = list(map(lambda x: x['url'], data))
    return df
Example #5
def testKNNNeingh(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testData = [i for i in range(1, 21, 2)]
    kfold = 5
    itog_val = {}
    for i in testData:
        scores = cross_validation.cross_val_score(KNeighborsClassifier(n_neighbors=i), train, target, cv=kfold)
        itog_val[str(i)] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
Example #6
    def test_to_dict_index_dtypes(self, into, expected):
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({'int_col': [1, 2, 3],
                        'float_col': [1.0, 2.0, 3.0]})

        result = df.to_dict(orient='index', into=into)
        cols = ['int_col', 'float_col']
        result = DataFrame.from_dict(result, orient='index')[cols]
        expected = DataFrame.from_dict(expected, orient='index')[cols]
        tm.assert_frame_equal(result, expected)
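For context, a minimal standalone version of the round trip this test guards (in current pandas the int column keeps its integer dtype instead of being cast to float):

from pandas import DataFrame

df = DataFrame({'int_col': [1, 2, 3], 'float_col': [1.0, 2.0, 3.0]})
d = df.to_dict(orient='index')  # {0: {'int_col': 1, 'float_col': 1.0}, ...}
back = DataFrame.from_dict(d, orient='index')[['int_col', 'float_col']]
print(back.dtypes)  # int_col int64, float_col float64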
Example #7
def testKNNMetric(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testData = ["euclidean", "manhattan", "chebyshev", "minkowski"]
    kfold = 5
    itog_val = {}
    for i in testData:
        scores = cross_validation.cross_val_score(
            KNeighborsClassifier(metric=i, n_neighbors=3), train, target, cv=kfold
        )
        itog_val[i] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
Example #8
def get_boxplot(root):
    """
    get boxplot data
    :param root: Root Server (in alphabet)
    :return:
    """
    container4 = {}
    container6 = {}

    for file in sorted(os.listdir('datasets/{}/'.format(root))):
        timestamp = int(file.split('-')[0])
        filename = 'datasets/{0}/{1}'.format(root, file)
        opened_file = DataFrame.from_csv(filename, sep='\t')
        if not opened_file.empty:
            res4 = opened_file['len4']
            container4[timestamp] = res4
            res6 = opened_file['len6']
            container6[timestamp] = res6
        else:
            container4[timestamp] = pd.Series()
            container6[timestamp] = pd.Series()

    df4 = DataFrame.from_dict(container4)
    df6 = DataFrame.from_dict(container6)

    dict4 = defaultdict()
    dict6 = defaultdict()

    #######
    # IPv4
    #######
    for ts in df4:
        dict4[ts] = {
            'name': datetime.fromtimestamp(ts).strftime('%Y-%m-%d'),
            'type': 'box',
            'y': [int(i) for i in df4[ts].dropna()]
        }
    result4 = [dict4[i] for i in dict4]

    #######
    # IPv6
    #######
    for ts in df6:
        dict6[ts] = {
            'name': datetime.fromtimestamp(ts).strftime('%Y-%m-%d'),
            'type': 'box',
            'y': [int(i) for i in df6[ts].dropna()]
        }
    result6 = [dict6[i] for i in dict6]

    return jsonify({'ipv4': result4, 'ipv6': result6})
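The reason mixing full and empty Series works here: DataFrame.from_dict on a dict of Series aligns every column on the union of the indices and fills the gaps with NaN, which dropna() later discards. A minimal sketch with made-up timestamps:

from pandas import DataFrame, Series

container = {
    1490000000: Series([3, 4, 5]),
    1490086400: Series(dtype=float),  # an "empty day", as above
}
df = DataFrame.from_dict(container)  # columns are timestamps; the empty one is all NaN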
Example #9
def sf_data(query):
    """
    Get opportunity data using supplied query.
    Get account data.

    Return both as dataframes.

    """

    USER = SALESFORCE['USERNAME']
    PASS = SALESFORCE['PASSWORD']
    TOKEN = SALESFORCE['TOKEN']
    HOST = SALESFORCE['HOST']

    sf = Salesforce(username=USER, password=PASS, security_token=TOKEN)

    bulk = SalesforceBulk(sessionId=sf.session_id, host=HOST)

    print "Creating Opportunity job..."
    job = bulk.create_query_job("Opportunity", contentType='CSV')
    print "Issuing query..."

    batch = bulk.query(job, query)
    while not bulk.is_batch_done(job, batch):
        print "waiting for query to complete..."
        sleep(3)
    bulk.close_job(job)

    rows = bulk.get_batch_result_iter(job, batch, parse_csv=True)
    all_rows = list(rows)  # renamed so the built-in all() is not shadowed

    opps = DataFrame.from_dict(all_rows)

    job = bulk.create_query_job("Account", contentType='CSV')
    print "Creating Account job..."

    batch = bulk.query(job,
            "SELECT Id, Website, Text_For_Donor_Wall__c FROM Account")
    print "Issuing query..."
    while not bulk.is_batch_done(job, batch):
        print "waiting for query to complete..."
        sleep(3)
    bulk.close_job(job)

    rows = bulk.get_batch_result_iter(job, batch, parse_csv=True)

    accts = DataFrame.from_dict(list(rows))
    accts.rename(columns={'Id': 'AccountId'}, inplace=True)

    return opps, accts
Example #10
def get_as_path_avg_length(root):
    # read this: http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    directory = '{}{}/'.format(csv_dir, root)
    result4 = {}
    result6 = {}
    for file in sorted(os.listdir(directory)):
        timestamp = int(file.split('-')[0])
        filename = '{}{}'.format(directory, file)
        opened_file = DataFrame.from_csv(filename, sep='\t')
        if not opened_file.empty:
            res4 = opened_file['len4']
            res6 = opened_file['len6']
            result4[timestamp] = res4
            result6[timestamp] = res6
        else:
            result4[timestamp] = pd.Series()
            result6[timestamp] = pd.Series()

    plot_result4 = DataFrame.from_dict(result4)
    plot_result6 = DataFrame.from_dict(result6)

    ################
    # Plot
    ################
    fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

    plot4 = plot_result4.plot.box(figsize=(14, 5), ax=axes[0], ylim=(1.5, 9.5))
    plot6 = plot_result6.plot.box(figsize=(14, 5), ax=axes[1], ylim=(1.5, 9.5))

    n = 6

    # ticks = plot4.xaxis.get_ticklocs()
    # ticklabels = [datetime.fromtimestamp(int(l.get_text())).strftime('%d/%m/%y') for l in plot4.xaxis.get_ticklabels()]
    # plot4.xaxis.set_ticks(ticks[::n])
    # plot4.xaxis.set_ticklabels(ticklabels[::n], rotation=25)
    axes[0].text(3, 8, 'IPv4', fontsize=20, bbox={'facecolor': 'white', 'pad': 5})
    axes[1].text(3, 8, 'IPv6', fontsize=20, bbox={'facecolor': 'white', 'pad': 5})
    axes[0].grid(True)
    axes[1].grid(True)

    ticks = axes[1].xaxis.get_ticklocs()
    ticklabels = [datetime.fromtimestamp(int(l.get_text())).strftime('%d/%m/%y') for l in axes[1].xaxis.get_ticklabels()]
    axes[1].xaxis.set_ticks(ticks[::n])
    axes[1].xaxis.set_ticklabels(ticklabels[::n], rotation=25)

    plt.tight_layout()
    plt.savefig('figs/eps/path_avg_dist_{}.eps'.format(root), format='eps', dpi=1000)
    plt.savefig('figs/png/path_avg_dist_{}.png'.format(root))

    print('finish: path average {}-Root Server'.format(root))
Example #11
def getlinks(region, vendeur):

    # initialize variables
    pageSuivante = True
    if vendeur=="Particulier":
        url = "http://www.leboncoin.fr/voitures/offres/"+region+"/?o=1&q=renault%20captur&it=1&f=p"
    if vendeur=="Pros":
        url = "http://www.leboncoin.fr/voitures/offres/"+region+"/?o=1&q=renault%20captur&it=1&f=c"
    tableau = pd.DataFrame()
    liens = {}
    vente = {}
    reg = {}
    i=0
    
    # collect the links and return the result in a DataFrame
    while pageSuivante:
             
        soup = getSoupFromUrl(url)	
        
        # use a regex to match the links to collect
        if region == "ile_de_france":       
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=12\_s)$'))
        if region == "aquitaine":
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=2\_s)$')) 
        if region == "provence_alpes_cote_d_azur":
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=21\_s)$'))
        
        for lien in balises:
            liens[i]=lien.get('href')
            vente[i]=vendeur
            reg[i]=region
            i=i+1
        
        # check whether there is a next page
        nav = soup.find_all("a", text = "Page suivante")
        
        if nav:
            url = nav[0].get('href')
        else:
            pageSuivante = False
            tableau = DataFrame.from_dict(liens,'index')
            tableau.columns = ['Lien']
            Vendeurs = DataFrame.from_dict(vente,'index')
            Vendeurs.columns = ['Vendeur']
            Regions = DataFrame.from_dict(reg,'index')
            Regions.columns = ['Region']
            tableau = pd.merge(tableau, Vendeurs, left_index=True, right_index=True)
            tableau = pd.merge(tableau, Regions, left_index=True, right_index=True)
    
    return tableau
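Note on DataFrame.from_dict(liens, 'index') above: with scalar values, orient='index' produces a single column labelled 0, which is why each frame is renamed right after construction. A minimal sketch with placeholder URLs:

from pandas import DataFrame

liens = {0: 'http://example.invalid/annonce-1', 1: 'http://example.invalid/annonce-2'}
tableau = DataFrame.from_dict(liens, 'index')  # one column named 0
tableau.columns = ['Lien']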
Example #12
    def _as_dataframe(self, gene_obj, df_index=False):
        """
        converts gene object to DataFrame (pandas)
        """
        if not df_avail:
            print("Error: pandas module must be installed for as_dataframe option.")
            return

        if 'hits' in gene_obj:
            df = DataFrame.from_dict(gene_obj['hits'])
        else:
            df = DataFrame.from_dict(gene_obj)
        if df_index:
            df = df.set_index('_id')
        return df
Example #13
    def dataFrame(self):
        from pandas import DataFrame

        items = self.execute()
        if len(items) == 0:
            return DataFrame()
        return DataFrame.from_dict(items)
Example #14
def Get_Test_Data_YQL():
    # Will have to change convert_objects to specific numeric calls in the future
    result = load(urlopen("https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.historicaldata%20where%20symbol%20%3D%20%22YHOO%22%20and%20startDate%20%3D%20%222010-01-11%22%20and%20endDate%20%3D%20%222010-05-10%22&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback="))
    x = DataFrame.from_dict(result['query']['results']['quote'])
    x["Date"] = to_datetime(x["Date"])
    x = x.convert_objects(convert_numeric=True)
    return x
Example #15
def collector2table(collector):
    """
    collector2table returns a station table as a DataFrame.
    Columns are station, sensor, lon, and lat; the index is the station
    number.

    This is a substitute for `sos_request`.

    """
    # This accepts only 1-day request, but since we only want the
    # stations available we try again with end=start.
    c = copy.copy(collector)
    try:
        response = c.raw(responseFormat="text/csv")
    except ExceptionReport:
        response = c.filter(end=c.start_time).raw(responseFormat="text/csv")
    df = read_csv(BytesIO(response.encode('utf-8')),
                  parse_dates=True)
    columns = {'sensor_id': 'sensor',
               'station_id': 'station',
               'latitude (degree)': 'lat',
               'longitude (degree)': 'lon'}
    df.rename(columns=columns, inplace=True)
    df['sensor'] = [s.split(':')[-1] for s in df['sensor']]
    df['station'] = [s.split(':')[-1] for s in df['station']]

    df = df[['station', 'sensor', 'lon', 'lat']]
    g = df.groupby('station')
    df = dict()
    for station in g.groups.keys():
        df.update({station: g.get_group(station).iloc[0]})
    return DataFrame.from_dict(df).T
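The final from_dict(...).T step mirrors this small sketch (hypothetical station ids): a dict of Series becomes a frame with the stations as columns, and the transpose puts one station per row.

from pandas import DataFrame, Series

rows = {'8454000': Series({'sensor': 'B1', 'lon': -76.6, 'lat': 39.3}),
        '8575512': Series({'sensor': 'A1', 'lon': -76.1, 'lat': 38.9})}
table = DataFrame.from_dict(rows).T  # stations end up as the index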
Example #16
def apply_skill(dfs, function, remove_mean=True, filter_tides=False):
    skills = dict()
    for station, df in dfs.iteritems():
        if filter_tides:
            df = df.apply(low_pass)
        skill = dict()
        obs = df.pop('OBS_DATA')
        if obs.isnull().all():
            # No observations.
            skills.update({station: np.NaN})
            continue
        for model, y in df.iteritems():
            # No models.
            if y.isnull().all():
                skills.update({station: np.NaN})
                continue
            mask = both_valid(obs, y)
            x, y = obs[mask], y[mask]
            if remove_mean:
                x, y = x-x.mean(), y-y.mean()
            if x.size:
                ret = function(x, y)
            else:
                ret = np.NaN
            skill.update({model: ret})
        skills.update({station: skill})
    return DataFrame.from_dict(skills)
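With the default orient='columns', the outer keys (stations) become columns and the inner keys (models) become the index, so the returned skill table is models x stations. A toy sketch with made-up scores:

from pandas import DataFrame

skills = {'station_A': {'model1': 0.92, 'model2': 0.88},
          'station_B': {'model1': 0.75, 'model2': 0.81}}
print(DataFrame.from_dict(skills))
#         station_A  station_B
# model1       0.92       0.75
# model2       0.88       0.81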
Example #17
def compare_assemblies(assemblies, chunk_size = 2000, identity_threshold = 0.40):
    """
    Compares a set of assemblies.
    assemblies is a dictionary with assembly names as keys and the assemblies' FASTA files as values.
    """
    similarities = {}


    print "make blast dbs"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_cmd = ["makeblastdb" ,"-in", subject, "-dbtype", "nucl", "-out", subject]
        with open("/dev/null") as null:
            blastdb_return = call(blast_db_cmd, stdout=null)

    print "Run the hell out of it"
    for scaff_name, scaff in tqdm(assemblies.iteritems()):
        similarities[scaff_name] = {}
        chopped_up_query = "tmp.fasta"
        nb_chunks = len(cut_up_fasta(scaff, chopped_up_query, chunk_size))
        for subject_name, subject in assemblies.iteritems():
            nics = find_NICs(chopped_up_query, subject, identity_threshold, blast_db = False)
#            print scaff_name, "vs", subject_name
            similarities[scaff_name][subject_name] = float(len(nics)) / nb_chunks  # float() avoids Python 2 integer division
    os.remove(chopped_up_query)

    print "clean up"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_files = [subject + ".nhr", subject + ".nin",  subject + ".nsq"]
        for f in blast_db_files:
            os.remove(f)


    similars =  DataFrame.from_dict(similarities)
    return similars
Example #18
def cross_validation_test():
    data = get_train_data()
    target = data.Cover_Type
    train = data.drop(['Cover_Type'], axis = 1)
    kfold = 10
    cross_val_final = {}

    print 'Cross validation test...'
    model_rfc = RandomForestClassifier(n_estimators = 1024, criterion='entropy', n_jobs = -1)
    model_knc = KNeighborsClassifier(n_neighbors = 128)
    model_lr = LogisticRegression(penalty='l1', C=1e5)

    scores = cross_validation.cross_val_score(model_rfc, train, target, cv = kfold)
    cross_val_final['RFC'] = scores.mean()
    print 'RFC: ', scores.mean()

    scores = cross_validation.cross_val_score(model_knc, train, target, cv = kfold)
    cross_val_final['KNC'] = scores.mean()
    print 'KNC: ', scores.mean()


    scores = cross_validation.cross_val_score(model_lr, train, target, cv = kfold)
    cross_val_final['LR'] = scores.mean()
    print 'LR: ', scores.mean()

    f = plt.figure(figsize = (8, 6))
    p = DataFrame.from_dict(data = cross_val_final, orient='index').plot(kind='barh', legend=False, ax = f.gca())
    f.savefig('./test_plot/cross_validation_rfc_1024.png')
Example #19
def _project(dataframe, project_q):
    if not project_q:
        return dataframe

    assert_list("project", project_q)

    if project_q == [["count"]]:
        # Special case for count only, ~equal to SQL count(*)
        return DataFrame.from_dict({"count": [len(dataframe)]})

    aggregate_fns, alias_expressions = classify_expressions(project_q)

    if aggregate_fns and alias_expressions:
        raise_malformed("Cannot mix aliasing and aggregation functions", project_q)

    if isinstance(dataframe, DataFrameGroupBy):
        dataframe = _aggregate(dataframe, project_q, aggregate_fns)
    elif aggregate_fns:
        return _aggregate_without_group_by(dataframe, project_q, aggregate_fns)
    elif alias_expressions:
        dataframe = _alias(dataframe, alias_expressions)
    else:
        # Nothing to do here
        pass

    columns = [e if type(e) is not list else e[1] for e in project_q]

    try:
        return dataframe[columns]
    except KeyError:
        missing_columns = set(columns) - set(dataframe.columns.values)
        raise_malformed("Selected columns not in table", list(missing_columns))
Example #20
def read_umi_tools(filename: PathLike) -> AnnData:
    """Read a gzipped condensed count matrix from umi_tools.

    Parameters
    ----------
    filename
        File name to read from.
    """
    # import pandas for conversion of a dict of dicts into a matrix
    # import gzip to read a gzipped file :-)
    import gzip
    from pandas import DataFrame

    dod = {}  # this will contain basically everything
    fh = gzip.open(fspath(filename))
    header = fh.readline()  # read the first line

    for line in fh:
        t = line.decode('ascii').split('\t')  # gzip read bytes, hence the decoding
        try:
            dod[t[1]].update({t[0]:int(t[2])})
        except KeyError:
            dod[t[1]] = {t[0]:int(t[2])}

    df = DataFrame.from_dict(dod, orient='index')  # build the matrix
    df.fillna(value=0., inplace=True)  # many NaN, replace with zeros
    return AnnData(np.array(df), {'obs_names': df.index}, {'var_names': df.columns})
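The core trick here, sketched with hypothetical cell and gene labels: a sparse dict of dicts becomes a dense matrix, with orient='index' putting the outer keys (cells) on the rows and NaN for every missing count, which fillna then zeroes out.

from pandas import DataFrame

dod = {'cell_1': {'geneA': 3},
       'cell_2': {'geneA': 1, 'geneB': 7}}
counts = DataFrame.from_dict(dod, orient='index').fillna(0)
# rows: cell_1, cell_2; columns: geneA, geneB; missing counts become 0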
Example #21
def cross_validation_test():
    data = get_analyze_data()
    target = data["hand"]
    train = data.drop(["id"], axis = 1)
    kfold = 5
    cross_val_test = {}

    print "Cross validation test..."
    model_rfc = RandomForestClassifier(n_estimators = 100)
    model_knc = KNeighborsClassifier(n_neighbors = 15)
    model_lr = LogisticRegression(penalty='l1', tol=0.01)

    scores = cross_validation.cross_val_score(model_rfc, train, target, cv = kfold)
    cross_val_test['RFC'] = scores.mean()

    scores = cross_validation.cross_val_score(model_knc, train, target, cv = kfold)
    cross_val_test['KNC'] = scores.mean()

    scores = cross_validation.cross_val_score(model_lr, train, target, cv = kfold)
    cross_val_test['LR'] = scores.mean()

    f = plt.figure(figsize = (8, 6))
    p = DataFrame.from_dict(data = cross_val_test, orient='index').plot(kind='barh', legend=False, ax = f.gca())
    f.savefig('./%s/cross_validation_test.png' % dirs[1])

    for k,v in cross_val_test.iteritems():
        print "%s : %s" % (k,str(v))
Example #22
def formatChecker(filename):
  reader = csv.DictReader(open(filename))
  result = {}
  key = -1
  for row in reader:
    key += 1
    # key = row.pop('url')
    if key in result:
      pass
    if row['longitude'] == '' or row['latitude'] == '':
      continue
    row['longitude'] = float(row['longitude'])
    row['latitude'] = float(row['latitude'])
    result[key] = row

  for k, v in result.iteritems():
    if 'location' not in v:
      raise NameError('Missing ["location"] header')
    if 'day' not in v or 'month' not in v or 'year' not in v:
      raise NameError('Missing ["day"], ["month"], or ["year"] header')
    tempDate = validDateToJulianDate(v['month'] +'-'+ v['day'] +'-'+ v['year'])
    result[k].update({'concatDate':tempDate[0]})
    result[k].update({'julianDay':tempDate[1]})
    result[k].update({'julianDate':tempDate[2]})
  df = DataFrame.from_dict(result, orient='index', dtype=None)
  if 'latitude' not in df.columns or 'longitude' not in df.columns:
    raise NameError('Missing ["latitude"] or ["longitude"] header')
  # df = df.convert_objects(convert_numeric=True).dtypes
  # df[['latitude', 'longitude']] = df[['latitude', 'longitude']].astype(float)

  # except:
  #   sys.exit('Date field contains non-digits.')
  return df
Example #23
    def make_cluster_bmft(self):
        cluster_table = DataFrame.from_dict({i: {k: len(v) for k, v in c.to_dict()['genes'].iteritems()} for i, c in enumerate(self)}, orient='index')
        cluster_table = cluster_table.apply(nan_to_num)
        cluster_table['annotations'] = [c.annotation for c in self]
        cluster_table['qual_annot'] = [c.annot_fraction for c in self]
        cluster_table['genes'] = [";".join(c.genes) for c in self]
        return cluster_table
Example #24
def test_csv_read_files():
    df = DataFrame.from_dict({0:['01',2], 1:['x', 12]}, orient='index')
    df.columns = ['a','b']
    
    df2 = csv_read_files(tempfilename, index_col=0)
    assert_frame_equal(df, df2)
    
Example #25
    def gen_data(size, seed):
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_float_column(size, 0., 1., seed + 3)
        }
        return DataFrame.from_dict(data)
Example #26
    def _collect_requests(query, request_limit):
        """Collects the string-casted results of a query.

        Args:
            query: (aflow.control.Query) A query with unprocessed requests.
            request_limit: (int) Maximum number of requests to submit.

        Returns: (DataFrame) Results collected from the query.
        """

        # requests the first page of results to determine number of pages
        query._request(1, query.k)
        page_limit = (query._N // query.k) + 1
        if request_limit and (page_limit > request_limit):
            page_limit = request_limit

        # requests the remaining pages
        for page in range(2, page_limit + 1):
            query._request(page, query.k)

        # collects request responses
        records = {}
        for page in range(1, page_limit + 1):
            records.update(query.responses[page])
        return DataFrame.from_dict(data=records, orient='index')
Example #27
def creat_table_base(records):
    # calculated variable names and their descriptions;
    # currently only includes the 16 most-used variables
    calculated_vars = {"_iitax": "Federal income tax liability",
                       "_fica": "FICA taxes  (ee+er) for OASDI+HI",
                       "c00100": "Federal AGI",
                       "c02500": "OASDI benefits in AGI",
                       "c04600": "Post-phase-out personal exemption",
                       "_prexmp": "Pre-phase-out personal exemption",
                       "c21040": "Itemized deduction that is phased out",
                       "c04470": "Post-phase-out itemized deduction",
                       "c04800": "Federal regular taxable income",
                       "c05200": "Regular tax on taxable income",
                       "c07220": "Child tax credit (adjusted)",
                       "c11070": "Extra child tax credit (refunded)",
                       "c07180": "Child care credit",
                       "_eitc": "Federal EITC",
                       "c62100_everyone": "federal AMT taxable income",
                       "c09600": "federal AMT liability"}

    cal = DataFrame.from_dict(calculated_vars, orient='index')
    cal.columns = ['description']

    puf_ecodes_info = pd.read_csv(EVAR_PATH)

    # Use all variable list minus unused variable list
    # to get used variable list
    VALID_READ_VARS = records.VALID_READ_VARS

    CODES_IMP = set(['AGIR1', 'DSI', 'EFI', 'EIC', 'ELECT', 'FDED',
                     'FLPDYR', 'FLPDMO', 'f2441', 'f3800', 'f6251',
                     'f8582', 'f8606', 'f8829', 'f8910', 'f8936', 'n20',
                     'n24', 'n25', 'n30', 'PREP', 'SCHB', 'SCHCF', 'SCHE',
                     'TFORM', 'IE', 'TXST', 'XFPT', 'XFST', 'XOCAH',
                     'XOCAWH', 'XOODEP', 'XOPAR', 'XTOT', 'MARS', 'MIDR',
                     'RECID', 'gender', 'wage_head', 'wage_spouse',
                     'earnsplit', 'age', 'agedp1', 'agedp2', 'agedp3',
                     'AGERANGE', 's006', 's008', 's009', 'WSAMP', 'TXRT',
                     'filer', 'matched_weight', 'e00200p', 'e00200s',
                     'e00900p', 'e00900s', 'e02100p', 'e02100s'])

    UNUSED_READ_VARS = records.UNUSED_READ_VARS

    USED_VARS = list(VALID_READ_VARS - CODES_IMP - UNUSED_READ_VARS)

    # read variable description from e_variable_info.csv
    table = {}
    for i in range(len(USED_VARS)):
        # use variable names as keys of dictionary
        var_name = USED_VARS[i]
        f = (puf_ecodes_info.Input_Name == var_name)
        description = puf_ecodes_info.Definition_2014[f].values[0]
        table[var_name] = description

    table = pd.DataFrame.from_dict(table, orient='index')
    table.columns = ["description"]

    table = table.append(cal)
    return table
Example #28
    def calculate(self):
        self.data = None
        cursor = connection.cursor()
        cursor.execute(self.get_query(),
                       dict(year=self.parameters.registry_year,
                            period=self.parameters.registry_period))
        self.data = DataFrame.from_dict(self.prepare_data(dictfetchall(cursor)), orient='columns')
        cursor.close()
Example #29
def pct_students_first_choice(to_compare):
	results = []
	for i in to_compare:
		results.append(float(sum([1 if len(s.assigned) > 0 and (s.preference[0] in s.assigned) else 0 for s in si_students[i]])) / float(nstudents))
	results = map(lambda x: x * 100, results)

	df = DataFrame.from_dict({'mechanism': sublist(mechanisms, to_compare), 'pct_students': results})
	return (results, bar_graph(df, "% Students Matched With Top Choice\n"))
Example #30
    def test_get_dummies_dont_sparsify_all_columns(self, sparse):
        # GH18914
        df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
                                              ('Nation', ['AB', 'CD'])]))
        df = get_dummies(df, columns=['Nation'], sparse=sparse)
        df2 = df.reindex(columns=['GDP'])

        tm.assert_frame_equal(df[['GDP']], df2)
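A closing note on the OrderedDict used above, as a minimal sketch: from_dict keeps the insertion order of the keys as the column order, which is what lets the test pin 'GDP' before 'Nation'.

from collections import OrderedDict
from pandas import DataFrame

df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])]))
assert list(df.columns) == ['GDP', 'Nation']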