def do_collate_pnpn(f_prefixes, minpos, minperc, mingenes, minsamples, minsamples_gene):
    tmpdir = mkdirifnotexists(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 'tmpfiles'))
    for fname in glob(f_prefixes + '*.pnpn.df'):
        _collate_pnpn_inner(fname, mingenes, minsamples, minsamples_gene, tmpdir)
    ret = []
    ret_g1 = []
    ret_g2 = []
    for fname in glob(join(tmpdir, '*.tmp.df')):
        ret.append(read_pickle(fname).T)
    for fname in glob(join(tmpdir, '*.tmp.g1.df')):
        ret_g1.append(read_pickle(fname).T)
    for fname in glob(join(tmpdir, '*.tmp.g2.df')):
        ret_g2.append(read_pickle(fname).T)
    outdir = join(SNP.OM_RGC.OutDirCollate, 'pnpn')
    with open(join(outdir, 'pNpNCases.txt'), 'w') as ftxt:
        ftxt.write('Conditions for pN groups in this analysis\n')
        ftxt.write('Always pN(G1)/pN(G2) so invert if G1 is more conservative\n\n')
        bigdf = concat(ret, sort=False)
        bigg1 = concat(ret_g1, sort=False)
        bigg2 = concat(ret_g2, sort=False)
        for j, col in enumerate(bigdf.index.get_level_values(0).unique()):
            ftxt.write('Case {}: {}\n'.format(j, col))
            bigdf.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn',
                'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.csv'.format(
                    j, f_prefixes.split('/')[-1], minpos, minperc, mingenes,
                    minsamples, minsamples_gene)))
            bigg1.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn',
                'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.g1.csv'.format(
                    j, f_prefixes.split('/')[-1], minpos, minperc, mingenes,
                    minsamples, minsamples_gene)))
            bigg2.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn',
                'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.g2.csv'.format(
                    j, f_prefixes.split('/')[-1], minpos, minperc, mingenes,
                    minsamples, minsamples_gene)))
def unite_sampledata():
    concat([
        read_pickle(Biodata.bioGEOTRACES.metadataDF),
        read_pickle(Biodata.ALOHA_BATS.metadataDF),
        read_pickle(Biodata.TARA.metadataDF)
    ], sort=False).to_pickle(Biodata.United.metadataDF)
def do_one_group_ffdeg_piwit(nm, f_prefix, genegroup, analysisclass, mingenes=3):
    ret = []
    retlens = []
    for prefix in set([g[:-4] for g in genegroup]):
        f_in = join(analysisclass.OutDir, prefix + '.ffdeg_pi_wit.df')
        if not exists(f_in):
            continue
        try:
            ldf = read_pickle(f_in)
            ldf = ldf.loc[[g for g in genegroup if g in ldf.index]]
            llensdf = read_pickle(f_in.replace('_pi_wit', '_poss'))
            llensdf = llensdf.loc[[g for g in genegroup if g in llensdf.index]]
        except KeyError:
            continue
        if ldf.shape[0] > 0:
            ret.append(ldf)
            retlens.append(llensdf)
    if len(ret) == 0:
        return
    outdf = concat(ret, sort=False).dropna(how='all').dropna(how='all', axis=1)
    if outdf.shape[0] < mingenes:
        return
    outdf_lens = concat(retlens)
    outdf_lens.name = 'Length'
    ret = {}
    for col in outdf:
        coldf = outdf[[col]].multiply(outdf_lens, axis=0).join(outdf_lens).dropna().sum()
        ret[col] = {'pi': coldf[col] / coldf['Length'],
                    'length': coldf['Length'],
                    'num_genes': len(outdf[col].dropna())}
    outpath = mkdirifnotexists(join(analysisclass.OutDirCollate, 'ffdeg'))
    DataFrame(ret).to_pickle(join(outpath, f_prefix + '_' + nm + '.ffdeg_pi_wit.df'))
def create_codon_trans_matrix(dirsname, tmpdir, grpnm):
    chdirmkifnotexist(join(tmpdir, 'tmpmq'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for fname in glob(join(dirsname, '*codons*')):
        if exists(join(tmpdir, grpnm + basename(fname).replace('codons', 'sums'))):
            continue
        do_one_unite(fname, grpnm, tmpdir)
    df = None
    for f in glob(join(tmpdir, grpnm + '*sums.df')):
        print(basename(f))
        if df is None:
            df = read_pickle(f)
        else:
            df = concat([df, read_pickle(f)], sort=False).groupby(level=[0, 1]).sum()
    df.columns = MultiIndex.from_tuples([('per', i) if '_s' not in i
                                         else ('sums', i.replace('_s', ''))
                                         for i in df.columns])
    sms = df.sums.sum().truediv(3)

    def applyfunc(ldf):
        ret = ldf.per.truediv(ldf.sums).loc[ldf.name].iloc[:, 0]
        ret[ldf.name] = 1 - ret.sum()
        return ret

    def applyfunc_counts(ldf):
        adds = ldf.sums.iloc[0][0]
        ret = ldf.per.loc[ldf.name].iloc[:, 0]
        ret[ldf.name] = adds - ret.sum()
        return ret

    df_counts = df.groupby(level=1, axis=1).apply(
        lambda x: x.groupby(level=0).apply(applyfunc_counts))
    df_norm = df.groupby(level=1, axis=1).apply(
        lambda x: x.groupby(level=0).apply(applyfunc))
    aas, _, _ = get_codon_table()
    df_aas = df_counts
    df_aas['aa_start'] = [aas[a] for a in df_counts.index.get_level_values(0)]
    df_aas['aa_end'] = [aas[a2] + ('.' if a1 != a2 and aas[a1] == aas[a2] else '')
                        for a1, a2 in zip(df_counts.index.get_level_values(0),
                                          df_counts.index.get_level_values(1))]
    df_aas = df_aas.groupby(['aa_start', 'aa_end']).sum()
    df_norm.to_pickle(join(General.Basepath, grpnm + '_4_60_mutation_codons.df'))
    df_counts.to_pickle(join(General.Basepath, grpnm + '_4_60_mutation_codon_counts.df'))
    df_aas.to_pickle(join(General.Basepath, grpnm + '_4_60_mutation_aas_counts.df'))
    sms.to_pickle(join(General.Basepath, grpnm + '_4_60_nucleotide_count.df'))
def loadSitePhreeqcData(site, processedSitesDir=DEFAULT_DIR):
    """
    Retrieves site PHREEQC data for an individual site from a directory of processed sites.

    Parameters
    ----------
    site : string
        name of site to retrieve, with or without USGS- tag at beginning.
    processedSitesDir : string (optional)
        directory that contains the processed site directory associated with
        the desired site. It is important to change this if the default is not
        correct. (default='./Processed-Sites')

    Returns
    -------
    sitedf : pandas.core.frame.DataFrame
        A pandas dataframe object with PHREEQC data from the requested site.
    """
    # Add USGS tag if needed
    if not site.startswith('USGS-'):
        site = 'USGS-' + site
    try:
        phreeqcFile = os.path.join(processedSitesDir, site, site + '-PHREEQC.pkl')
        sitedf = read_pickle(phreeqcFile)
    except IOError:
        print("Problem reading pickle file: " + phreeqcFile)
        return None
    return sitedf
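# Hedged usage sketch for loadSitePhreeqcData: the site number, directory and
# helper name below are illustrative assumptions, not values from the source
# project.
def _example_load_site():
    sitedf = loadSitePhreeqcData('01646500', processedSitesDir='./Processed-Sites')
    if sitedf is not None:
        print(sitedf.head())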
def _collate_pnpn_inner(fname, mingenes, minsamples, minsamples_gene, outdir):
    grpname = split3way(fname)[1].replace('.pnpn', '').split(':')[1]
    ret = defaultdict(dict)
    ret_g1 = defaultdict(dict)
    ret_g2 = defaultdict(dict)
    df = read_pickle(fname)
    for nm, ldf in df.groupby(level=1):
        keepinds = ldf.index.get_level_values(0).isin(
            ((ldf.groupby(level=0).count() > 1).sum(1) >= minsamples_gene)
            .replace(False, np.nan).dropna().index)
        ldf = ldf.loc[keepinds]
        ldf = ldf.loc[:, (ldf.groupby(level=0).count() > 1).sum(0) >= mingenes]
        if ldf.shape[1] <= minsamples:
            continue
        gs = ldf[['GeneSites']]
        ldf = ldf.drop('GeneSites', axis=1)
        for col in ldf.columns:
            coldf = ldf[[col]].join(gs).dropna()
            coldf = coldf.groupby(level=2).sum()
            coldf = coldf[col].truediv(coldf['GeneSites'])
            ret_g1[(nm, grpname)][col] = coldf['G1']
            ret_g2[(nm, grpname)][col] = coldf['G2']
            ret[(nm, grpname)][col] = (coldf['G1'] / coldf['G2']) if coldf['G2'] != 0 else np.nan
    outdf = DataFrame(ret)
    outdf_g1 = DataFrame(ret_g1)
    outdf_g2 = DataFrame(ret_g2)
    if outdf.shape != (0, 0):
        outdf.to_pickle(join(outdir, grpname + '.tmp.df'))
    if outdf_g1.shape != (0, 0):
        outdf_g1.to_pickle(join(outdir, grpname + '.tmp.g1.df'))
    if outdf_g2.shape != (0, 0):
        outdf_g2.to_pickle(join(outdir, grpname + '.tmp.g2.df'))
def get_measurements_TARA():
    tara_md = read_pickle(Biodata.TARA.metadataDF)
    tara_ixs = tara_md.groupby(level=0).first().index
    carbchem = readTARAxls(Biodata.TARA.CarbChemXL, Biodata.TARA.XLHeaderLine)
    nutrient = readTARAxls(Biodata.TARA.NutrientsXL, Biodata.TARA.XLHeaderLine)
    hplc = readTARAxls(Biodata.TARA.HPLCXL, Biodata.TARA.XLHeaderLine)
    sensors = read_csv(Biodata.TARA.DepthSensorCSV, sep='\t',
                       header=Biodata.TARA.DepthSensorHeader)
    sensors = sensors.rename(columns={sensors.columns[2]: 'SampleID',
                                      sensors.columns[0]: 'TARA_SampleID'})\
        .drop(sensors.columns[1], axis=1).set_index('SampleID')
    sensors.columns = [c.split(' (')[0] + ('_min' if '(minimum' in c
                       else '_25p' if '(lower quartile' in c
                       else '_median' if '(median' in c
                       else '_75p' if '(upper quartile' in c
                       else '_max' if '(maximum' in c
                       else '' if 'OXYGEN' in c and c.endswith(')')
                       else '!@#$%' if '(calculated' in c or '(Calculated' in c
                       else '') for c in sensors.columns]
    sensors = sensors[[c for c in sensors.columns if '!@#$%' not in c]]
    concat([sensors.loc[tara_ixs],
            nutrient.loc[tara_md.groupby(level=0).first().index].loc[tara_ixs].iloc[:, 16:],
            carbchem.loc[tara_md.groupby(level=0).first().index].loc[tara_ixs].iloc[:, 16:],
            hplc.loc[tara_md.groupby(level=0).first().index].loc[tara_ixs].iloc[:, 16:]],
           axis=1)\
        .to_pickle(Biodata.TARA.SampleMeasurementsDF)
def get_measurements_HOT():
    hotmd = read_pickle(Biodata.ALOHA_BATS.metadataDF)
    hotmd = hotmd[hotmd.Cruise_series == 'HOT']

    def read_ds(f):
        ds = open_dataset(f, decode_times=False)
        df = ds.to_dataframe().reset_index()
        if ds.time_coverage_start != ds.time_coverage_end:
            # a bare `raise` here had no active exception; raise an explicit error instead
            raise ValueError('Expected a single time point in ' + f)
        dsdt = datetime.strptime(ds.time_coverage_start, '%Y-%m-%dT%H:%M:%SZ')
        if abs(hotmd.Collection_datetime - dsdt).min() > timedelta(days=2):
            return None
        df['TIME'] = dsdt
        return df

    ctd = concat([read_ds(f) for f in glob(join(Biodata.ALOHA_BATS.ALOHACTD, '*.nc'))])
    ctd = ctd.rename(columns={'TIME': 'Collection_datetime'}).reset_index()
    water = concat([read_ds(f) for f in glob(join(Biodata.ALOHA_BATS.ALOHAWater, '*.nc'))])
    water = water.rename(columns={'TIME': 'Collection_datetime'}).reset_index()
    ret = {}
    for nm, row in hotmd.iterrows():
        watrow = _getrowmd(water, row, 'DEPTH', False)
        ctdrow = _getrowmd(ctd, row, 'DEPTH', False)
        watrow.index = [i.replace('Orig', 'BotOrig') for i in watrow.index]
        ret[nm[0]] = concat([ctdrow, watrow[11:]])
    df = DataFrame({k: v for k, v in ret.items() if type(v) == Series}).T
    df = df[[c for c in df.columns if not c.endswith('_QC')]]
    df.replace('nan', np.nan).drop('index', axis=1).to_pickle(
        Biodata.ALOHA_BATS.ALOHASampleMeasurementsDF)
def prepare_and_merge_data():
    # Retrieves all dataframes and merges into a single dataframe
    # which is then pickled
    job_data = read_pickle('Job Data.pkl')
    company_data = read_pickle('LinkedIn Company Data.pkl')
    industry_data = read_pickle('LinkedIn Industry Data.pkl')
    speciality_data = read_pickle('LinkedIn Speciality Data.pkl')

    # Add in derived data and fill in blank data
    job_data['post_year'] = job_data.date_posted.apply(get_year)  # Get date_posted year
    job_data['post_month'] = job_data.date_posted.apply(get_month)  # Get date_posted month
    job_data['desc_word_count'] = job_data.description.apply(get_word_count)  # Number of words in job description
    job_data['desc_char_count'] = job_data.description.apply(get_char_count)  # Number of characters in job description
    job_data['estimated_seniority_value'] = job_data.estimated_seniority.apply(get_est_seniority_value)  # Convert estimated seniority to an integer

    company_data.loc[company_data.employee_count_code.isnull(), 'employee_count_code'] = 'D'  # '51-200'
    company_data.loc[company_data.company_type_code.isnull(), 'company_type_code'] = 'P'  # 'Privately Held'
    company_data['employee_count_value'] = company_data.employee_count_code.apply(get_emply_count_value)  # Convert employee count code to an integer
    company_data['company_type_value'] = company_data.company_type_code.apply(get_cmpny_type_value)  # Convert company type code to an integer

    industry_data = pd.merge(industry_data, company_data[['lnkn_name']], how='right', on='lnkn_name')
    industry_data.loc[industry_data.industry_type_name.isnull(), 'industry_type_name'] = 'Unknown'

    # Converting the Industry and Speciality data into dataframes of frequencies
    # Only counting a subset of specialities as data science-y
    industry_group = industry_data[['lnkn_name', 'industry_type_name']].groupby(
        ['lnkn_name', 'industry_type_name']).size().unstack('industry_type_name')
    industry_group[industry_group.notnull()] = 1
    industry_group[industry_group.isnull()] = 0

    ds_specialities = ['Big Data', 'Analytics', 'Machine Learning', 'analytics', 'Data Science']
    ds_specialities.extend(['Big Data Analytics', 'Natural Language Processing',
                            'Predictive Analytics', 'Data Mining'])
    speciality_group = speciality_data[speciality_data.speciality.isin(ds_specialities)].groupby(
        ['lnkn_name', 'speciality']).size().unstack('speciality')
    speciality_group = pd.merge(speciality_group, company_data[['lnkn_name']], how='right',
                                right_on='lnkn_name', left_index=True)
    speciality_group.set_index('lnkn_name', inplace=True)
    speciality_group[speciality_group.notnull()] = 1
    speciality_group[speciality_group.isnull()] = 0

    # Merge the dataframes
    merge_data = pd.merge(job_data, company_data, on='name')
    merge_data = pd.merge(merge_data, industry_group, left_on='lnkn_name', right_index=True)
    merge_data = pd.merge(merge_data, speciality_group, how='left', left_on='lnkn_name', right_index=True)
    merge_data.to_pickle('Clean Job Data.pkl')
def getLsiDict():
    externalFilePath = '..' + os.sep + "projectMidPoint" + os.sep + "tmp" + os.sep + "LsiModel" + os.sep + "mergeLsiData.dict"
    externalFile = Path(externalFilePath)
    if not externalFile.is_file():
        print("Running Will's LSI Model\n")
        os.chdir('..' + os.sep + "projectMidPoint" + os.sep)
        call(["python", "LsiModel.py"])
    # The original called pickle.read_pickle, which does not exist. Assuming the
    # .dict file was written with pickle.dump; use pandas.read_pickle instead if
    # it was saved through pandas.
    with open(externalFilePath, 'rb') as f:
        dictionary = pickle.load(f)
    return dictionary
def do_one_unite(fname, grpnm, tmpdir):
    cod = read_pickle(fname)
    fcod = read_pickle(fname.replace('codons', 'percmut'))
    allcodons = get_relevant_codonpairs()

    def apply_counts(ldf):
        nm = ldf.index.get_level_values(0)[0]
        ldf.index = MultiIndex.from_tuples(ldf.index.get_level_values(1))
        ldf = ldf.dropna(how='all', axis=1)
        codldf = Series({i: (cod.loc[nm, i[0]] if i[0] in cod.loc[nm] else 0)
                         for i in allcodons})
        ldf = codldf.to_frame('cod').join(ldf).drop('cod', axis=1)
        for col in ldf.columns:
            ldf[col + '_s'] = codldf
        return ldf.fillna(0)

    fcod = fcod.groupby(level=0).apply(apply_counts)
    fcod.groupby(level=[1, 2]).sum().to_pickle(
        join(tmpdir, grpnm + basename(fname).replace('codons', 'sums')))
def main():
    os.chdir(mkdirifnotexists(join(Calling.OM_RGC.CallDir, 'tmp')))
    bioG_m = read_pickle(Biodata.bioGEOTRACES.metadataDF)
    ALOHA_m = read_pickle(Biodata.ALOHA_BATS.metadataDF)
    TARA_m = read_pickle(Biodata.TARA.metadataDF)
    allbams = concat([TARA_m, ALOHA_m, bioG_m], sort=False)[['ICRABAM_1', 'ICRABAM_2']]
    dirnames = sorted(list(set([ref[:-4] for ref in
                                pysam.AlignmentFile(allbams.iloc[-1]['ICRABAM_1']).header.references])))
    # This is set to process 80 genes (each with all samples) at a time.
    # Changing it to a higher setting will cause everything to run faster on a hpc system, but
    # take up more memory and space for intermediate files
    dirnamegrps = [dirnames[i:i + 80] for i in range(0, len(dirnames), 80)]
    for reference_list in dirnamegrps:
        # TODO: IMPORTANT! Wrap the loops in the called method with your hpc job submission pipeline
        # Also IMPORTANT! Make sure each loop runs synchronously with the next (wait for one to
        # finish before you start the next)
        # Estimated total CPU time for this part >25,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
        do_references(allbams, Calling.OM_RGC.CallDir, reference_list,
                      Calling.OM_RGC.FilterThreshold, Calling.OM_RGC.DbFasta,
                      Calling.OM_RGC.mpileupParams, THREADS)
def from_pickle(path: Union[str, Path]) -> "Metropolis":
    """
    Load pickled Metropolis object from file.

    Returns
    -------
    [Metropolis]
        Deserialized Metropolis python object.
    """
    metro = read_pickle(path)
    metro.lattice._set_recursion_limit()
    return metro
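# Hedged usage sketch for from_pickle: pandas.read_pickle unpickles arbitrary
# Python objects, so a Metropolis instance previously written with
# pandas.to_pickle (or pickle.dump) can be restored this way. The path and the
# helper name below are illustrative, not from the source project.
def _example_restore_metropolis(path='metropolis_state.pkl'):
    metro = from_pickle(path)  # read_pickle + _set_recursion_limit, as above
    return metro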
def demorinex(obsfn, maxchunk=None):
    # switchyard based on filename extension
    name, ext = splitext(obsfn)
    if ext[-1] == 'o':
        f = RINEXFile(obsfn, maxchunk)
        f.save_pickle(name + '.pickle')
        # this can crash some Python with incompatible PyTables/Pandas/HDF5 versions
        f.save_hdf5(name + '.h5')
        return f.data
    elif ext in ('.pkl', '.pickle'):
        return read_pickle(expanduser(obsfn))
    elif ext in ('.h5', '.hdf5'):
        print('not implemented yet')
        return None
def unite_measurements():
    biog = read_pickle(Biodata.bioGEOTRACES.SampleMeasurementsDF)\
        .rename(columns={'DEPTH [m]': DEPTH,
                         'CTDTMP [deg C]': TEMPERATURE,
                         'CTDSAL': SALINITY,
                         'CTDOXY [umol/kg]': OXYGEN,
                         'PHOSPHATE_D_CONC_BOTTLE [umol/kg]': PHOSPHATE,
                         'SILICATE_D_CONC_BOTTLE [umol/kg]': SILICATE,
                         'NITRATE_D_CONC_BOTTLE [umol/kg]': NITRATE,
                         'NO2+NO3_D_CONC_BOTTLE [umol/kg]': NITRIRA})
    tara = read_pickle(Biodata.TARA.SampleMeasurementsDF)\
        .rename(columns={'Depth, nominal': DEPTH,
                         'Temp [°C]_median': TEMPERATURE,
                         'Sal_median': SALINITY,
                         'OXYGEN [µmol/kg]': OXYGEN,
                         'Phosphate_median': PHOSPHATE,  # umol/l - https://doi.pangaea.de/10.1594/PANGAEA.839233
                         'Silicate_median': SILICATE,  # umol/l
                         '[NO3]- [µmol/l]_median': NITRATE,
                         'Nitrate and Nitrite_median': NITRIRA})  # umol/l
    tara[SILICATE] = tara[SILICATE].astype(float)
    bats = read_pickle(Biodata.ALOHA_BATS.BATSSampleMeasurementsDF)\
        .rename(columns={'Depth': DEPTH,
                         'Temperature [c]': TEMPERATURE,
                         'Salinity': SALINITY,
                         'Oxygen [umol/kg]': OXYGEN,
                         'Phosphate': PHOSPHATE,  # umol/kg
                         'Silicate': SILICATE,  # umol/kg
                         'Nitrate+Nitrite': NITRIRA})  # umol/kg
    bats[[DEPTH, TEMPERATURE, SALINITY, OXYGEN, PHOSPHATE, SILICATE, NITRIRA]] = \
        bats[[DEPTH, TEMPERATURE, SALINITY, OXYGEN, PHOSPHATE, SILICATE, NITRIRA]].astype(float)
    alha = read_pickle(Biodata.ALOHA_BATS.ALOHASampleMeasurementsDF)\
        .rename(columns={'DEPTH': DEPTH,
                         'TEMP': TEMPERATURE,
                         'PSAL': SALINITY,
                         'DOXY1': OXYGEN,
                         'PO41': PHOSPHATE,  # umol/kg
                         'SILC1': SILICATE,  # umol/kg
                         'NO31': NITRATE})  # umol/kg
    tara[DEPTH] = tara[DEPTH].apply(lambda x: float(x) if '-' not in str(x) else np.nan)
    tara[[PHOSPHATE, SILICATE, NITRATE, NITRIRA]] = tara[[PHOSPHATE, SILICATE, NITRATE, NITRIRA]]\
        .astype(float)\
        .truediv(1 + tara['Sigma-theta [kg/m**3]_median'] / 1000, axis=0)  # umol/l --> umol/kg
    concat([tara, bats, alha, biog], sort=False)\
        [[DEPTH, TEMPERATURE, SALINITY, OXYGEN, PHOSPHATE, SILICATE, NITRATE, NITRIRA]]\
        .to_pickle(Biodata.United.SampleMeasurementsDF)
def analyze_genes(genes_df_f, db_fasta, outdir, cachedir, min_pos_reads, min_perc_poss,
                  min_total_var_support, min_maf, min_samples, min_variants):
    # This method potentially calculates all metrics (pN/pS, pi_within, etc.) for one gene file
    # out of approximately 2500, containing up to 10,000 separate genes (usually 500-1000)
    # It iterates all genes and calculates everything for each.
    # Then it concatenates and saves everything. The calling for each gene is to 'analyze_one'.
    df = read_pickle(genes_df_f)
    pnpn_groups = create_all_pnpn_groups()
    log_.info('Analyzing file {} minpos {} minperc {}'.format(
        basename(genes_df_f), min_pos_reads, min_perc_poss))
    gene_seqs = _getgeneseqs(genes_df_f, db_fasta, df.index.get_level_values(0).unique(), cachedir)
    sitess, pnpss, percmuts, ffdeg_pws, pnpns = [], [], [], [], []
    ffdeg_poss = {}
    for nm, genedf in df.groupby(level=0):
        sites, pnps, percmut, ffdeg_pw, pnpn = analyze_one(
            nm, genedf, min_pos_reads, min_perc_poss, min_total_var_support,
            min_maf, min_samples, min_variants, gene_seqs, pnpn_groups)
        if sites is not None:
            sitess.append(sites)
        if pnps is not None:
            pnpss.append(pnps)
        if percmut is not None:
            percmuts.append(percmut)
        if ffdeg_pw is not None:
            ffdeg_pws.append(ffdeg_pw[0])
            ffdeg_poss[nm] = ffdeg_pw[1]
        if pnpn is not None:
            pnpns.append(pnpn)
    baseout = basename(genes_df_f).replace('.df', '')
    if len(sitess) > 0 and any([s is not None for s in sitess]):
        concat(sitess, sort=False).to_pickle(join(outdir, baseout + '.muts.df'))
    if len(pnpss) > 0 and any([p is not None for p in pnpss]):
        concat(pnpss, sort=False).to_pickle(join(outdir, baseout + '.pnps.df'))
    if len(percmuts) > 0:
        concat([p[0] for p in percmuts], sort=False).to_pickle(join(outdir, baseout + '.percmut.df'))
        concat([p[1] for p in percmuts], sort=False).to_pickle(join(outdir, baseout + '.codons.df'))
    if len(ffdeg_pws) > 0:
        concat(ffdeg_pws, sort=False).to_pickle(join(outdir, baseout + '.ffdeg_pi_wit.df'))
        Series(ffdeg_poss).to_pickle(join(outdir, baseout + '.ffdeg_poss.df'))
    if len(pnpns) > 0:
        concat(pnpns, sort=False).to_pickle(join(outdir, baseout + '.pnpn.df'))
def million_codes():
    # This creates one million permutations of the genetic code
    aas, _, _ = get_codon_table()
    df = read_pickle(join(General.Basepath, 'All_4_60_mutation_codon_counts.df'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for i in range(100):
        codon_risk(df, aas, 'All_{:02d}'.format(i), True, subdir='Million')
    compiled_f = join(CodeAnalysis.CodonsDir, 'Codon_risk_compiled.dat')
    ret = defaultdict(list)
    for i, fn in enumerate(glob(join(CodeAnalysis.CodonsDir, 'Million', '*.dat'))):
        ret_l = Utils.Load(fn)
        for var in ['n+_risk', 'c+_risk', 'o+_risk', 'hyd_risk', 'PR_risk']:
            ret[var].extend((ret_l[var] if i == 0 else ret_l[var][1:]))
        print(i)
    Utils.Write(compiled_f, ret)
    return compiled_f
def filter_db(dbdct, analysisclass, mingenes):
    indir = analysisclass.OutDir
    pnps_piwig_f = join(indir, 'PNPSPiWiGenes.dat')
    genes_in_use = []
    if exists(pnps_piwig_f):
        pnpsgenes = Utils.Load(pnps_piwig_f)
    else:
        for fname in glob(join(indir, '*pnps.df')) + glob(join(indir, '*pi_wit.df')):
            print(fname)
            genes_in_use.extend(list(read_pickle(fname).index.get_level_values(0).unique()))
        pnpsgenes = list(set(genes_in_use))
        Utils.Write(pnps_piwig_f, genes_in_use)
    ret = {}
    pnpsgenes = set(pnpsgenes)
    for k, v in dbdct.items():
        if len(set(v).intersection(pnpsgenes)) >= mingenes:
            ret[k] = v
    return ret
def get_retweet_factors_impressions():
    # Use the Twitter search API to retrieve recent tweets that contain high retweet scoring
    # reference data (User Mentions and Hashtags) as well as Chase mentions.
    # To be run after Twitter_Analysis.analyze_retweet_factors.
    twitter = get_twitter_conn()
    score_table = read_pickle('k_score_table.pkl')
    sorted_score_table = score_table[score_table.Type.isin(['User Mention', 'Hashtag'])]\
        .sort(columns=['Score'], ascending=False)
    # Retrieve names of top scoring reference data as well as anything that has
    # 'chase' or 'sapphire' in the name
    top_score_table = sorted_score_table[sorted_score_table.Score > 1000.0]
    chase_tags = np.array(['chase' in name_val
                           for name_val in sorted_score_table.Name.values.ravel()])
    sapphire_tags = np.array(['sapphire' in name_val
                              for name_val in sorted_score_table.Name.values.ravel()])
    chase_score_table = sorted_score_table[chase_tags | sapphire_tags]
    table_array = [top_score_table, chase_score_table]
    retweet_tag_tweets = {}
    # Loop through each tag and retrieve the most recent 1000 results
    for data_table in table_array:
        for _, score_row in data_table.iterrows():
            tag_results = []
            results = twitter.cursor(twitter.search, q=score_row[0], count=100)
            n = 1
            for result in results:
                tag_results.append(result)
                check_twitter_timeout(twitter)
                n += 1
                if n > 1000:
                    break
            retweet_tag_tweets[score_row[0]] = tag_results
    pickle.dump(retweet_tag_tweets, open("retweet_tag_tweets.pkl", "wb"))
def do_one_group_pnpn(nm, f_prefix, genegroup, analysisclass, mingenes=3):
    ret = []
    for prefix in set([g[:-4] for g in genegroup]):
        f_in = join(analysisclass.OutDir, prefix + '.pnpn.df')
        if not exists(f_in):
            continue
        try:
            ldf = read_pickle(f_in)
            ldf = ldf.loc[[g for g in genegroup if g in ldf.index]]
        except KeyError:
            continue
        if ldf.shape[0] > 0:
            ret.append(ldf)
    if len(ret) == 0:
        return
    outdf = concat(ret, sort=False)
    outpath = mkdirifnotexists(join(analysisclass.OutDirCollate, 'pnpn'))
    if outdf.groupby(level=0).first().shape[0] < mingenes:
        return
    outdf.to_pickle(join(outpath, f_prefix + '_' + nm + '.pnpn.df'))
def calc_codon_costs(out_f=None, force_rerun=False):
    # Guard against out_f=None: exists(None) raises a TypeError
    if out_f is not None and exists(out_f) and not force_rerun:
        return read_pickle(out_f)
    aas, _, _ = get_codon_table()
    ret = {}
    for cod_1, aa1 in aas.to_dict().items():
        for cod_2, aa2 in aas.to_dict().items():
            ret[(cod_1, cod_2)] = {'aa_s': aa1, 'aa_e': aa2}
    aa_props = read_csv('./aa_NCHP.csv', index_col=0)
    ret = DataFrame(ret).T
    ret.index.names = ['Codon_s', 'Codon_e']
    codon_props = ret.join(aa_props, on='aa_s').join(aa_props, on='aa_e',
                                                     lsuffix='_s', rsuffix='_e')
    for v in ['C', 'N', 'hyd', 'PR']:
        codon_props[v + '_d'] = codon_props[v + '_e'] - codon_props[v + '_s']
        codon_props[v + '_abs_d'] = abs(codon_props[v + '_d'])
    if out_f is not None:
        codon_props.to_pickle(out_f)
    return codon_props
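# Hedged usage note for calc_codon_costs: with an out_f path the first call
# computes the table and pickles it, later calls with the same out_f reuse the
# cached pickle, and force_rerun=True recomputes. The path and helper name
# below are illustrative assumptions.
def _example_codon_costs(cache_path='./codon_costs_cache.df'):
    costs = calc_codon_costs(out_f=cache_path)                     # computed and cached
    costs_cached = calc_codon_costs(out_f=cache_path)              # loaded from the pickle
    costs_fresh = calc_codon_costs(out_f=cache_path, force_rerun=True)  # recomputed
    return costs, costs_cached, costs_fresh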
def get_measurements_BATS():
    ctd = concat([read_csv(f, header=None, sep='\t')
                  for f in glob(join(Biodata.ALOHA_BATS.BATSASCIIDir, '*ctd.txt'))],
                 sort=False).dropna().replace(-999, np.nan)
    ctd.columns = ['ID', 'Collection_datetime', 'Latitude', 'Longitude', 'Pressure [dbar]',
                   'Depth', 'Temperature [c]', 'Conductivity [S/m]', 'Salinity',
                   'Oxygen [umol/kg]', 'Beam Attenuation Coefficient [1/m]', 'Flourescence',
                   'PAR [uE/m2/s]']

    def todatetime(x):
        year = int(x)
        rem = x - year
        basedt = datetime(year, 1, 1)
        return basedt + timedelta(seconds=(basedt.replace(year=year + 1) - basedt)
                                  .total_seconds() * rem)

    ctd.Collection_datetime = ctd.Collection_datetime.apply(todatetime)
    bot = read_csv(Biodata.ALOHA_BATS.BATSBottle, header=Biodata.ALOHA_BATS.BATSBottleHeader,
                   sep='\t').reset_index().replace(-999, np.nan)
    bot.columns = ['ID', 'yyyymmdd', 'Collection_datetime', 'time', 'Latitude', 'Longitude',
                   'Depth', 'Temp', 'CTD_S', 'Sal1', 'Sig-th', 'O2_1', 'OxFix', 'Anom1', 'CO2',
                   'Alk', 'Nitrate+Nitrite', 'Nitrite', 'Phosphate', 'Silicate', 'POC', 'PON',
                   'TOC', 'TN', 'Bact', 'POP', 'TDP', 'SRP', 'BSi', 'LSi', 'Pro', 'Syn',
                   'Piceu', 'Naneu']
    bot.Collection_datetime = bot.Collection_datetime.apply(todatetime)
    batsmd = read_pickle(Biodata.ALOHA_BATS.metadataDF)
    batsmd = batsmd[batsmd.Cruise_series == 'BATS']
    ret = {}
    for nm, row in batsmd.iterrows():
        botrow = _getrowmd(bot, row, 'Depth', False)
        ctdrow = _getrowmd(ctd, row, 'Depth', False)
        botrow.index = [i.replace('Orig', 'BotOrig') for i in botrow.index]
        ret[nm[0]] = concat([ctdrow, botrow[10:]])
    DataFrame({k: v for k, v in ret.items() if type(v) == Series}).T\
        .to_pickle(Biodata.ALOHA_BATS.BATSSampleMeasurementsDF)
def codon_bias(outdir):
    # Data source: https://elifesciences.org/articles/41043
    df = read_pickle('./resource/Proc_strains_codons.df')
    thr = df.loc[['ACT', 'ACA', 'ACC', 'ACG']]
    thr = thr.truediv(thr.sum()).T
    ile = df.loc[['ATT', 'ATA', 'ATC']]
    ile = ile.truediv(ile.sum()).T
    print(wilcoxon(thr['ACT'], thr['ACA']))
    print(wilcoxon(thr['ACC'], thr['ACG']))
    print(wilcoxon(ile['ATT'], ile['ATA']))
    _, ax = plt.subplots(1, figsize=(8.2, 5.2), dpi=144)
    bp = ax.violinplot(concat([thr, ile], axis=1).T, positions=[1, 2, 3, 4, 6, 7, 8])
    for partname in ('cbars', 'cmins', 'cmaxes'):
        vp = bp[partname]
        vp.set_edgecolor('k')
        vp.set_linewidth(1)
    [m.set_color('#0d4c7c') for m in bp['bodies'][:4]]
    [m.set_color('#891919') for m in bp['bodies'][-3:]]
    ax.set_ylim(0, 1.)
    ax.set_xticks(range(1, 9))
    plt.savefig(join(outdir, 'Codon_usage.png'), dpi=144)
def parseeggnog():
    df_f = join(Annotate.OM_RGC.EggnogDir, 'OM-RGC_annotations.df')
    if exists(df_f):
        df = read_pickle(df_f)
    else:
        df = concat([read_csv(f, sep='\t', header=None)
                     for f in sorted(glob(join(Annotate.OM_RGC.AnnotDir, '*.annotations')))])
        df.columns = ['GeneID', 'seed_eggNOG_ortholog', 'seed_ortholog_evalue',
                      'seed_ortholog_score', 'Predicted_taxonomic_group',
                      'Predicted_protein_name', 'GeneOntology', 'EC_number', 'KEGG_ko',
                      'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',
                      'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'tax_scope', 'eggNOG_OGs', 'bestOG',
                      'COG_Functional_Category', 'eggNOG_description']
        df = df.set_index('GeneID')
        df.to_pickle(df_f)
    for col in df.columns[3:19]:
        if col == 'bestOG':
            continue
        if exists(join(Annotate.OM_RGC.EggnogDir, col + '.dat')):
            continue
        ret = defaultdict(list)
        locser = df[col].dropna()
        for nm in locser.index:
            if col == 'eggNOG_OGs':
                terms = set([x.split('@')[0] for x in locser[nm].split(',')])
            elif col == 'COG_Functional_Category':
                terms = set([x for x in locser[nm]])
            else:
                terms = set(locser[nm].split(','))
            for term in terms:
                ret[term].append(nm)
        Write(join(Annotate.OM_RGC.EggnogDir, col + '.dat'), ret)
        print(col)
def multi_organism_analyze():
    # This replicates the analysis presented in Fig. 4
    # Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5581930/
    codons_all = read_pickle('./resource/ModelOrganisms.df').set_index('Taxid')
    # Take only the organisms with more than 50K codons in the calculation
    codons_all = codons_all.loc[codons_all.iloc[:, 11:].sum(1) >= 50000]
    aas, _, _ = get_codon_table()
    # Create alternative codes for each organism and transition-transversion rate
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for taxid, row in codons_all.iterrows():
        codons = row[11:].astype(float)
        for titv in [0.2, 0.25, 0.333, 0.5, 0.667, 1, 1.5, 2, 3, 4, 5]:
            ti = (2 * titv) / (1 + 2 * titv)
            codon_risk(None, aas, 'Tax_{}_Rate_{}'.format(taxid, titv), all_mutations=False,
                       external_counts=codons, external_titv=(ti, 1 - ti), subdir='MultiOrg')
    # Collate the results in one table
    proc_stats = {}
    for fnm in glob(join(CodeAnalysis.CodonsDir, 'MultiOrg/*.dat')):
        tax = float(basename(fnm).split('Tax_')[-1].split('_')[0])
        rate = float(basename(fnm).split('Rate_')[-1].split('_')[0])
        ret = Utils.Load(fnm)
        npr = np.array(ret['n+_risk'])
        cpr = np.array(ret['c+_risk'])
        proc_stats[(tax, rate)] = {'cpr_p': sum(cpr[1:] <= cpr[0]) / 10000.,
                                   'npr_p': sum(npr[1:] <= npr[0]) / 10000.,
                                   'ncpr_p': sum((cpr[1:] <= cpr[0]) & (npr[1:] <= npr[0])),
                                   'cpr': cpr[0],
                                   'npr': npr[0]}
    DataFrame(proc_stats).to_pickle(join(CodeAnalysis.CodonsDir, 'MultiOrg_rates.df'))
def get_measurements_GEOTRACES(depth_tolerance=None):
    sample_md = read_pickle(Biodata.bioGEOTRACES.metadataDF)
    disc_df = read_csv(Biodata.bioGEOTRACES.DiscreteSampleTXT, sep='\t')
    disc_df = disc_df.rename(columns={'yyyy-mm-ddThh:mm:ss.sss': 'Collection_datetime'})
    disc_df.Collection_datetime = disc_df.Collection_datetime\
        .apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))
    disc_df = disc_df[disc_df.Collection_datetime > datetime(2000, 1, 1)]
    ret = {}
    for nm, row in sample_md.iterrows():
        d_row = _getrowmd(disc_df, row, depth_tolerance=depth_tolerance)
        if depth_tolerance is not None:
            if type(d_row) == Series:
                ret[(*nm, d_row.name)] = d_row[[i for i in d_row.index
                                                if (not i.startswith('QV')
                                                    and (not i.startswith('STANDARD_DEV')))]]
            else:
                # if more than one date, take closest
                datediffs = (d_row.Collection_datetime.astype(np.datetime64)
                             - d_row.Orig_datetime).apply(np.abs)
                d_row = d_row.loc[datediffs[datediffs == datediffs.min()].index]
                for rownm, row in d_row.iterrows():
                    ret[(*nm, rownm)] = row[[i for i in row.index
                                             if (not i.startswith('QV')
                                                 and (not i.startswith('STANDARD_DEV')))]]
        else:
            ret[nm] = d_row[[i for i in d_row.index
                             if (not i.startswith('QV')
                                 and (not i.startswith('STANDARD_DEV')))]]
    if depth_tolerance is None:
        DataFrame(ret).dropna(how='all').T.groupby(level=0).first()\
            .to_pickle(Biodata.bioGEOTRACES.SampleMeasurementsDF)
    else:
        df = DataFrame(ret).dropna(how='all').T.reset_index()\
            .rename(columns={'level_0': 'SampleID', 'level_2': 'MeasurementID'})\
            .set_index(['SampleID', 'MeasurementID']).drop('level_1', axis=1)
        df.to_pickle(Biodata.bioGEOTRACES.SampleMeasurementsDF
                     .replace('.df', '.tol_{}.df'.format(depth_tolerance)))
def do_collate(f_prefixes, minpos, minperc, mingenes, minsamples, minsamples_gene):
    ret = defaultdict(dict)
    ps = defaultdict(dict)
    pn = defaultdict(dict)
    for fname in glob(f_prefixes + '*.pnps.df'):
        grpname = split3way(fname)[1].replace('.pnps', '')
        df = read_pickle(fname)
        keepinds = df.index.get_level_values(0).isin(
            ((df.groupby(level=0).count() > 1).sum(1) >= minsamples_gene)
            .replace(False, np.nan).dropna().index)
        df = df.loc[keepinds]
        df = df.loc[:, (df.groupby(level=0).count() > 1).sum(0) >= mingenes]
        if df.shape[1] <= minsamples:
            continue
        gs = df[['GeneSites']]
        df = df.drop('GeneSites', axis=1)
        for col in df.columns:
            coldf = df[[col]].join(gs).dropna()
            coldf = coldf.groupby(level=1).sum()
            coldf = coldf[col].truediv(coldf['GeneSites'])
            ret[grpname][col] = coldf.NS / coldf.S
            pn[grpname][col] = coldf.NS
            ps[grpname][col] = coldf.S
        print(grpname)
    df = DataFrame(ret)
    ps = DataFrame(ps)
    pn = DataFrame(pn)
    df.to_csv(join(General.Basepath, '{}_{}_{}_{}_{}_{}.csv'.format(
        f_prefixes.split('/')[-1], minpos, minperc, mingenes, minsamples, minsamples_gene)))
    pn.to_csv(join(General.Basepath, '{}_{}_{}_{}_{}_{}.pn.csv'.format(
        f_prefixes.split('/')[-1], minpos, minperc, mingenes, minsamples, minsamples_gene)))
    # General.Basepath (not General.Basepath.OutDirCollate, which does not exist)
    # keeps all three collated tables in the same directory
    ps.to_csv(join(General.Basepath, '{}_{}_{}_{}_{}_{}.ps.csv'.format(
        f_prefixes.split('/')[-1], minpos, minperc, mingenes, minsamples, minsamples_gene)))
def load(self, path):  # TODO remove in 0.13
    import warnings
    from pandas.io.pickle import read_pickle
    warnings.warn("load is deprecated, use pd.read_pickle", FutureWarning)
    return read_pickle(path)
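# Minimal sketch of the replacement the deprecation warning above points to,
# assuming only pandas; 'frame.pkl' and the helper name are illustrative.
import pandas as pd

def _example_read_pickle_roundtrip():
    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_pickle('frame.pkl')           # replaces the old .save()
    return pd.read_pickle('frame.pkl')  # replaces the deprecated .load()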
def run_model():
    # Run the models against the data and write the results to a spreadsheet
    # Retrieve the data that we processed in prepare_and_merge_data()
    clean_job_data = read_pickle('Clean Job Data.pkl')
    clean_job_data = clean_job_data[clean_job_data.estimated_salary > 0.0]

    # These are the columns that we don't need
    drop_cols = ['company_id', 'date_posted', 'description', 'estimated_salary', 'expiration_date']
    drop_cols.extend(['job_id', 'pay_rate', 'position', 'skill_count', 'source_uri', 'estimated_seniority'])
    drop_cols.extend(['name', 'company_industry', 'company_type', 'is_public', 'number_of_employees', 'status_name'])
    drop_cols.extend(['status_code', 'lnkn_description', 'websiteUrl', 'employee_count_code', 'lnkn_universal_name'])
    drop_cols.extend(['company_type_name', 'lnkn_name', 'employee_count_name', 'company_type_code', 'clean_pay_rate_annualized'])

    # Use records that have the pay rate provided in the job post - this is a small set
    pay_rate_data = clean_job_data[clean_job_data.clean_pay_rate_annualized.notnull()]
    pay_cols = get_clean_column_names(pay_rate_data, drop_cols)
    print 'Number of Clean Pay Rate Records: {}'.format(len(pay_rate_data))
    x1 = pay_rate_data[pay_cols].astype(int)
    y1 = pay_rate_data.clean_pay_rate_annualized
    _, _, y1_train, y1_test = get_train_test_sets(x1, y1, True)
    print '{} Training Records / {} Testing Records'.format(y1_train.size, y1_test.size)

    # Use records that have an estimated salary, which we will round to nearest 1k
    print 'Number of Estimated Salary Records: {}'.format(len(clean_job_data))
    est_sal_cols = get_clean_column_names(clean_job_data, drop_cols)
    x2 = clean_job_data[est_sal_cols].astype(int)
    y2 = clean_job_data.estimated_salary.apply(round_to_thousands)
    _, _, y2_train, y2_test = get_train_test_sets(x2, y2, False)
    print '{} Training Records / {} Testing Records'.format(y2_train.size, y2_test.size)

    # Different approach - groups salaries in amounts of 10k and see if we can get better results
    y3 = pay_rate_data.clean_pay_rate_annualized.apply(convert_to_salary_range)  # Convert Pay Rate to a range
    y4 = clean_job_data.estimated_salary.apply(convert_to_salary_range)  # Convert Est Salary to a range

    # Transform the independent variables using PCA to see if that helps on some of the models
    pca = PCA().set_params(n_components=0.9)
    x3 = normalize_and_apply_pca(x1, pca)
    x4 = normalize_and_apply_pca(x2, pca)

    results_book = xlwt.Workbook()
    head_style = get_header_style()
    pyrt_sh = results_book.add_sheet('Pay Rate')
    pyrt_sh.write(0, 0, "Model Name", head_style)
    pyrt_sh.write(0, 1, "Dataset", head_style)
    pyrt_sh.write(0, 2, "Training Score", head_style)
    pyrt_sh.write(0, 3, "Testing Score", head_style)
    pyrt_sh.write(0, 4, "Training MSE", head_style)
    pyrt_sh.write(0, 5, "Testing MSE", head_style)
    pyrt_sh.write(0, 6, "Best K", head_style)
    pyrt_sh.write(0, 7, "Best Parameters", head_style)
    estsal_sh = results_book.add_sheet('Est Salary')
    estsal_sh.write(0, 0, "Model Name", head_style)
    estsal_sh.write(0, 1, "Dataset", head_style)
    estsal_sh.write(0, 2, "Training Score", head_style)
    estsal_sh.write(0, 3, "Testing Score", head_style)
    estsal_sh.write(0, 4, "Training MSE", head_style)
    estsal_sh.write(0, 5, "Testing MSE", head_style)
    estsal_sh.write(0, 6, "Best K", head_style)
    estsal_sh.write(0, 7, "Best Parameters", head_style)

    # Do an initial test using linear models with different shapes for the dependent variable
    linear_datasets = [("Pay Rate", x1, y1, True),
                       ("Log Pay Rate", x1, np.log(y1), True),
                       ("Sqrt Pay Rate", x1, np.sqrt(y1), True),
                       ("Est Salary", x2, y2, False),
                       ("Log Est Salary", x2, np.log(y2), False),
                       ("Sqrt Est Salary", x2, np.sqrt(y2), False),
                       ("Pay Rate Range", x1, y3, True),
                       ("Log Pay Rate Range", x1, np.log(y3), True),
                       ("Sqrt Pay Rate Range", x1, np.sqrt(y3), True),
                       ("Est Salary Range", x2, y4, False),
                       ("Log Est Salary Range", x2, np.log(y4), False),
                       ("Sqrt Est Salary Range", x2, np.sqrt(y4), False)]
    linear_models = [("OLS", linear_model.LinearRegression()),
                     ("Ridge", linear_model.RidgeCV(normalize=True, fit_intercept=False,
                                                    scoring='mean_squared_error', cv=5)),
                     ("Lasso", linear_model.LassoCV(normalize=True, fit_intercept=False, cv=5))]
    prow = 1
    erow = 1
    for data in linear_datasets:
        x_train, x_test, y_train, y_test = get_train_test_sets(data[1], data[2], data[3])
        for model in linear_models:
            train_score, test_score, train_mse, test_mse = get_model_values(
                model[1], x_train, y_train, x_test, y_test)
            if data[3] == True:
                prow = write_to_spreadsheet(model[0], data[0], train_score, test_score,
                                            train_mse, test_mse, None, None, pyrt_sh, prow)
            else:
                erow = write_to_spreadsheet(model[0], data[0], train_score, test_score,
                                            train_mse, test_mse, None, None, estsal_sh, erow)

    # Test on a different set of models, where we're applying PCA to reduce the number of features
    datasets = [("Pay Rate", x1, y1, True),
                ("PCA Pay Rate", x3, y1, True),
                ("Pay Rate Range", x1, y3, True),
                ("PCA Pay Rate Range", x3, y3, True),
                ("Est Salary", x2, y2, False),
                ("PCA Est Salary", x4, y2, False),
                ("Est Salary Range", x2, y4, False),
                ("PCA Est Salary Range", x4, y4, False)]
    models = [("KNN", neighbors.KNeighborsClassifier(),
               {'n_neighbors': np.arange(3, 9), 'weights': ['uniform', 'distance'], 'p': [1, 2]}),
              ("Decision Tree", tree.DecisionTreeClassifier(),
               {'criterion': ['gini', 'entropy'], 'max_features': [None, 'auto', 'log2']}),
              ("Random Forest", ensemble.RandomForestClassifier(),
               {'criterion': ['gini', 'entropy'], 'max_features': [None, 'auto', 'log2'],
                'n_estimators': np.arange(10, 110, 10)})]
    for data in datasets:
        x_train, x_test, y_train, y_test = get_train_test_sets(data[1], data[2], data[3])
        for model in models:
            _, best_params, train_score, test_score, train_mse, test_mse = get_grid_search_values(
                model[1], model[2], x_train, y_train, x_test, y_test, 'accuracy')
            if data[3] == True:
                prow = write_to_spreadsheet(model[0], data[0], train_score, test_score,
                                            train_mse, test_mse, None, best_params, pyrt_sh, prow)
            else:
                erow = write_to_spreadsheet(model[0], data[0], train_score, test_score,
                                            train_mse, test_mse, None, best_params, estsal_sh, erow)

    # Use the best K on LDA - had collinearity issues with full feature set
    datasets = [("Pay Rate Range Best K", x1, y3.values.ravel(), True),
                ("Est Salary Range Best K", x2, y4.values.ravel(), False)]
    models = [("LDA", lda.LDA())]
    for data in datasets:
        for model in models:
            best_k, train_score, test_score, train_mse, test_mse = get_best_k_model(
                model[1], 20, data[1], data[2])
            if data[3] == True:
                prow = write_to_spreadsheet(model[0], data[0], train_score, test_score,
                                            train_mse, test_mse, best_k, None, pyrt_sh, prow)
            else:
                erow = write_to_spreadsheet(model[0], data[0], train_score, test_score,
                                            train_mse, test_mse, best_k, None, estsal_sh, erow)

    results_book.save("Model Results.xls")
def update_company_data_from_linkedin():
    # Retrieves all of the company names from the job postings,
    # and queries LinkedIn for additional information

    # Define CONSUMER_KEY, CONSUMER_SECRET, USER_TOKEN, and USER_SECRET from the
    # credentials provided in your LinkedIn application

    # Instantiate the developer authentication class
    authentication = linkedin.LinkedInDeveloperAuthentication(LINKEDIN_CONSUMER_KEY,
                                                              LINKEDIN_CONSUMER_SECRET,
                                                              LINKEDIN_OAUTH_USER_TOKEN,
                                                              LINKEDIN_OAUTH_USER_SECRET,
                                                              RETURN_URL,
                                                              linkedin.PERMISSIONS.enums.values())
    # Pass it in to the app...
    application = linkedin.LinkedInApplication(authentication)

    job_data = read_pickle('Job Data.pkl')
    company_list = np.unique(job_data.name.values.ravel())

    # Set dict of return values and inputs
    comp_sels = [{'companies': ['name', 'universal-name', 'description', 'company-type',
                                'industries', 'status', 'employee-count-range', 'specialties',
                                'website-url']}]
    comp_params = {'keywords': None}

    # Data dictionaries - going to convert them into Pandas dataframes
    linkedin_companies = {}
    linkedin_industries = {}
    linkedin_specialities = {}

    # Loop through the unique set of companies
    for idx, comp_name in enumerate(company_list):
        comp_params['keywords'] = comp_name  # Set company name as keyword
        comp_vals = application.search_company(selectors=comp_sels, params=comp_params)
        if comp_vals['companies']['_total'] == 0:  # No results returned
            continue

        # Calculate the edit distance between the returned results and the input name
        dist_vals = []
        for jdx, company in enumerate(comp_vals['companies']['values']):
            link_comp_name = company['name']
            name_dist = fuzzy_match(comp_name, link_comp_name)
            dist_vals.append([link_comp_name, name_dist, jdx])

        # Sort the values and choose the best one
        sort_dist_vals = sorted(dist_vals, key=lambda s: s[1])
        best_guess_company = comp_vals['companies']['values'][sort_dist_vals[0][2]]
        best_guess_name = sort_dist_vals[0][0]

        status_code, status_name = get_lnkin_code_name(best_guess_company, 'status')
        company_type_code, company_type_name = get_lnkin_code_name(best_guess_company, 'companyType')
        employee_count_code, employee_count_name = get_lnkin_code_name(best_guess_company, 'employeeCountRange')

        # Store company related data in a dictionary
        linkedin_company = {}
        linkedin_company['name'] = comp_name
        linkedin_company['lnkn_name'] = best_guess_name
        linkedin_company['lnkn_universal_name'] = best_guess_company.get('universalName')
        linkedin_company['lnkn_description'] = best_guess_company.get('description')
        linkedin_company['status_code'] = status_code
        linkedin_company['status_name'] = status_name
        linkedin_company['company_type_code'] = company_type_code
        linkedin_company['company_type_name'] = company_type_name
        linkedin_company['employee_count_code'] = employee_count_code
        linkedin_company['employee_count_name'] = employee_count_name
        linkedin_company['websiteUrl'] = best_guess_company.get('websiteUrl')
        linkedin_companies[idx] = linkedin_company

        # Store industry data in a separate dict
        if 'industries' in best_guess_company:
            if best_guess_company['industries']['_total'] > 0:
                ind_start = len(linkedin_industries)
                for jdx, industry in enumerate(best_guess_company['industries']['values']):
                    linkedin_industry = {}
                    linkedin_industry['lnkn_name'] = best_guess_name
                    linkedin_industry['industry_type_code'] = industry['code']
                    linkedin_industry['industry_type_name'] = industry['name']
                    linkedin_industries[ind_start + jdx] = linkedin_industry

        # Store speciality data in a separate dict
        if 'specialties' in best_guess_company:
            if best_guess_company['specialties']['_total'] > 0:
                spec_start = len(linkedin_specialities)
                for jdx, speciality in enumerate(best_guess_company['specialties']['values']):
                    linkedin_speciality = {}
                    linkedin_speciality['lnkn_name'] = best_guess_name
                    linkedin_speciality['speciality'] = speciality
                    linkedin_specialities[spec_start + jdx] = linkedin_speciality
def test_ds_words():
    # Reads the description and job position text from a file of job posts,
    # tokenizing the results to view top n words, bigrams, and trigrams. Results
    # are written to a text file.
    job_data = read_pickle('Job Data.pkl')

    # Tokenize the job text
    summ_words = []
    desr_words = []
    for _, row in job_data.iterrows():
        if row['position'] is not None:
            summ_words += get_word_tokenize(row['position'])
        if row['description'] is not None:
            desr_words += get_word_tokenize(row['description'])

    stopwords = nltk.corpus.stopwords.words('english')
    stopwords += [',', '.', ':', '(', ')', '-', ';', '&', '!', '?', '\'s']
    words_file = open("top_N_words.log", "w")

    # Get the Top N words
    top_n_summ_words = get_top_n_words(summ_words, 10, stopwords)
    top_n_desr_words = get_top_n_words(desr_words, 50, stopwords)
    print >> words_file, 'Top 10 Job Position Words\n'
    for top_word in top_n_summ_words:
        print >> words_file, top_word
    print >> words_file, '\n\n'
    print >> words_file, 'Top 50 Job Description Words\n'
    for top_word in top_n_desr_words:
        print >> words_file, top_word

    min_hits = int(len(job_data) * 0.05)
    print >> words_file, 'Top 50 Job Description Bigrams\n'
    # Get the Bigrams
    big_2_words = get_ngrams(2, desr_words, min_hits, 50, stopwords)
    print >> words_file, '\n\n'
    for top_word in big_2_words:
        print >> words_file, ' '.join(top_word)
    print >> words_file, '\n\n'
    print >> words_file, 'Top 50 Job Description Trigrams\n'
    # Get the Trigrams
    big_3_words = get_ngrams(3, desr_words, min_hits, 50, stopwords)
    for top_word in big_3_words:
        print >> words_file, ' '.join(top_word)
    words_file.close()