def do_collate_pnpn(f_prefixes, minpos, minperc, mingenes, minsamples, minsamples_gene):
    tmpdir = mkdirifnotexists(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 'tmpfiles'))
    for fname in glob(f_prefixes + '*.pnpn.df'):
        _collate_pnpn_inner(fname, mingenes, minsamples, minsamples_gene, tmpdir)
    ret = []
    ret_g1 = []
    ret_g2 = []
    for fname in glob(join(tmpdir, '*.tmp.df')):
        ret.append(read_pickle(fname).T)
    for fname in glob(join(tmpdir, '*.tmp.g1.df')):
        ret_g1.append(read_pickle(fname).T)
    for fname in glob(join(tmpdir, '*.tmp.g2.df')):
        ret_g2.append(read_pickle(fname).T)
    outdir = join(SNP.OM_RGC.OutDirCollate, 'pnpn')
    with open(join(outdir, 'pNpNCases.txt'), 'w') as ftxt:
        ftxt.write('Conditions for pN groups in this analysis\n')
        ftxt.write('Always pN(G1)/pN(G2) so invert if G1 is more conservative\n\n')
        bigdf = concat(ret, sort=False)
        bigg1 = concat(ret_g1, sort=False)
        bigg2 = concat(ret_g2, sort=False)
        for j, col in enumerate(bigdf.index.get_level_values(0).unique()):
            ftxt.write('Case {}: {}\n'.format(j,col))
            bigdf.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 
                                            'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.csv'\
                                  .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
            bigg1.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 
                                            'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.g1.csv'\
                                  .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
            bigg2.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 
                                            'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.g2.csv'\
                                  .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
def unite_sampledata():
    concat([
        read_pickle(Biodata.bioGEOTRACES.metadataDF),
        read_pickle(Biodata.ALOHA_BATS.metadataDF),
        read_pickle(Biodata.TARA.metadataDF)
    ],
           sort=False).to_pickle(Biodata.United.metadataDF)
def do_one_group_ffdeg_piwit(nm, f_prefix, genegroup, analysisclass, mingenes=3):
    ret = []
    retlens = []
    for prefix in set([g[:-4] for g in genegroup]):
        f_in = join(analysisclass.OutDir, prefix + '.ffdeg_pi_wit.df')
        if not exists(f_in):
            continue
        try:
            ldf = read_pickle(f_in)
            ldf = ldf.loc[[g for g in genegroup if g in ldf.index]]
            llensdf = read_pickle(f_in.replace('_pi_wit','_poss'))
            llensdf = llensdf.loc[[g for g in genegroup if g in llensdf.index]]
        except KeyError:
            continue
        if ldf.shape[0] > 0:
            ret.append(ldf)
            retlens.append(llensdf)
    if len(ret) == 0:
        return
    outdf = concat(ret, sort = False).dropna(how='all').dropna(how='all', axis = 1)
    if outdf.shape[0] < mingenes:
        return
    outdf_lens = concat(retlens)
    outdf_lens.name = 'Length'
    ret = {}
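    # Length-weighted pooling per sample: pi = sum(pi_gene * len_gene) / sum(len_gene),
    # so longer genes contribute proportionally more to the pooled estimate.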
    for col in outdf:
        coldf = outdf[[col]].multiply(outdf_lens,axis=0).join(outdf_lens).dropna().sum()
        ret[col] = {'pi':coldf[col] / coldf['Length'], 'length':coldf['Length'], 
                    'num_genes':len(outdf[col].dropna())}
    outpath = mkdirifnotexists(join(analysisclass.OutDirCollate, 'ffdeg'))
    DataFrame(ret).to_pickle(join(outpath, f_prefix + '_' + nm + '.ffdeg_pi_wit.df'))
Example #4
def create_codon_trans_matrix(dirsname, tmpdir, grpnm):
    chdirmkifnotexist(join(tmpdir, 'tmpmq'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for fname in glob(join(dirsname, '*codons*')):
        if exists(
                join(tmpdir,
                     grpnm + basename(fname).replace('codons', 'sums'))):
            continue
        do_one_unite(fname, grpnm, tmpdir)
    df = None
    for f in glob(join(tmpdir, grpnm + '*sums.df')):
        print(basename(f))
        if df is None:
            df = read_pickle(f)
        else:
            df = concat([df, read_pickle(f)],
                        sort=False).groupby(level=[0, 1]).sum()
    df.columns = MultiIndex.from_tuples([('per',i) if '_s' not in i \
                                         else ('sums',i.replace('_s','')) for i in df.columns])
    sms = df.sums.sum().truediv(3)

    def applyfunc(ldf):
        ret = ldf.per.truediv(ldf.sums).loc[ldf.name].iloc[:, 0]
        ret[ldf.name] = 1 - ret.sum()
        return ret

    def applyfunc_counts(ldf):
        adds = ldf.sums.iloc[0][0]
        ret = ldf.per.loc[ldf.name].iloc[:, 0]
        ret[ldf.name] = adds - ret.sum()
        return ret

    df_counts = df.groupby(
        level=1,
        axis=1).apply(lambda x: x.groupby(level=0).apply(applyfunc_counts))
    df_norm = df.groupby(
        level=1, axis=1).apply(lambda x: x.groupby(level=0).apply(applyfunc))
    aas, _, _ = get_codon_table()
    df_aas = df_counts
    df_aas['aa_start'] = [aas[a] for a in df_counts.index.get_level_values(0)]
    df_aas['aa_end'] = [aas[a2] + ('.' if a1!=a2 and aas[a1]==aas[a2] else '') \
                        for a1, a2 in \
                        zip(df_counts.index.get_level_values(0),df_counts.index.get_level_values(1))]
    df_aas = df_aas.groupby(['aa_start', 'aa_end']).sum()
    df_norm.to_pickle(
        join(General.Basepath, grpnm + '_4_60_mutation_codons.df'))
    df_counts.to_pickle(
        join(General.Basepath, grpnm + '_4_60_mutation_codon_counts.df'))
    df_aas.to_pickle(
        join(General.Basepath, grpnm + '_4_60_mutation_aas_counts.df'))
    sms.to_pickle(join(General.Basepath, grpnm + '_4_60_nucleotide_count.df'))
Example #5
def loadSitePhreeqcData(site, processedSitesDir = DEFAULT_DIR):
    """
    Retrieves site PHREEQC data for an individual site from a directory of processed sites.

    Parameters
    ----------
    site : string
        name of site to retrieve, with or without USGS- tag at beginning.

    processedSitesDir : string (optional)
        directory that contains the processed site directory associated with the desired site. It is important to change this if the default is not correct. (default='./Processed-Sites')

    Returns
    -------
    sitedf : pandas.core.frame.DataFrame
        A pandas dataframe object with PHREEQC data from the requested site.

    """
    #Add USGS tag if needed
    if not(site.startswith('USGS-')):
        site = 'USGS-'+site
    try:
        phreeqcFile = os.path.join(processedSitesDir, site, site+'-PHREEQC.pkl')
        sitedf = read_pickle(phreeqcFile)
    except IOError:
        print ("Problem reading pickle file: " + phreeqcFile )
        return None
    return sitedf
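# Hedged usage sketch (not from the original project; the USGS site number is a
# placeholder): load one processed site's PHREEQC table and inspect its shape.
def _example_load_site():
    sitedf = loadSitePhreeqcData('01646500', processedSitesDir=DEFAULT_DIR)
    if sitedf is not None:
        print(sitedf.shape)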
def _collate_pnpn_inner(fname, mingenes, minsamples, minsamples_gene, outdir):
    grpname = split3way(fname)[1].replace('.pnpn','').split(':')[1]
    ret = defaultdict(dict)
    ret_g1 = defaultdict(dict)
    ret_g2 = defaultdict(dict)
    df = read_pickle(fname)
    for nm, ldf in df.groupby(level=1):
        keepinds = ldf.index.get_level_values(0).isin(\
                ((ldf.groupby(level=0).count() > 1).sum(1) >= minsamples_gene)\
                .replace(False, np.nan).dropna().index)
        ldf = ldf.loc[keepinds]
        ldf = ldf.loc[:,(ldf.groupby(level=0).count() > 1).sum(0) >= mingenes]
        if ldf.shape[1] <= minsamples:
            continue
        gs = ldf[['GeneSites']]
        ldf = ldf.drop('GeneSites', axis = 1)
        for col in ldf.columns:
            coldf = ldf[[col]].join(gs).dropna()
            coldf = coldf.groupby(level=2).sum()
            coldf = coldf[col].truediv(coldf['GeneSites'])
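            # coldf now holds per-site pN for variant groups G1 and G2; the collated
            # statistic below is the ratio pN(G1)/pN(G2) (see the header written in do_collate_pnpn).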
            ret_g1[(nm,grpname)][col] = coldf['G1']
            ret_g2[(nm,grpname)][col] = coldf['G2']
            ret[(nm,grpname)][col] = (coldf['G1']/coldf['G2']) if coldf['G2'] !=0 else np.nan
    outdf = DataFrame(ret)
    outdf_g1 = DataFrame(ret_g1)
    outdf_g2 = DataFrame(ret_g2)
    if outdf.shape != (0,0):
        outdf.to_pickle(join(outdir, grpname + '.tmp.df'))
    if outdf_g1.shape != (0,0):
        outdf_g1.to_pickle(join(outdir, grpname + '.tmp.g1.df'))
    if outdf_g2.shape != (0,0):
        outdf_g2.to_pickle(join(outdir, grpname + '.tmp.g2.df'))
Example #7
def get_measurements_TARA():
    tara_md = read_pickle(Biodata.TARA.metadataDF)
    tara_ixs = tara_md.groupby(level=0).first().index
    carbchem = readTARAxls(Biodata.TARA.CarbChemXL, Biodata.TARA.XLHeaderLine)

    nutrient = readTARAxls(Biodata.TARA.NutrientsXL, Biodata.TARA.XLHeaderLine)
    hplc = readTARAxls(Biodata.TARA.HPLCXL, Biodata.TARA.XLHeaderLine)
    sensors = read_csv(Biodata.TARA.DepthSensorCSV,
                       sep='\t',
                       header=Biodata.TARA.DepthSensorHeader)
    sensors = sensors.rename(columns = {sensors.columns[2]:'SampleID',
                                        sensors.columns[0]:'TARA_SampleID'})\
                     .drop(sensors.columns[1], axis = 1).set_index('SampleID')
    sensors.columns = [c.split(' (')[0] + ('_min' if '(minimum' in c \
                                           else '_25p' if '(lower quartile' in c \
                                           else '_median' if '(median' in c \
                                           else '_75p' if '(upper quartile' in c \
                                           else '_max' if '(maximum' in c \
                                           else '' if 'OXYGEN' in c and c.endswith(')') \
                                           else '!@#$%' if '(calculated' in c or '(Calculated' in c \
                                           else '') for c in sensors.columns]
    sensors = sensors[[c for c in sensors.columns if '!@#$%' not in c]]
    concat([sensors.loc[tara_ixs],
            nutrient.loc[tara_md.groupby(level=0).first().index].loc[tara_ixs].iloc[:,16:],
            carbchem.loc[tara_md.groupby(level=0).first().index].loc[tara_ixs].iloc[:,16:],
            hplc.loc[tara_md.groupby(level=0).first().index].loc[tara_ixs].iloc[:,16:]], axis=1)\
        .to_pickle(Biodata.TARA.SampleMeasurementsDF)
Example #8
def get_measurements_HOT():
    hotmd = read_pickle(Biodata.ALOHA_BATS.metadataDF)
    hotmd = hotmd[hotmd.Cruise_series == 'HOT']

    def read_ds(f):
        ds = open_dataset(f, decode_times=False)
        df = ds.to_dataframe().reset_index()
        if ds.time_coverage_start != ds.time_coverage_end:
            raise ValueError('time_coverage_start != time_coverage_end in {}'.format(f))
        dsdt = datetime.strptime(ds.time_coverage_start, '%Y-%m-%dT%H:%M:%SZ')
        if abs(hotmd.Collection_datetime - dsdt).min() > timedelta(days=2):
            return None
        df['TIME'] = dsdt
        return df

    ctd = concat(
        [read_ds(f) for f in glob(join(Biodata.ALOHA_BATS.ALOHACTD, '*.nc'))])
    ctd = ctd.rename(columns={'TIME': 'Collection_datetime'}).reset_index()
    water = concat([
        read_ds(f) for f in glob(join(Biodata.ALOHA_BATS.ALOHAWater, '*.nc'))
    ])
    water = water.rename(columns={'TIME': 'Collection_datetime'}).reset_index()
    ret = {}
    for nm, row in hotmd.iterrows():
        watrow = _getrowmd(water, row, 'DEPTH', False)
        ctdrow = _getrowmd(ctd, row, 'DEPTH', False)
        watrow.index = [i.replace('Orig', 'BotOrig') for i in watrow.index]
        ret[nm[0]] = concat([ctdrow, watrow[11:]])
    df = DataFrame({k: v for k, v in ret.items() if type(v) == Series}).T
    df = df[[c for c in df.columns if not c.endswith('_QC')]]
    df.replace('nan', np.nan).drop('index', axis=1).to_pickle(
        Biodata.ALOHA_BATS.ALOHASampleMeasurementsDF)
Example #9
def loadSitePhreeqcData(site, processedSitesDir=DEFAULT_DIR):
    """
    Retrieves site PHREEQC data for an individual site from a directory of processed sites.

    Parameters
    ----------
    site : string
        name of site to retrieve, with or without USGS- tag at beginning.

    processedSitesDir : string (optional)
        directory that contains the processed site directory associated with the desired site. It is important to change this if the default is not correct. (default='./Processed-Sites')

    Returns
    -------
    sitedf : pandas.core.frame.DataFrame
        A pandas dataframe object with PHREEQC data from the requested site.

    """
    #Add USGS tag if needed
    if not (site.startswith('USGS-')):
        site = 'USGS-' + site
    try:
        phreeqcFile = os.path.join(processedSitesDir, site,
                                   site + '-PHREEQC.pkl')
        sitedf = read_pickle(phreeqcFile)
    except IOError:
        print("Problem reading pickle file: " + phreeqcFile)
        return None
    return sitedf
Example #10
def prepare_and_merge_data():
    
    # Retrieves all dataframes and merges into a single dataframe
    # which is then pickled
    
    job_data = read_pickle('Job Data.pkl')
    company_data = read_pickle('LinkedIn Company Data.pkl')
    industry_data = read_pickle('LinkedIn Industry Data.pkl')
    speciality_data = read_pickle('LinkedIn Speciality Data.pkl')    
    
    # Add in derived data and fill in blank data
        
    job_data['post_year'] = job_data.date_posted.apply(get_year)    # Get date_posted year
    job_data['post_month'] = job_data.date_posted.apply(get_month)  # Get date_posted month
    job_data['desc_word_count'] = job_data.description.apply(get_word_count)    # Number of words in job description
    job_data['desc_char_count'] = job_data.description.apply(get_char_count)    # Number of characters in job description
    job_data['estimated_seniority_value'] = job_data.estimated_seniority.apply(get_est_seniority_value) # Convert estimated seniority to an integer
        
    company_data.loc[company_data.employee_count_code.isnull(), 'employee_count_code'] = 'D'    # '51-200'
    company_data.loc[company_data.company_type_code.isnull(), 'company_type_code'] = 'P'    # 'Privately Held'        
    company_data['employee_count_value'] = company_data.employee_count_code.apply(get_emply_count_value) # Convert employee count code to an integer
    company_data['company_type_value'] = company_data.company_type_code.apply(get_cmpny_type_value) # Convert company type code to an integer
    
    industry_data = pd.merge(industry_data, company_data[['lnkn_name']], how = 'right', on = 'lnkn_name')
    industry_data.loc[industry_data.industry_type_name.isnull(), 'industry_type_name'] = 'Unknown'
            
    # Converting the Industry and Speciality data into dataframes of frequencies
    # Only counting a subset of specialities as data science-y
    industry_group = industry_data[['lnkn_name', 'industry_type_name']].groupby(['lnkn_name', 'industry_type_name']).size().unstack('industry_type_name')        
    industry_group[industry_group.notnull()] = 1
    industry_group[industry_group.isnull()] = 0
        
    ds_specialities = ['Big Data', 'Analytics', 'Machine Learning', 'analytics', 'Data Science']
    ds_specialities.extend(['Big Data Analytics', 'Natural Language Processing', 'Predictive Analytics', 'Data Mining'])
    speciality_group = speciality_data[speciality_data.speciality.isin(ds_specialities)].groupby(['lnkn_name', 'speciality']).size().unstack('speciality')    
    speciality_group = pd.merge(speciality_group, company_data[['lnkn_name']], how = 'right', right_on = 'lnkn_name', left_index = True)   
    speciality_group.set_index('lnkn_name', inplace = True)
    speciality_group[speciality_group.notnull()] = 1
    speciality_group[speciality_group.isnull()] = 0
        
    # Merge the dataframes
    merge_data = pd.merge(job_data, company_data, on = 'name') 
    merge_data = pd.merge(merge_data, industry_group, left_on = 'lnkn_name', right_index = True)
    merge_data = pd.merge(merge_data, speciality_group, how = 'left', left_on = 'lnkn_name', right_index = True)
        
    merge_data.to_pickle('Clean Job Data.pkl')
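# Hedged ordering sketch (not in the original source): prepare_and_merge_data()
# writes 'Clean Job Data.pkl', which run_model() in a later example reads back,
# so the two steps are expected to run in this order.
def _example_job_pipeline():
    prepare_and_merge_data()
    run_model()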
def getLsiDict():
    externalFilePath = '..' + os.sep + "projectMidPoint" + os.sep + "tmp" + os.sep + "LsiModel" + os.sep + "mergeLsiData.dict"
    externalFile = Path(externalFilePath)
    if not externalFile.is_file():
        print("Running Will's LSI Model\n")
        os.chdir('..' + os.sep + "projectMidPoint" + os.sep)
        call(["python", "LsiModel.py"])
    dictionary = pickle.read_pickle(externalFilePath)
    return dictionary
Example #12
def do_one_unite(fname, grpnm, tmpdir):
    cod = read_pickle(fname)
    fcod = read_pickle(fname.replace('codons', 'percmut'))
    allcodons = get_relevant_codonpairs()

    def apply_counts(ldf):
        nm = ldf.index.get_level_values(0)[0]
        ldf.index = MultiIndex.from_tuples(ldf.index.get_level_values(1))
        ldf = ldf.dropna(how='all', axis=1)
        codldf = Series({
            i: (cod.loc[nm, i[0]] if i[0] in cod.loc[nm] else 0)
            for i in allcodons
        })
        ldf = codldf.to_frame('cod').join(ldf).drop('cod', axis=1)
        for col in ldf.columns:
            ldf[col + '_s'] = codldf
        return ldf.fillna(0)

    fcod = fcod.groupby(level=0).apply(apply_counts)
    fcod.groupby(level=[1, 2]).sum().to_pickle(
        join(tmpdir, grpnm + basename(fname).replace('codons', 'sums')))
def main():
    os.chdir(mkdirifnotexists(join(Calling.OM_RGC.CallDir, 'tmp')))
    bioG_m = read_pickle(Biodata.bioGEOTRACES.metadataDF)
    ALOHA_m = read_pickle(Biodata.ALOHA_BATS.metadataDF)
    TARA_m = read_pickle(Biodata.TARA.metadataDF)
    allbams = concat([TARA_m, ALOHA_m, bioG_m],
                     sort=False)[['ICRABAM_1', 'ICRABAM_2']]
    dirnames = sorted(list(set([ref[:-4] for ref \
                        in pysam.AlignmentFile(allbams.iloc[-1]['ICRABAM_1']).header.references])))
    # This is set to process 80 genes (each with all samples) at a time.
    # Increasing it will make everything run faster on an HPC system, but will
    # take up more memory and space for intermediate files
    dirnamegrps = [dirnames[i:i + 80] for i in range(0, len(dirnames), 80)]
    for reference_list in dirnamegrps:
        # TODO: IMPORTANT! Wrap the loops in the called method with your hpc job submission pipeline
        # Also IMPORTANT! Make sure each loop runs synchronously with the next (wait for one to
        # finish before you start the next)
        # Estimated total CPU time for this part >25,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
        do_references(allbams, Calling.OM_RGC.CallDir, reference_list,
                      Calling.OM_RGC.FilterThreshold, Calling.OM_RGC.DbFasta,
                      Calling.OM_RGC.mpileupParams, THREADS)
Example #14
    def from_pickle(path: Union[str, Path]) -> "Metropolis":
        """
        Load pickled Metropolis object from file.


        Returns
        -------
        [Metropolis]
            Deserialized Metropolis python object.
        """
        metro = read_pickle(path)
        metro.lattice._set_recursion_limit()
        return metro
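    # Hedged usage sketch (the path is a placeholder; from_pickle is presumably exposed
    # as a classmethod/staticmethod of Metropolis):
    #     metro = Metropolis.from_pickle('metropolis_run.pkl')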
Example #15
def demorinex(obsfn,maxchunk=None):
    #switchyard based on filename extension
    name,ext = splitext(obsfn)
    if ext[-1] == 'o':
        f = RINEXFile(obsfn,maxchunk)
        f.save_pickle(name + '.pickle')
        f.save_hdf5(name+'.h5') #this can crash some Python with incompatible PyTables/Pandas/HDF5 versions
        return f.data
    elif ext in ('.pkl','.pickle'):
        return read_pickle(expanduser(obsfn))
    elif ext in ('.h5','.hdf5'):
        print('not implemented yet')
        return None
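# Hedged usage sketch (filenames are placeholders): demorinex dispatches on the file
# extension, parsing a RINEX observation file (extension ending in 'o') and caching it,
# or reloading a previously written pickle.
def _example_demorinex():
    obs = demorinex('site0010.15o', maxchunk=None)   # parse RINEX, also writes .pickle/.h5
    cached = demorinex('site0010.pickle')            # reload the cached pickle
    return obs, cached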
def unite_measurements():
    biog = read_pickle(Biodata.bioGEOTRACES.SampleMeasurementsDF)\
            .rename(columns = {'DEPTH [m]':DEPTH, 'CTDTMP [deg C]':TEMPERATURE,
                               'CTDSAL':SALINITY, 'CTDOXY [umol/kg]':OXYGEN,
                               'PHOSPHATE_D_CONC_BOTTLE [umol/kg]':PHOSPHATE,
                               'SILICATE_D_CONC_BOTTLE [umol/kg]':SILICATE,
                               'NITRATE_D_CONC_BOTTLE [umol/kg]':NITRATE,
                               'NO2+NO3_D_CONC_BOTTLE [umol/kg]':NITRIRA})
    tara = read_pickle(Biodata.TARA.SampleMeasurementsDF)\
            .rename(columns = {'Depth, nominal':DEPTH, 'Temp [°C]_median':TEMPERATURE,
                               'Sal_median':SALINITY, 'OXYGEN [µmol/kg]':OXYGEN,
                               'Phosphate_median':PHOSPHATE,  #umol/l - https://doi.pangaea.de/10.1594/PANGAEA.839233
                               'Silicate_median':SILICATE, #umol/l
                               '[NO3]- [µmol/l]_median':NITRATE,
                               'Nitrate and Nitrite_median':NITRIRA}) #umol/l
    tara[SILICATE] = tara[SILICATE].astype(float)
    bats = read_pickle(Biodata.ALOHA_BATS.BATSSampleMeasurementsDF)\
            .rename(columns = {'Depth':DEPTH, 'Temperature [c]':TEMPERATURE,
                               'Salinity':SALINITY, 'Oxygen [umol/kg]':OXYGEN,
                               'Phosphate':PHOSPHATE, #umol/kg
                               'Silicate':SILICATE, #umol/kg
                               'Nitrate+Nitrite':NITRIRA}) #umol/kg
    bats[[DEPTH,TEMPERATURE,SALINITY,OXYGEN,PHOSPHATE,SILICATE,NITRIRA]] = \
            bats[[DEPTH,TEMPERATURE,SALINITY,OXYGEN,PHOSPHATE,SILICATE,NITRIRA]].astype(float)
    alha = read_pickle(Biodata.ALOHA_BATS.ALOHASampleMeasurementsDF)\
            .rename(columns = {'DEPTH':DEPTH, 'TEMP':TEMPERATURE,
                               'PSAL':SALINITY, 'DOXY1':OXYGEN,
                               'PO41':PHOSPHATE, #umol/kg
                               'SILC1':SILICATE, #umol/kg
                               'NO31':NITRATE}) #umol/kg
    tara[DEPTH] = tara[DEPTH].apply(lambda x: float(x)
                                    if '-' not in str(x) else np.nan)
    tara[[PHOSPHATE, SILICATE, NITRATE, NITRIRA]] = tara[[PHOSPHATE, SILICATE, NITRATE, NITRIRA]]\
            .astype(float)\
            .truediv(1 + tara['Sigma-theta [kg/m**3]_median']/1000, axis=0) #umol/l --> umol/kg
    concat([tara,bats,alha,biog], sort=False)\
            [[DEPTH,TEMPERATURE,SALINITY,OXYGEN,PHOSPHATE,SILICATE,NITRATE,NITRIRA]]\
            .to_pickle(Biodata.United.SampleMeasurementsDF)
Example #17
def demorinex(obsfn, maxchunk=None):
    #switchyard based on filename extension
    name, ext = splitext(obsfn)
    if ext[-1] == 'o':
        f = RINEXFile(obsfn, maxchunk)
        f.save_pickle(name + '.pickle')
        f.save_hdf5(
            name + '.h5'
        )  #this can crash some Python with incompatible PyTables/Pandas/HDF5 versions
        return f.data
    elif ext in ('.pkl', '.pickle'):
        return read_pickle(expanduser(obsfn))
    elif ext in ('.h5', '.hdf5'):
        print('not implemented yet')
        return None
def analyze_genes(genes_df_f, db_fasta, outdir, cachedir, min_pos_reads,
                  min_perc_poss, min_total_var_support, min_maf, min_samples,
                  min_variants):
    # This method potentially calculates all metrics (pN/pS, pi_within, etc.) for one gene file
    # out of approximately 2500, each containing up to 10,000 separate genes (usually 500-1000).
    # It iterates over all genes and calculates everything for each, then concatenates and
    # saves everything. The per-gene call is to 'analyze_one'.
    df = read_pickle(genes_df_f)
    pnpn_groups = create_all_pnpn_groups()
    log_.info('Analyzing file {} minpos {} minperc {}'.format(
        basename(genes_df_f), min_pos_reads, min_perc_poss))
    gene_seqs = _getgeneseqs(genes_df_f, db_fasta,
                             df.index.get_level_values(0).unique(), cachedir)
    sitess, pnpss, percmuts, ffdeg_pws, pnpns = [], [], [], [], []
    ffdeg_poss = {}
    for nm, genedf in df.groupby(level=0):
        sites, pnps, percmut, ffdeg_pw, pnpn = analyze_one(
            nm, genedf, min_pos_reads, min_perc_poss, min_total_var_support,
            min_maf, min_samples, min_variants, gene_seqs, pnpn_groups)
        if sites is not None:
            sitess.append(sites)
        if pnps is not None:
            pnpss.append(pnps)
        if percmut is not None:
            percmuts.append(percmut)
        if ffdeg_pw is not None:
            ffdeg_pws.append(ffdeg_pw[0])
            ffdeg_poss[nm] = ffdeg_pw[1]
        if pnpn is not None:
            pnpns.append(pnpn)
    baseout = basename(genes_df_f).replace('.df', '')
    if len(sitess) > 0 and any([s is not None for s in sitess]):
        concat(sitess, sort=False).to_pickle(join(outdir,
                                                  baseout + '.muts.df'))
    if len(pnpss) > 0 and any([p is not None for p in pnpss]):
        concat(pnpss, sort=False).to_pickle(join(outdir, baseout + '.pnps.df'))
    if len(percmuts) > 0:
        concat([p[0] for p in percmuts],
               sort=False).to_pickle(join(outdir, baseout + '.percmut.df'))
        concat([p[1] for p in percmuts],
               sort=False).to_pickle(join(outdir, baseout + '.codons.df'))
    if len(ffdeg_pws) > 0:
        concat(ffdeg_pws,
               sort=False).to_pickle(join(outdir,
                                          baseout + '.ffdeg_pi_wit.df'))
        Series(ffdeg_poss).to_pickle(join(outdir, baseout + '.ffdeg_poss.df'))
    if len(pnpns) > 0:
        concat(pnpns, sort=False).to_pickle(join(outdir, baseout + '.pnpn.df'))
Example #19
def million_codes():
    # This creates one million permutations of the genetic code
    aas, _, _ = get_codon_table()
    df = read_pickle(
        join(General.Basepath, 'All_4_60_mutation_codon_counts.df'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for i in range(100):
        codon_risk(df, aas, 'All_{:02d}'.format(i), True, subdir='Million')
    compiled_f = join(CodeAnalysis.CodonsDir, 'Codon_risk_compiled.dat')
    ret = defaultdict(list)
    for i, fn in enumerate(
            glob(join(CodeAnalysis.CodonsDir, 'Million', '*.dat'))):
        ret_l = Utils.Load(fn)
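        # Presumably the first entry of each risk list is the real code's value (cf. the
        # permutation p-values in multi_organism_analyze), so it is kept once from the
        # first file and only the permuted values are appended from subsequent files.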
        for var in ['n+_risk', 'c+_risk', 'o+_risk', 'hyd_risk', 'PR_risk']:
            ret[var].extend((ret_l[var] if i == 0 else ret_l[var][1:]))
        print(i)
    Utils.Write(compiled_f, ret)
    return compiled_f
def filter_db(dbdct, analysisclass, mingenes):
    indir = analysisclass.OutDir
    pnps_piwig_f = join(indir, 'PNPSPiWiGenes.dat')
    genes_in_use = []
    if exists(pnps_piwig_f):
        pnpsgenes = Utils.Load(pnps_piwig_f)
    else:
        for fname in glob(join(indir, '*pnps.df')) + glob(join(indir, '*pi_wit.df')):
            print(fname)
            genes_in_use.extend(list(read_pickle(fname).index.get_level_values(0).unique()))
        pnpsgenes = list(set(genes_in_use))
        Utils.Write(pnps_piwig_f, genes_in_use)
    ret = {}
    pnpsgenes = set(pnpsgenes)
    for k,v in dbdct.items():
        if len(set(v).intersection(pnpsgenes)) >= mingenes:
            ret[k] = v
    return(ret)
Example #21
def get_retweet_factors_impressions():
    # Use the Twitter search API to retrieve recent tweets that contain high retweet scoring reference data
    # (User Mentions and Hashtags) as well as Chase mentions. To be run after Twitter_Analysis.analyze_retweet_factors.

    twitter = get_twitter_conn()

    score_table = read_pickle('k_score_table.pkl')
    sorted_score_table = score_table[score_table.Type.isin(
        ['User Mention', 'Hashtag'])].sort_values(by='Score', ascending=False)

    # Retrieve names of top scoring reference data as well as anything that has 'chase' or 'sapphire' in the name
    top_score_table = sorted_score_table[sorted_score_table.Score > 1000.0]
    chase_tags = np.array([
        'chase' in name_val
        for name_val in sorted_score_table.Name.values.ravel()
    ])
    sapphire_tags = np.array([
        'sapphire' in name_val
        for name_val in sorted_score_table.Name.values.ravel()
    ])
    chase_score_table = sorted_score_table[chase_tags | sapphire_tags]

    table_array = [top_score_table, chase_score_table]
    retweet_tag_tweets = {}

    # Loop through each tag and retrieve the most recent 1000 results
    for data_table in table_array:
        for _, score_row in data_table.iterrows():
            tag_results = []
            results = twitter.cursor(twitter.search, q=score_row[0], count=100)

            n = 1
            for result in results:
                tag_results.append(result)
                check_twitter_timeout(twitter)
                n += 1
                if n > 1000:
                    break

            retweet_tag_tweets[score_row[0]] = tag_results

    pickle.dump(retweet_tag_tweets, open("retweet_tag_tweets.pkl", "wb"))
def do_one_group_pnpn(nm, f_prefix, genegroup, analysisclass, mingenes=3):
    ret = []
    for prefix in set([g[:-4] for g in genegroup]):
        f_in = join(analysisclass.OutDir, prefix + '.pnpn.df')
        if not exists(f_in):
            continue
        try:
            ldf = read_pickle(f_in)
            ldf = ldf.loc[[g for g in genegroup if g in ldf.index]] 
        except KeyError:
            continue
        if ldf.shape[0] > 0:
            ret.append(ldf)
    if len(ret) == 0:
        return
    outdf = concat(ret, sort = False)
    outpath = mkdirifnotexists(join(analysisclass.OutDirCollate, 'pnpn'))
    if outdf.groupby(level=0).first().shape[0] < mingenes:
        return
    outdf.to_pickle(join(outpath, f_prefix + '_' + nm + '.pnpn.df'))
def calc_codon_costs(out_f=None, force_rerun=False):
    if out_f is not None and exists(out_f) and not force_rerun:
        return read_pickle(out_f)
    aas, _, _ = get_codon_table()
    ret = {}
    for cod_1, aa1 in aas.to_dict().items():
        for cod_2, aa2 in aas.to_dict().items():
            ret[(cod_1, cod_2)] = {'aa_s': aa1, 'aa_e': aa2}
    aa_props = read_csv('./aa_NCHP.csv', index_col=0)
    ret = DataFrame(ret).T
    ret.index.names = ['Codon_s', 'Codon_e']
    codon_props = ret.join(aa_props, on='aa_s').join(aa_props,
                                                     on='aa_e',
                                                     lsuffix='_s',
                                                     rsuffix='_e')
    for v in ['C', 'N', 'hyd', 'PR']:
        codon_props[v + '_d'] = codon_props[v + '_e'] - codon_props[v + '_s']
        codon_props[v + '_abs_d'] = abs(codon_props[v + '_d'])
    if out_f is not None:
        codon_props.to_pickle(out_f)
    return codon_props
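# Hedged usage sketch (the output path is a placeholder): the first call builds and
# pickles the codon cost table (it needs ./aa_NCHP.csv to be present); a repeated
# call with force_rerun=False simply reloads the saved pickle.
def _example_codon_costs():
    calc_codon_costs(out_f='codon_costs.df')                             # compute and save
    return calc_codon_costs(out_f='codon_costs.df', force_rerun=False)   # reload from disk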
Example #24
def get_measurements_BATS():
    ctd = concat([read_csv(f, header=None, sep='\t') \
                  for f in glob(join(Biodata.ALOHA_BATS.BATSASCIIDir, '*ctd.txt'))],
                 sort = False).dropna().replace(-999, np.nan)
    ctd.columns = [
        'ID', 'Collection_datetime', 'Latitude', 'Longitude',
        'Pressure [dbar]', 'Depth', 'Temperature [c]', 'Conductivity [S/m]',
        'Salinity', 'Oxygen [umol/kg]', 'Beam Attenuation Coefficient [1/m]',
        'Flourescence', 'PAR [uE/m2/s]'
    ]

    def todatetime(x):
        year = int(x)
        rem = x - year
        basedt = datetime(year, 1, 1)
        return basedt + timedelta(seconds=(basedt.replace(year=year + 1) -
                                           basedt).total_seconds() * rem)

    ctd.Collection_datetime = ctd.Collection_datetime.apply(todatetime)
    bot = read_csv(Biodata.ALOHA_BATS.BATSBottle,
                   header=Biodata.ALOHA_BATS.BATSBottleHeader,
                   sep='\t').reset_index().replace(-999, np.nan)
    bot.columns = [
        'ID', 'yyyymmdd', 'Collection_datetime', 'time', 'Latitude',
        'Longitude', 'Depth', 'Temp', 'CTD_S', 'Sal1', 'Sig-th', 'O2_1',
        'OxFix', 'Anom1', 'CO2', 'Alk', 'Nitrate+Nitrite', 'Nitrite',
        'Phosphate', 'Silicate', 'POC', 'PON', 'TOC', 'TN', 'Bact', 'POP',
        'TDP', 'SRP', 'BSi', 'LSi', 'Pro', 'Syn', 'Piceu', 'Naneu'
    ]
    bot.Collection_datetime = bot.Collection_datetime.apply(todatetime)
    batsmd = read_pickle(Biodata.ALOHA_BATS.metadataDF)
    batsmd = batsmd[batsmd.Cruise_series == 'BATS']
    ret = {}
    for nm, row in batsmd.iterrows():
        botrow = _getrowmd(bot, row, 'Depth', False)
        ctdrow = _getrowmd(ctd, row, 'Depth', False)
        botrow.index = [i.replace('Orig', 'BotOrig') for i in botrow.index]
        ret[nm[0]] = concat([ctdrow, botrow[10:]])
    DataFrame({k:v for k,v in ret.items() if type(v) == Series}).T\
        .to_pickle(Biodata.ALOHA_BATS.BATSSampleMeasurementsDF)
Example #25
def codon_bias(outdir):
    # Data source: https://elifesciences.org/articles/41043
    df = read_pickle('./resource/Proc_strains_codons.df')
    thr = df.loc[['ACT', 'ACA', 'ACC', 'ACG']]
    thr = thr.truediv(thr.sum()).T
    ile = df.loc[['ATT', 'ATA', 'ATC']]
    ile = ile.truediv(ile.sum()).T
    print(wilcoxon(thr['ACT'], thr['ACA']))
    print(wilcoxon(thr['ACC'], thr['ACG']))
    print(wilcoxon(ile['ATT'], ile['ATA']))
    _, ax = plt.subplots(1, figsize=(8.2, 5.2), dpi=144)
    bp = ax.violinplot(concat([thr, ile], axis=1).T,
                       positions=[1, 2, 3, 4, 6, 7, 8])
    for partname in ('cbars', 'cmins', 'cmaxes'):
        vp = bp[partname]
        vp.set_edgecolor('k')
        vp.set_linewidth(1)

    [m.set_color('#0d4c7c') for m in bp['bodies'][:4]]
    [m.set_color('#891919') for m in bp['bodies'][-3:]]
    ax.set_ylim(0, 1.)
    ax.set_xticks(range(1, 9))
    plt.savefig(join(outdir, 'Codon_usage.png'), dpi=144)
Example #26
def parseeggnog():
    df_f = join(Annotate.OM_RGC.EggnogDir, 'OM-RGC_annotations.df')
    if exists(df_f):
        df = read_pickle(df_f)
    else:
        df = concat([read_csv(f, sep='\t', header=None) \
                     for f in sorted(glob(join(Annotate.OM_RGC.AnnotDir, '*.annotations')))])
        df.columns = [
            'GeneID', 'seed_eggNOG_ortholog', 'seed_ortholog_evalue',
            'seed_ortholog_score', 'Predicted_taxonomic_group',
            'Predicted_protein_name', 'GeneOntology', 'EC_number', 'KEGG_ko',
            'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass',
            'BRITE', 'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'tax_scope',
            'eggNOG_OGs', 'bestOG', 'COG_Functional_Category',
            'eggNOG_description'
        ]
        df = df.set_index('GeneID')
        df.to_pickle(df_f)

    for col in df.columns[3:19]:
        if col == 'bestOG':
            continue
        if exists(join(Annotate.OM_RGC.EggnogDir, col + '.dat')):
            continue
        ret = defaultdict(list)
        locser = df[col].dropna()
        for nm in locser.index:
            if col == 'eggNOG_OGs':
                terms = set([x.split('@')[0] for x in locser[nm].split(',')])
            elif col == 'COG_Functional_Category':
                terms = set([x for x in locser[nm]])
            else:
                terms = set(locser[nm].split(','))
            for term in terms:
                ret[term].append(nm)
        Write(join(Annotate.OM_RGC.EggnogDir, col + '.dat'), ret)
        print(col)
Example #27
def multi_organism_analyze():
    # This replicates the analysis presented in Fig. 4
    # Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5581930/
    codons_all = read_pickle('./resource/ModelOrganisms.df').set_index('Taxid')
    # Take only the organisms with more than 50K codons in the calculation
    codons_all = codons_all.loc[codons_all.iloc[:, 11:].sum(1) >= 50000]
    aas, _, _ = get_codon_table()
    # Create alternative codes for each organism and transition-transversion rate
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for taxid, row in codons_all.iterrows():
        codons = row[11:].astype(float)
        for titv in [0.2, 0.25, 0.333, 0.5, 0.667, 1, 1.5, 2, 3, 4, 5]:
            ti = (2 * titv) / (1 + 2 * titv)
            codon_risk(None,
                       aas,
                       'Tax_{}_Rate_{}'.format(taxid, titv),
                       all_mutations=False,
                       external_counts=codons,
                       external_titv=(ti, 1 - ti),
                       subdir='MultiOrg')
    # Collate the results in one table
    proc_stats = {}
    for fnm in glob(join(CodeAnalysis.CodonsDir, 'MultiOrg/*.dat')):
        tax = float(basename(fnm).split('Tax_')[-1].split('_')[0])
        rate = float(basename(fnm).split('Rate_')[-1].split('_')[0])
        ret = Utils.Load(fnm)
        npr = np.array(ret['n+_risk'])
        cpr = np.array(ret['c+_risk'])
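        # Empirical p-values: index 0 is the real genetic code's risk, the remaining
        # entries are permuted codes; the divisor assumes 10,000 permutations per file.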
        proc_stats[(tax, rate)] = {
            'cpr_p': sum(cpr[1:] <= cpr[0]) / 10000.,
            'npr_p': sum(npr[1:] <= npr[0]) / 10000.,
            'ncpr_p': sum((cpr[1:] <= cpr[0]) & (npr[1:] <= npr[0])),
            'cpr': cpr[0],
            'npr': npr[0]
        }
    DataFrame(proc_stats).to_pickle(
        join(CodeAnalysis.CodonsDir, 'MultiOrg_rates.df'))
Example #28
def get_measurements_GEOTRACES(depth_tolerance=None):
    sample_md = read_pickle(Biodata.bioGEOTRACES.metadataDF)
    disc_df = read_csv(Biodata.bioGEOTRACES.DiscreteSampleTXT, sep='\t')
    disc_df = disc_df.rename(
        columns={'yyyy-mm-ddThh:mm:ss.sss': 'Collection_datetime'})
    disc_df.Collection_datetime = disc_df.Collection_datetime\
                                         .apply(lambda x:datetime.strptime(x,'%Y-%m-%dT%H:%M:%S'))
    disc_df = disc_df[disc_df.Collection_datetime > datetime(2000, 1, 1)]
    ret = {}
    for nm, row in sample_md.iterrows():
        d_row = _getrowmd(disc_df, row, depth_tolerance=depth_tolerance)
        if depth_tolerance is not None:
            if type(d_row) == Series:
                ret[(*nm, d_row.name)] = d_row[[i for i in d_row.index \
                        if (not i.startswith('QV') and (not i.startswith('STANDARD_DEV')))]]
            else:
                #if more than one date, take closest
                datediffs = (d_row.Collection_datetime.astype(np.datetime64) -
                             d_row.Orig_datetime).apply(np.abs)
                d_row = d_row.loc[datediffs[datediffs ==
                                            datediffs.min()].index]
                for rownm, row in d_row.iterrows():
                    ret[(*nm, rownm)] = row[[i for i in row.index \
                        if (not i.startswith('QV') and (not i.startswith('STANDARD_DEV')))]]
        else:
            ret[nm] = d_row[[i for i in d_row.index \
                            if (not i.startswith('QV') and (not i.startswith('STANDARD_DEV')))]]
    if depth_tolerance is None:
        DataFrame(ret).dropna(how='all').T.groupby(level=0).first()\
            .to_pickle(Biodata.bioGEOTRACES.SampleMeasurementsDF)
    else:
        df = DataFrame(ret).dropna(how='all').T.reset_index()\
            .rename(columns={'level_0':'SampleID','level_2':'MeasurementID'})\
            .set_index(['SampleID','MeasurementID']).drop('level_1',axis=1)
        df.to_pickle(Biodata.bioGEOTRACES.SampleMeasurementsDF\
                     .replace('.df','.tol_{}.df'.format(depth_tolerance)))
def do_collate(f_prefixes, minpos, minperc, mingenes, minsamples, minsamples_gene):
    ret = defaultdict(dict)
    ps = defaultdict(dict)
    pn = defaultdict(dict)
    for fname in glob(f_prefixes + '*.pnps.df'):
        grpname = split3way(fname)[1].replace('.pnps','')
        df = read_pickle(fname)
        keepinds = df.index.get_level_values(0).isin(\
                ((df.groupby(level=0).count() > 1).sum(1) >= minsamples_gene)\
                .replace(False, np.nan).dropna().index)
        df = df.loc[keepinds]
        df = df.loc[:,(df.groupby(level=0).count() > 1).sum(0) >= mingenes]
        if df.shape[1] <= minsamples:
            continue
        gs = df[['GeneSites']]
        df = df.drop('GeneSites', axis = 1)
        for col in df.columns:
            coldf = df[[col]].join(gs).dropna()
            coldf = coldf.groupby(level=1).sum()
            coldf = coldf[col].truediv(coldf['GeneSites'])
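            # coldf now holds per-site rates for the NS and S classes; pN/pS is their ratio.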
            ret[grpname][col] = coldf.NS/coldf.S
            pn[grpname][col] = coldf.NS
            ps[grpname][col] = coldf.S
        print(grpname)
    df = DataFrame(ret)
    ps = DataFrame(ps)
    pn = DataFrame(pn)
    df.to_csv(join(General.Basepath, '{}_{}_{}_{}_{}_{}.csv'\
                                  .format(f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
    pn.to_csv(join(General.Basepath, '{}_{}_{}_{}_{}_{}.pn.csv'\
                                  .format(f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
    ps.to_csv(join(General.Basepath, '{}_{}_{}_{}_{}_{}.ps.csv'\
                                  .format(f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
Example #30
 def load(self, path):  # TODO remove in 0.13
     import warnings
     from pandas.io.pickle import read_pickle
     warnings.warn("load is deprecated, use pd.read_pickle", FutureWarning)
     return read_pickle(path)
Example #32
def run_model():    
    # Run the models against the data and write the results to a spreadsheet
    
    # Retrieve the data that we processed in prepare_and_merge_data()
    clean_job_data = read_pickle('Clean Job Data.pkl')
    clean_job_data = clean_job_data[clean_job_data.estimated_salary > 0.0]
        
    # These are the columns that we don't need
    drop_cols = ['company_id', 'date_posted', 'description', 'estimated_salary', 'expiration_date']
    drop_cols.extend(['job_id', 'pay_rate', 'position', 'skill_count', 'source_uri', 'estimated_seniority'])
    drop_cols.extend(['name', 'company_industry', 'company_type', 'is_public', 'number_of_employees', 'status_name'])
    drop_cols.extend(['status_code', 'lnkn_description', 'websiteUrl', 'employee_count_code', 'lnkn_universal_name'])
    drop_cols.extend(['company_type_name', 'lnkn_name', 'employee_count_name', 'company_type_code', 'clean_pay_rate_annualized'])
    
    # Use records that have the pay rate provided in the job post - this is a small set
    pay_rate_data = clean_job_data[clean_job_data.clean_pay_rate_annualized.notnull()]
    pay_cols = get_clean_column_names(pay_rate_data, drop_cols)
    print 'Number of Clean Pay Rate Records: {}'.format(len(pay_rate_data))                 
    x1 = pay_rate_data[pay_cols].astype(int)
    y1 = pay_rate_data.clean_pay_rate_annualized
    _, _, y1_train, y1_test = get_train_test_sets(x1, y1, True)
    print '{} Training Records / {} Testing Records'.format(y1_train.size, y1_test.size)
    
    # Use records that have an estimated salary, which we will round to nearest 1k
    print 'Number of Estimated Salary Records: {}'.format(len(clean_job_data))
    est_sal_cols = get_clean_column_names(clean_job_data, drop_cols)
    x2 = clean_job_data[est_sal_cols].astype(int)
    y2 = clean_job_data.estimated_salary.apply(round_to_thousands) 
    _, _, y2_train, y2_test = get_train_test_sets(x2, y2, False)
    print '{} Training Records / {} Testing Records'.format(y2_train.size, y2_test.size)
    
    # Different approach - groups salaries in amounts of 10k and see if we can get better results
    y3 = pay_rate_data.clean_pay_rate_annualized.apply(convert_to_salary_range) # Convert Pay Rate to a range
    y4 = clean_job_data.estimated_salary.apply(convert_to_salary_range) # Convert Est Salary to a range
    
    # Transform the independent variables using PCA to see if that helps on some of the models
    pca = PCA().set_params(n_components = 0.9)     
    x3 = normalize_and_apply_pca(x1, pca)
    x4 = normalize_and_apply_pca(x2, pca)
                
    results_book = xlwt.Workbook()    
    head_style = get_header_style()
    
    pyrt_sh = results_book.add_sheet('Pay Rate')
    pyrt_sh.write(0, 0, "Model Name", head_style)
    pyrt_sh.write(0, 1, "Dataset", head_style)
    pyrt_sh.write(0, 2, "Training Score", head_style)
    pyrt_sh.write(0, 3, "Testing Score", head_style)
    pyrt_sh.write(0, 4, "Training MSE", head_style)
    pyrt_sh.write(0, 5, "Testing MSE", head_style)
    pyrt_sh.write(0, 6, "Best K", head_style)
    pyrt_sh.write(0, 7, "Best Parameters", head_style)
    
    estsal_sh = results_book.add_sheet('Est Salary')
    estsal_sh.write(0, 0, "Model Name", head_style)
    estsal_sh.write(0, 1, "Dataset", head_style)
    estsal_sh.write(0, 2, "Training Score", head_style)
    estsal_sh.write(0, 3, "Testing Score", head_style)
    estsal_sh.write(0, 4, "Training MSE", head_style)
    estsal_sh.write(0, 5, "Testing MSE", head_style)
    estsal_sh.write(0, 6, "Best K", head_style)
    estsal_sh.write(0, 7, "Best Parameters", head_style)    
    
    # Do an initial test using linear models with different shapes for the dependent variable
    linear_datasets = [("Pay Rate", x1, y1, True),
                ("Log Pay Rate", x1, np.log(y1), True),
                ("Sqrt Pay Rate", x1, np.sqrt(y1), True),
                ("Est Salary", x2, y2, False),
                ("Log Est Salary", x2, np.log(y2), False),
                ("Sqrt Est Salary", x2, np.sqrt(y2), False),
                ("Pay Rate Range", x1, y3, True),
                ("Log Pay Rate Range", x1, np.log(y3), True),
                ("Sqrt Pay Rate Range", x1, np.sqrt(y3), True),
                ("Est Salary Range", x2, y4, False),
                ("Log Est Salary Range", x2, np.log(y4), False),
                ("Sqrt Est Salary Range", x2, np.sqrt(y4), False)                
                ]
         
    linear_models = [("OLS", linear_model.LinearRegression()),
              ("Ridge", linear_model.RidgeCV(normalize = True, fit_intercept = False, scoring = 'mean_squared_error', cv = 5)),
              ("Lasso", linear_model.LassoCV(normalize = True, fit_intercept = False, cv = 5))]
    
    prow = 1
    erow = 1
    for data in linear_datasets:
        x_train, x_test, y_train, y_test = get_train_test_sets(data[1], data[2], data[3])
          
        for model in linear_models:            
            train_score, test_score, train_mse, test_mse = get_model_values(model[1], x_train, y_train, x_test, y_test)
            
            if data[3] == True:
                prow = write_to_spreadsheet(model[0], data[0], train_score, test_score, train_mse, test_mse, None, None, pyrt_sh, prow)
            else:
                erow = write_to_spreadsheet(model[0], data[0], train_score, test_score, train_mse, test_mse, None, None, estsal_sh, erow)
       
    # Test on a different set of models, where we're applying PCA to reduce the number of features        
    datasets = [("Pay Rate", x1, y1, True),
                ("PCA Pay Rate", x3, y1, True),
                ("Pay Rate Range", x1, y3, True),
                ("PCA Pay Rate Range", x3, y3, True),
                ("Est Salary", x2, y2, False),
                ("PCA Est Salary", x4, y2, False),
                ("Est Salary Range", x2, y4, False),
                ("PCA Est Salary Range", x4, y4, False)             
                ]
    
    models = [("KNN", neighbors.KNeighborsClassifier(), {'n_neighbors' : np.arange(3, 9), 'weights' : ['uniform', 'distance'], 'p' : [1, 2]}),
              ("Decision Tree", tree.DecisionTreeClassifier(), {'criterion' : ['gini', 'entropy'], 'max_features' : [None, 'auto', 'log2']}),
              ("Random Forest", ensemble.RandomForestClassifier(), {'criterion': ['gini', 'entropy'], 'max_features' : [None, 'auto', 'log2'], 'n_estimators': np.arange(10, 110, 10)})
              ]
    
    for data in datasets:         
        x_train, x_test, y_train, y_test = get_train_test_sets(data[1], data[2], data[3])    
     
        for model in models:
            _, best_params, train_score, test_score, train_mse, test_mse = get_grid_search_values(model[1], model[2], x_train, y_train, x_test, y_test, 'accuracy')                 
            
            if data[3] == True:
                prow = write_to_spreadsheet(model[0], data[0], train_score, test_score, train_mse, test_mse, None, best_params, pyrt_sh, prow)
            else:
                erow = write_to_spreadsheet(model[0], data[0], train_score, test_score, train_mse, test_mse, None, best_params, estsal_sh, erow)            

    # Use the best K on LDA - had collinearity issues with full feature set              
    datasets = [("Pay Rate Range Best K", x1, y3.values.ravel(), True),
                ("Est Salary Range Best K", x2, y4.values.ravel(), False)              
                ]
    
    models = [("LDA", lda.LDA())]
    
    for data in datasets:         
        for model in models: 
            best_k, train_score, test_score, train_mse, test_mse = get_best_k_model(model[1], 20, data[1], data[2])                            
            
            if data[3] == True:
                prow = write_to_spreadsheet(model[0], data[0], train_score, test_score, train_mse, test_mse, best_k, None, pyrt_sh, prow)
            else:
                erow = write_to_spreadsheet(model[0], data[0], train_score, test_score, train_mse, test_mse, best_k, None, estsal_sh, erow)            
                        
    results_book.save("Model Results.xls")         
Example #33
def update_company_data_from_linkedin():
    
    # Retrieves all of the company names from the job postings,
    # and queries LinkedIn for additional information
    
    # Define CONSUMER_KEY, CONSUMER_SECRET,  
    # USER_TOKEN, and USER_SECRET from the credentials 
    # provided in your LinkedIn application
    
    # Instantiate the developer authentication class
    
    authentication = linkedin.LinkedInDeveloperAuthentication(LINKEDIN_CONSUMER_KEY, LINKEDIN_CONSUMER_SECRET, 
                                                              LINKEDIN_OAUTH_USER_TOKEN, LINKEDIN_OAUTH_USER_SECRET, 
                                                              RETURN_URL, linkedin.PERMISSIONS.enums.values())
    
    # Pass it in to the app...
    
    application = linkedin.LinkedInApplication(authentication)    
    
    job_data = read_pickle('Job Data.pkl')
    company_list = np.unique(job_data.name.values.ravel())
        
    # Set dict of return values and inputs
    comp_sels = [{'companies': ['name', 'universal-name', 'description', 'company-type', 'industries', 'status', 'employee-count-range', 'specialties', 'website-url']}]
    comp_params = {'keywords' : None}
    
    # Data dictionaries - going to convert them into Pandas dataframes
    linkedin_companies = {}
    linkedin_industries = {}
    linkedin_specialities = {}
    
    # Loop through the unique set of companies
    for idx, comp_name in enumerate(company_list):
        comp_params['keywords'] = comp_name # Set company name as keyword       
        comp_vals = application.search_company(selectors = comp_sels, params = comp_params)
        
        if comp_vals['companies']['_total'] == 0:   # No results returned
            continue
        
        # Calculate the edit distance between the returned results and the input name
        dist_vals = []        
        for jdx, company in enumerate(comp_vals['companies']['values']):
            link_comp_name = company['name']
            name_dist = fuzzy_match(comp_name, link_comp_name)
            dist_vals.append([link_comp_name, name_dist, jdx])
            
        # Sort the values and choose the best one
        sort_dist_vals = sorted(dist_vals, key=lambda s: s[1])
        best_guess_company = comp_vals['companies']['values'][sort_dist_vals[0][2]]
        best_guess_name = sort_dist_vals[0][0]
        
        status_code, status_name = get_lnkin_code_name(best_guess_company, 'status')
        company_type_code, company_type_name = get_lnkin_code_name(best_guess_company, 'companyType')
        employee_count_code, employee_count_name = get_lnkin_code_name(best_guess_company, 'employeeCountRange')
        
        # Store company related data in a dictionary
        linkedin_company = {}
        linkedin_company['name'] = comp_name        
        linkedin_company['lnkn_name'] = best_guess_name        
        linkedin_company['lnkn_universal_name'] = best_guess_company.get('universalName')
        linkedin_company['lnkn_description'] = best_guess_company.get('description')
        linkedin_company['status_code'] = status_code
        linkedin_company['status_name'] = status_name
        linkedin_company['company_type_code'] = company_type_code
        linkedin_company['company_type_name'] = company_type_name
        linkedin_company['employee_count_code'] = employee_count_code
        linkedin_company['employee_count_name'] = employee_count_name
        linkedin_company['websiteUrl'] = best_guess_company.get('websiteUrl')                
        linkedin_companies[idx] = linkedin_company
                        
        # Store industry data in a separate dict
        if 'industries' in best_guess_company:
            if best_guess_company['industries']['_total'] > 0:
                ind_start = len(linkedin_industries)
                for jdx, industry in enumerate(best_guess_company['industries']['values']):
                    linkedin_industry = {}
                    linkedin_industry['lnkn_name'] = best_guess_name
                    linkedin_industry['industry_type_code'] = industry['code']
                    linkedin_industry['industry_type_name'] = industry['name']
                    linkedin_industries[ind_start + jdx] = linkedin_industry
                
        # Store speciality data in a separate dict
        if 'specialties' in best_guess_company:
            if best_guess_company['specialties']['_total'] > 0:
                spec_start = len(linkedin_specialities)
                for jdx, speciality in enumerate(best_guess_company['specialties']['values']):
                    linkedin_speciality = {}
                    linkedin_speciality['lnkn_name'] = best_guess_name
                    linkedin_speciality['speciality'] = speciality
                    linkedin_specialities[spec_start + jdx] = linkedin_speciality                
Example #34
def test_ds_words():
    
    # Reads the description and job position text from a file of job posts,
    # tokenizing the results to view top n words, bigrams, and trigrams. Results
    # are written to a text file. 
   
    job_data = read_pickle('Job Data.pkl')
    
    # Tokenize the job text
    summ_words = []
    desr_words = []    
    for _, row in job_data.iterrows():
        if row['position'] is not None:
            summ_words += get_word_tokenize(row['position'])    
        if row['description'] is not None:
            desr_words += get_word_tokenize(row['description']) 
      
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords += [',', '.', ':', '(', ')', '-', ';', '&', '!', '?', '\'s']
    
    words_file = open("top_N_words.log","w")
      
    # Get the Top N words
    top_n_summ_words = get_top_n_words(summ_words, 10, stopwords)
    top_n_desr_words = get_top_n_words(desr_words, 50, stopwords) 
    
    print >> words_file, 'Top 10 Job Position Words\n'

    for top_word in top_n_summ_words:
        print >> words_file, top_word

    print >> words_file, '\n\n'

    print >> words_file, 'Top 50 Job Description Words\n'

    for top_word in top_n_desr_words:
        print >> words_file, top_word

    min_hits = int(len(job_data) * 0.05)

    print >> words_file, 'Top 50 Job Description Bigrams\n'

    # Get the Bigrams
    big_2_words = get_ngrams(2, desr_words, min_hits, 50, stopwords)

    print >> words_file, '\n\n'

    for top_word in big_2_words:
        print >> words_file, ' '.join(top_word)

    print >> words_file, '\n\n'

    print >> words_file, 'Top 50 Job Description Trigrams\n'

    # Get the Trigrams
    big_3_words = get_ngrams(3, desr_words, min_hits, 50, stopwords)

    for top_word in big_3_words:
        print >> words_file, ' '.join(top_word)
          
    words_file.close()