class OM_RGC:
    IndexFasta = Mapping.OM_RGC.IndexFasta
    IndexFaa = IndexFasta.replace('fasta', 'faa')
    SplitDir = mkdirifnotexists(join(Mapping.OM_RGC.IndexDir, 'split'))
    AnnotDir = mkdirifnotexists(join(Mapping.OM_RGC.IndexDir, 'annot'))
    EggnogDir = mkdirifnotexists(join(Mapping.OM_RGC.IndexDir, 'eggnog'))
    # TODO: change these to the directory and python file of eggnog-mapper
    EmapperDir = '~/eggnog/eggnog-mapper'
    EmapperPy = join(EmapperDir, 'emapper.py')
    # TODO: change this to the python 2.7 executable path
    Py27 = '~/anaconda3/envs/py27/bin/python'
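# mkdirifnotexists is used throughout this configuration but defined elsewhere in the
# repository. A minimal sketch of what it presumably does (an assumption based on how
# it is called, not the repository's verbatim helper): create the directory if needed
# and return the path, so it can be used inline in assignments.
import os


def mkdirifnotexists(path):
    os.makedirs(path, exist_ok=True)  # no-op if the directory already exists
    return path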
class General:
    # TODO: replace with filesystem base (the project requires ~35TB of storage for all intermediates)
    Scratch = '~'
    # TODO: replace with output path for finished dataframes
    Basepath = mkdirifnotexists(join(Scratch, 'Analyses/2019-Oceans/DFOut'))
    # TODO: replace with output path for intermediates
    Tmppath = mkdirifnotexists(join(Scratch, 'Analyses/2019-Oceans/tmp'))
    # TODO: replace with where you place your data, i.e., samples and databases
    Data = join(Scratch, 'Data')
    # TODO: replace with where you place your samples (e.g. Tara)
    Samples = join(Data, 'Samples')
    # TODO: replace with where you place your sample metadata
    Metadata = join(Data, 'Metadata')
    # TODO: replace with where you place your databases (e.g. OM-RGC)
    Databases = join(Data, 'Databases')
def do_one_group_ffdeg_piwit(nm, f_prefix, genegroup, analysisclass, mingenes=3):
    ret = []
    retlens = []
    for prefix in set([g[:-4] for g in genegroup]):
        f_in = join(analysisclass.OutDir, prefix + '.ffdeg_pi_wit.df')
        if not exists(f_in):
            continue
        try:
            ldf = read_pickle(f_in)
            ldf = ldf.loc[[g for g in genegroup if g in ldf.index]]
            llensdf = read_pickle(f_in.replace('_pi_wit', '_poss'))
            llensdf = llensdf.loc[[g for g in genegroup if g in llensdf.index]]
        except KeyError:
            continue
        if ldf.shape[0] > 0:
            ret.append(ldf)
            retlens.append(llensdf)
    if len(ret) == 0:
        return
    outdf = concat(ret, sort=False).dropna(how='all').dropna(how='all', axis=1)
    if outdf.shape[0] < mingenes:
        return
    outdf_lens = concat(retlens)
    outdf_lens.name = 'Length'
    ret = {}
    for col in outdf:
        coldf = outdf[[col]].multiply(outdf_lens, axis=0).join(outdf_lens).dropna().sum()
        ret[col] = {'pi': coldf[col] / coldf['Length'],
                    'length': coldf['Length'],
                    'num_genes': len(outdf[col].dropna())}
    outpath = mkdirifnotexists(join(analysisclass.OutDirCollate, 'ffdeg'))
    DataFrame(ret).to_pickle(join(outpath, f_prefix + '_' + nm + '.ffdeg_pi_wit.df'))
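# A toy sketch (hypothetical numbers, not pipeline output) of the aggregation above:
# the per-sample pi of a gene group is the length-weighted mean of per-gene pi,
#     pi_group = sum(pi_g * len_g) / sum(len_g),
# so a 900-bp gene influences the group value three times as much as a 300-bp gene.
def _toy_group_pi():
    genes = {'gene_a': (0.010, 900), 'gene_b': (0.040, 300)}  # gene -> (pi, length)
    weighted = sum(pi * length for pi, length in genes.values())
    total_len = sum(length for _, length in genes.values())
    return weighted / total_len  # 0.0175, closer to the longer gene's pi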
def do_collate_pnpn(f_prefixes, minpos, minperc, mingenes, minsamples, minsamples_gene):
    tmpdir = mkdirifnotexists(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 'tmpfiles'))
    for fname in glob(f_prefixes + '*.pnpn.df'):
        _collate_pnpn_inner(fname, mingenes, minsamples, minsamples_gene, tmpdir)
    ret = []
    ret_g1 = []
    ret_g2 = []
    for fname in glob(join(tmpdir, '*.tmp.df')):
        ret.append(read_pickle(fname).T)
    for fname in glob(join(tmpdir, '*.tmp.g1.df')):
        ret_g1.append(read_pickle(fname).T)
    for fname in glob(join(tmpdir, '*.tmp.g2.df')):
        ret_g2.append(read_pickle(fname).T)
    outdir = join(SNP.OM_RGC.OutDirCollate, 'pnpn')
    with open(join(outdir, 'pNpNCases.txt'), 'w') as ftxt:
        ftxt.write('Conditions for pN groups in this analysis\n')
        ftxt.write('Always pN(G1)/pN(G2) so invert if G1 is more conservative\n\n')
        bigdf = concat(ret, sort=False)
        bigg1 = concat(ret_g1, sort=False)
        bigg2 = concat(ret_g2, sort=False)
        for j, col in enumerate(bigdf.index.get_level_values(0).unique()):
            ftxt.write('Case {}: {}\n'.format(j, col))
            bigdf.loc[col].T.to_csv(join(outdir, 'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.csv'
                .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes,
                        minsamples, minsamples_gene)))
            bigg1.loc[col].T.to_csv(join(outdir, 'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.g1.csv'
                .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes,
                        minsamples, minsamples_gene)))
            bigg2.loc[col].T.to_csv(join(outdir, 'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.g2.csv'
                .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes,
                        minsamples, minsamples_gene)))
class OM_RGC:
    InputDir = Calling.OM_RGC.CallDir
    GeneDFDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_GeneDFs'))
    OutDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_SNPAnalysis'))
    OutDirCollate = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Collate'))
    OutDirCodons = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Codons'))
    # TODO: replace this with the path where you ran eggNOG-mapper on OM-RGC
    eggNOGMapper_outpath = join(Mapping.OM_RGC.IndexDir, 'eggnog')
    GeneGroupCollateDBs = [join(eggNOGMapper_outpath, d) for d in
                           ['KEGG_ko.dat', 'eggNOG_OGs.dat']]
    CacheDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Cache'))
    GeneLengths = join(Mapping.OM_RGC.IndexDir, 'OM-RGC_seq.lengths')
    OnlySNPs = True         # Only SNPs, or indels as well
    QualThresh = 30         # Compound quality threshold
    MinSamples = 20         # Minimum number of samples
    MinVariants = 1         # Minimum number of variants per gene
    MinTotalVarSupport = 3  # Minimum total support for a variant call
    MinMaf = 0.01           # Minimum minor allele frequency
    MinPosReads = 4         # Minimum read coverage per position to call a SNP
    MinPercPoss = 60        # Minimum percent of positions complying with the above coverage
    MinGenes = 5            # Minimum number of genes per KEGG KO / eggNOG OG
def do_one_group_pnpn(nm, f_prefix, genegroup, analysisclass, mingenes=3):
    ret = []
    for prefix in set([g[:-4] for g in genegroup]):
        f_in = join(analysisclass.OutDir, prefix + '.pnpn.df')
        if not exists(f_in):
            continue
        try:
            ldf = read_pickle(f_in)
            ldf = ldf.loc[[g for g in genegroup if g in ldf.index]]
        except KeyError:
            continue
        if ldf.shape[0] > 0:
            ret.append(ldf)
    if len(ret) == 0:
        return
    outdf = concat(ret, sort=False)
    outpath = mkdirifnotexists(join(analysisclass.OutDirCollate, 'pnpn'))
    if outdf.groupby(level=0).first().shape[0] < mingenes:
        return
    outdf.to_pickle(join(outpath, f_prefix + '_' + nm + '.pnpn.df'))
class OM_RGC:
    IndexDir = join(General.Databases, 'OM-RGC')
    IndexFile = join(IndexDir, 'OM-RGC')
    IndexFasta = join(IndexDir, 'OM-RGC_seq.fasta')
    LengthsFile = join(IndexDir, 'OM-RGC_seq.lens')
    MapDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Mapping'))
    MapParams = dict(preset=MapPreset.SENSITIVE, report_alns=20, minins=0,
                     maxins=500, no_mixed=False, no_discordant=False,
                     dovetail=False, no_contain=False, no_overlap=False)
    ICRAParams = dict(max_mismatch=12, consider_lengths=True, epsilon=1e-6,
                      max_iterations=30, min_bins=4, max_bins=100, min_reads=10,
                      dense_region_coverage=60, length_minimum=300,
                      length_maximum=2e5, use_theta=False,
                      average_read_length=None, force_save_delta=True)
    ICRAUsage = OCEAN_GENES
    RemoveUnmapped = True
    RemoveNotDelta = False
    DeltaThresh = 0.999
    DeletePMP = True
    DeleteOldBAM = True
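# The MapParams keys above mirror bowtie2 paired-end options (-I/-X for insert size,
# --no-mixed, --no-discordant, --dovetail, --no-contain, --no-overlap, -k for the
# number of reported alignments). A hedged sketch of that translation; the actual
# mapping to flags is done by the repository's mapper wrapper, not by this function:
def _bowtie2_args(p):
    args = ['--sensitive', '-k', str(p['report_alns']),
            '-I', str(p['minins']), '-X', str(p['maxins'])]
    for flag in ('no_mixed', 'no_discordant', 'dovetail', 'no_contain', 'no_overlap'):
        if p[flag]:
            args.append('--' + flag.replace('_', '-'))
    return args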
def map_sample(force_rerun=False):
    # Single-run operation to create a lengths database
    _create_length_db()
    indexf = Mapping.OM_RGC.IndexFile
    os.chdir(mkdirifnotexists(join(Mapping.OM_RGC.MapDir, 'tmp')))
    all_files = glob(join(RawFastq.ALOHA_BATS.FastqDir, '*.fastq.gz')) + \
        glob(join(RawFastq.TARA.FastqDir, '*.fastq.gz')) + \
        glob(join(RawFastq.bioGEOTRACES.FastqDir, '*.fastq.gz'))
    flist = []
    for fq in all_files:
        prefix = basename(fq.replace('.fastq.gz', ''))
        prefix = join(Mapping.OM_RGC.MapDir, prefix)
        flist.append((fq, prefix))
    # TODO: IMPORTANT! Wrap this for loop with your HPC job submission pipeline.
    # Estimated total CPU time for this part: >10,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
    for fq, prefix in flist:
        if exists(prefix + '.icrabamdone') and not force_rerun:
            print('{} pipeline completed, no need to rerun'.format(prefix))
            continue
        args = (fq, None, prefix, indexf)
        mapkwargs = dict(threads=16, map_param_dict=Mapping.OM_RGC.MapParams)
        if not exists(prefix + '.pmpdone') or force_rerun:
            do_single_mappmp(*args, **mapkwargs)
    # TODO: IMPORTANT! Wrap this for loop with your HPC job submission pipeline.
    # Estimated total CPU time for this part: >25,000 hours
    for fq, prefix in flist:
        args = (fq, None, prefix, indexf)
        icrakwargs = dict(icra_usage=Mapping.OM_RGC.ICRAUsage,
                          icra_param_dict=Mapping.OM_RGC.ICRAParams,
                          remove_unmapped=Mapping.OM_RGC.RemoveUnmapped,
                          remove_not_delta=Mapping.OM_RGC.RemoveNotDelta,
                          delta_thresh=Mapping.OM_RGC.DeltaThresh,
                          delete_pmp=Mapping.OM_RGC.DeletePMP,
                          delete_old_bam=Mapping.OM_RGC.DeleteOldBAM)
        if exists(prefix + '.icrabamdone') and not force_rerun:
            continue
        # NOTE: this process may be memory intensive, especially for BAM files larger
        # than 25GB. To avoid problems, process these files only on nodes with >256GB memory.
        do_single_icrabam(args, icrakwargs)
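# The TODOs above ask for the per-sample loops to be farmed out to a cluster. As a
# minimal sketch (not part of the original pipeline), one way to wrap the mapping loop
# with SLURM is to submit one job per sample; the module path, helper name, and
# resource numbers below are hypothetical and should be adapted to your scheduler.
import subprocess


def submit_map_job(fq, prefix, indexf):
    # Build a one-liner that maps a single sample, then hand it to sbatch
    cmd = ("python -c \"from Mapping import do_single_mappmp; "
           "do_single_mappmp('{}', None, '{}', '{}', threads=16)\"").format(fq, prefix, indexf)
    subprocess.run(['sbatch', '--cpus-per-task=16', '--mem=64G', '--wrap', cmd],
                   check=True)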
def main():
    os.chdir(mkdirifnotexists(join(Calling.OM_RGC.CallDir, 'tmp')))
    bioG_m = read_pickle(Biodata.bioGEOTRACES.metadataDF)
    ALOHA_m = read_pickle(Biodata.ALOHA_BATS.metadataDF)
    TARA_m = read_pickle(Biodata.TARA.metadataDF)
    allbams = concat([TARA_m, ALOHA_m, bioG_m], sort=False)[['ICRABAM_1', 'ICRABAM_2']]
    dirnames = sorted(set(ref[:-4] for ref in
                          pysam.AlignmentFile(allbams.iloc[-1]['ICRABAM_1']).header.references))
    # This is set to process 80 genes (each with all samples) at a time. Raising it
    # will make everything run faster on an HPC system, but will take up more memory
    # and more space for intermediate files.
    dirnamegrps = [dirnames[i:i + 80] for i in range(0, len(dirnames), 80)]
    for reference_list in dirnamegrps:
        # TODO: IMPORTANT! Wrap the loops in the called method with your HPC job
        # submission pipeline. Also IMPORTANT! Make sure each loop runs synchronously
        # with the next (wait for one to finish before you start the next).
        # Estimated total CPU time for this part: >25,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
        do_references(allbams, Calling.OM_RGC.CallDir, reference_list,
                      Calling.OM_RGC.FilterThreshold, Calling.OM_RGC.DbFasta,
                      Calling.OM_RGC.mpileupParams, THREADS)
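# A minimal sketch of the "run chunks synchronously" requirement above: if each call to
# do_references is submitted as a set of cluster jobs, block until the whole set
# finishes before submitting the next chunk. The two job-tracking callables here are
# hypothetical placeholders for your scheduler's API.
import time


def run_chunks_synchronously(dirnamegrps, submit_chunk, jobs_still_running):
    for reference_list in dirnamegrps:
        job_ids = submit_chunk(reference_list)  # e.g. one job per reference
        while jobs_still_running(job_ids):      # poll the scheduler
            time.sleep(60)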
def splitbam(bam_fname, outbasedir, reference_list):
    # Counting on input to be sorted
    with pysam.AlignmentFile(bam_fname) as af_in:  # @UndefinedVariable
        for dirnm, grp in groupby(af_in, _dirfromrec):
            if reference_list is not None and dirnm not in reference_list:
                continue
            lgrp = list(grp)
            if len(lgrp) == 0:
                continue
            reference_names = [af_in.get_reference_name(i)
                               for i in range(lgrp[-1].reference_id + 1)]
            reference_lengths = [af_in.header.get_reference_length(nm)
                                 for nm in reference_names]
            outsplitdir = mkdirifnotexists(join(outbasedir, dirnm))
            with pysam.AlignmentFile(  # @UndefinedVariable
                    join(outsplitdir, basename(bam_fname.replace('.s.filt', ''))),
                    'wb', reference_names=reference_names,
                    reference_lengths=reference_lengths) as af_out:
                for rec in lgrp:
                    af_out.write(rec)
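# A minimal usage sketch (hypothetical driver, not in the original code): split every
# per-sample ICRA BAM in the metadata table into per-gene-group BAMs under the calling
# directory, restricted to the references in the current chunk.
def _example_split_all(allbams, reference_list):
    for bam in allbams['ICRABAM_1'].dropna():
        splitbam(bam, Calling.OM_RGC.CallDir, reference_list)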
        positions=[1, 2, 3, 4, 6, 7, 8])
    for partname in ('cbars', 'cmins', 'cmaxes'):
        vp = bp[partname]
        vp.set_edgecolor('k')
        vp.set_linewidth(1)
    [m.set_color('#0d4c7c') for m in bp['bodies'][:4]]
    [m.set_color('#891919') for m in bp['bodies'][-3:]]
    ax.set_ylim(0, 1.)
    ax.set_xticks(range(1, 9))
    plt.savefig(join(outdir, 'Codon_usage.png'), dpi=144)


if __name__ == '__main__':
    create_codon_trans_matrix(SNP.OM_RGC.OutDir,
                              mkdirifnotexists(join(SNP.OM_RGC.OutDir, 'play')), 'All')
    # Replicates the analysis presented in Figs. 2 and 3
    # Replace outdir with a desired output directory
    compiled_f = million_codes()
    plot_code_histograms(compiled_f, outdir='.')
    # Replicates the analysis presented in Fig. 4 and saves it as a table
    multi_organism_analyze()
    # Replicates the analysis in Fig. 5A and creates a plot
    # Replace outdir with a desired output directory
    codon_bias(outdir='.')
    # Replicates the analysis in Fig. 5B and creates a plot
def codon_risk(df, aas, prefix, all_mutations=True, external_counts=None,
               external_titv=None, subdir=None):
    stops = ['TAA', 'TAG', 'TGA']
    if external_counts is None:
        df = df.sum(1).to_frame('all_muts')
        dfnostop = df.loc[[(i, j) for i, j in df.index
                           if i not in stops and j not in stops]]
        codonabuns = df.groupby(level=0).sum()
    else:
        codonabuns = external_counts.to_frame('all_muts')
    codonabuns['AAs'] = [aas[i] for i in codonabuns.index]
    meanaas = codonabuns.groupby('AAs').median()
    codonabuns = codonabuns.join(meanaas, on='AAs', lsuffix='_1')\
        .drop(['all_muts_1', 'AAs'], axis=1)
    codonabuns = codonabuns.truediv(codonabuns.sum())\
        .rename(columns={'all_muts': 'codon_abuns'})
    ret = defaultdict(list)
    for it in range(10001):
        if it == 0:
            # Iteration 0 scores the real genetic code; all later iterations score
            # scrambled codes
            newaas = aas
            newcodonabuns = codonabuns
        else:
            # To maintain the abundances of the codons coding for the same amino acids
            codonaas = codonabuns.copy()
            codonaas['AAs'] = [aas[i] for i in codonabuns.index]
            codonaas = codonaas.reset_index().groupby('AAs')\
                .apply(lambda x: x.reset_index()).drop(['level_0', 'AAs'], axis=1)
            newaas = scramble_codons(aas)
            codon_shuf = newaas.reset_index().groupby('AAs')\
                .apply(lambda x: x.reset_index()).drop(['level_0', 'AAs'], axis=1)
            newcodonabuns = codonaas.join(codon_shuf, lsuffix='_1')\
                .set_index('index')['codon_abuns']
        # Estimate the abundance of mutations using fourfold degenerate synonymous mutations
        if external_titv is None:
            mutabuns = getmuts(filter_nonsyn(dfnostop, aas, True), all_mutations)\
                .drop('mutation_pos', axis=1).groupby('mutation_type').mean().drop('None')
            mutabuns = mutabuns.truediv(mutabuns.sum())\
                .rename(columns={'all_muts': 'mut_abuns'})
        else:
            mutabuns = DataFrame({'mut_abuns': {'Transition': external_titv[0],
                                                'Transversion': external_titv[1]}})
            mutabuns.index.name = 'mutation_type'
        mut_costs = get_mut_costs(newaas)
        newstops = newaas[newaas == '*'].index
        allabuns = getmuts(mut_costs, all_mutations)\
            .join(mutabuns, on='mutation_type').reset_index()\
            .join(newcodonabuns, on='aa_start').set_index(['aa_start', 'aa_end'])\
            .drop(['mutation_pos', 'mutation_type'], axis=1)
        allabuns = allabuns.loc[[(i, j) for i, j in allabuns.index
                                 if i not in newstops and j not in newstops]]

        def applyfunc(x, col):
            return x[col] * x.mut_abuns * x.codon_abuns

        ret['hyd_risk'].append(
            allabuns.apply(lambda x: applyfunc(x, col='Hyd_d'), axis=1).sum())
        ret['PR_risk'].append(
            allabuns.apply(lambda x: applyfunc(x, col='PR_d'), axis=1).sum())
        ret['n+_risk'].append(allabuns[allabuns.N_d > 0].apply(
            lambda x: applyfunc(x, col='N_d'), axis=1).sum())
        ret['c+_risk'].append(allabuns[allabuns.C_d > 0].apply(
            lambda x: applyfunc(x, col='C_d'), axis=1).sum())
        ret['o+_risk'].append(allabuns[allabuns.O_d > 0].apply(
            lambda x: applyfunc(x, col='O_d'), axis=1).sum())
        ret['code'].append(newaas)
    if subdir is not None:
        outfol = mkdirifnotexists(join(CodeAnalysis.CodonsDir, subdir))
        outfname = 'Codon_risk_{}_{}_pstop_medaa.dat'.format(
            'allmut' if all_mutations else 'TiTv', prefix)
        print(outfname)
        Utils.Write(join(outfol, outfname), ret)
    for k in ret:
        if k != 'code':
            # Empirical p-value: fraction of the 10,000 scrambled codes whose risk is
            # below that of the real code (iteration 0)
            print('{}: {}'.format(k, sum(v < ret[k][0] for v in ret[k]) / 10000.))
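# A minimal, self-contained sketch (toy inputs, not pipeline data) of the risk score
# computed above: each possible substitution contributes
#     cost(codon -> codon') * P(mutation type) * P(codon),
# and codon_risk compares the real code's total against 10,000 codes with scrambled
# codon-to-amino-acid assignments, i.e. an empirical permutation test.
def _toy_risk(costs, mut_abuns, codon_abuns):
    # costs: {(codon, codon'): chemical cost}; mut_abuns: {(codon, codon'): P(mut type)};
    # codon_abuns: {codon: P(codon)} -- all hypothetical toy inputs
    return sum(cost * mut_abuns[pair] * codon_abuns[pair[0]]
               for pair, cost in costs.items())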
class CodeAnalysis:
    CodonsDir = mkdirifnotexists(join(General.Tmppath, 'Codons'))
class OM_RGC:
    CallDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Call'))
    DbFasta = join(Mapping.OM_RGC.IndexDir, 'OM-RGC_seq.fasta')
    FilterThreshold = 0.9
    mpileupParams = dict(min_base_qual=15, max_depth_indel=1000,
                         max_depth=100000, min_iReads=2)
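# The mpileupParams names above resemble bcftools/samtools mpileup options. A hedged
# sketch of how they might translate into a command line; this flag mapping is an
# assumption and is not confirmed by this repository, which builds the call elsewhere:
def _mpileup_cmd(bam, fasta, p):
    return ['bcftools', 'mpileup', '-f', fasta,
            '-Q', str(p['min_base_qual']),    # minimum base quality
            '-d', str(p['max_depth']),        # max per-file depth
            '-L', str(p['max_depth_indel']),  # max per-file depth for indel calling
            '-m', str(p['min_iReads']),       # min gapped reads for indel candidates
            bam]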