def plot_code_histograms(compiled_f, outdir):
    """Plot conditional histograms of permuted-genetic-code risk scores.

    For every ordered pair (nm, nm1) of the four risk metrics, plot the
    distribution of nm's risk over the permuted codes, restricted to
    permutations whose nm1 risk is at or below the standard code's nm1 risk
    (unconditional when nm == nm1), and mark the standard code's value with
    a vertical line.  One PNG per pair is written to *outdir*, and the
    empirical p-value of the standard code is printed.

    :param compiled_f: path to a Utils-serialized dict with 'n+_risk',
        'c+_risk', 'hyd_risk' and 'PR_risk' sequences; element 0 of each is
        the standard genetic code, the rest are permutations.
    :param outdir: directory receiving 'Code_cost_million_hist_{nm}_{nm1}.png'.
    """
    ret = Utils.Load(compiled_f)
    npr = np.array(ret['n+_risk'])
    cpr = np.array(ret['c+_risk'])
    hydr = np.array(ret['hyd_risk'])
    prr = np.array(ret['PR_risk'])
    # Hoisted out of the loops: the original rebuilt these literals twice.
    names = ['N_plus', 'C_plus', 'hyd', 'PR']
    arrays = [npr, cpr, hydr, prr]
    colors = ['#0d4c7c', '#151515', '#018571', '#660099']
    for nm, riskarr, color in zip(names, arrays, colors):
        for nm1, riskarr1 in zip(names, arrays):
            stan = riskarr[0]  # standard genetic code's risk for metric nm
            if nm == nm1:
                locarr = riskarr[1:]
            else:
                # Condition on permutations at least as good as the standard
                # code with respect to metric nm1 (plain boolean-mask
                # indexing; the original wrapped the mask in tuple([...])).
                locarr = riskarr[1:][riskarr1[1:] <= riskarr1[0]]
            _, ax = plt.subplots(1, figsize=(3.5, 2.333), dpi=144)
            ax.hist(locarr, color=color, bins=100, density=True)
            # Yellow underlay + thin black line draws an outlined marker.
            ax.axvline(stan, color='yellow', lw=1)
            ax.axvline(stan, color='k', lw=0.6)
            print('{} given {} {} p={}'.format(
                nm, nm1, stan, sum(locarr <= stan) / len(locarr)))
            plt.savefig(join(
                outdir, 'Code_cost_million_hist_{}_{}.png'.format(nm, nm1)),
                dpi=144)
            plt.close('all')
def main():
    """Run the full OM-RGC SNP pipeline: variant calling -> per-gene selection
    metrics -> collation per KEGG KO / eggNOG OG.

    Each stage skips inputs whose outputs already exist, so the function can
    be re-run to resume an interrupted pipeline.
    """
    # Filter VCF files for high quality SNPs and save them in DataFrame
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part ~1,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
    for fname in glob(join(SNP.OM_RGC.InputDir, '*.vcf')):
        # Skip VCFs already converted to a gene DataFrame.
        if exists(
                join(SNP.OM_RGC.GeneDFDir,
                     basename(fname).replace('.vcf', '.df'))):
            continue
        getvariants(fname, SNP.OM_RGC.GeneDFDir,
                    only_snps=SNP.OM_RGC.OnlySNPs,
                    qual_thresh=SNP.OM_RGC.QualThresh,
                    min_samples=SNP.OM_RGC.MinSamples,
                    min_variants=SNP.OM_RGC.MinVariants)
    # Calculate selection metrics per gene (e.g. pN/pS)
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part >5,000 hours
    for fname in glob(join(SNP.OM_RGC.GeneDFDir, '*.df')):
        outdir = SNP.OM_RGC.OutDir
        # Skip genes for which all three metric outputs already exist.
        if all([exists(join(outdir, basename(fname).replace('.df', ext)))
                for ext in ['.pnps.df', '.ffdeg_pi_wit.df', '.pnpn.df']]):
            continue
        # NOTE(review): min_perc_poss is assigned SNP.OM_RGC.MinPosReads,
        # the same attribute as min_pos_reads on the line above. This looks
        # like a copy-paste slip (a MinPercPoss-style setting was probably
        # intended) -- confirm against the SNP config class.
        analyze_genes(fname, Calling.OM_RGC.DbFasta, outdir,
                      SNP.OM_RGC.CacheDir,
                      min_pos_reads=SNP.OM_RGC.MinPosReads,
                      min_perc_poss=SNP.OM_RGC.MinPosReads,
                      min_total_var_support=SNP.OM_RGC.MinTotalVarSupport,
                      min_maf=SNP.OM_RGC.MinMaf,
                      min_samples=SNP.OM_RGC.MinSamples,
                      min_variants=SNP.OM_RGC.MinVariants)
    # Collate selection metrics per KEGG KO / eggNOG OG. Requires running eggnogMapper on OM-RGC
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # Note: this calculates only pN/pS per KEGG KO / eggNOG OG. To calculate fourfold degenerate
    # pi within (validation) or pN(conservative AA substitutions) vs pN(radical AA substitutions)
    # (also validation), refer to the relevant methods within SNP/CollatedGeneGroups.py
    for db in SNP.OM_RGC.GeneGroupCollateDBs:
        dbdct = Utils.Load(db)
        # Keep only groups with enough genes that actually have metrics.
        dbdct = filter_db(dbdct, SNP.OM_RGC, SNP.OM_RGC.MinGenes)
        for nm, genes in dbdct.items():
            if not exists(
                    join(SNP.OM_RGC.OutDirCollate, 'pnps',
                         split3way(db)[1] + '_' + nm + '.pnps.df')):
                do_one_group_pnps(nm, split3way(db)[1], genes, SNP.OM_RGC,
                                  SNP.OM_RGC.MinGenes)
    # This groups all of the files and saves them in the output folder defined as General.Basepath
    do_collate(join(SNP.OM_RGC.OutDirCollate, 'pnps', 'KEGG'), 4, 60, 5, 50, 5)
    do_collate(join(SNP.OM_RGC.OutDirCollate, 'pnps', 'eggNOG'), 4, 60, 5, 50, 5)
def _getgeneseqs(genes_df_f, db_fasta, gene_names, cachedir):
    """Fetch the sequences of *gene_names* from *db_fasta* as {id: str}.

    The result is cached in *cachedir* under the gene-DataFrame's basename
    ('.df' -> '.genes.dat'); a cache hit is returned without touching the
    FASTA at all.

    :param genes_df_f: path of the gene DataFrame file (used only to derive
        the cache filename).
    :param db_fasta: FASTA database to scan.
    :param gene_names: collection of record ids to extract.
    :param cachedir: directory for the serialized cache file.
    :return: dict mapping record id to its sequence string.
    """
    cache_f = join(cachedir, basename(genes_df_f).replace('.df', '.genes.dat'))
    if exists(cache_f):
        return Utils.Load(cache_f)
    # Membership is tested once per FASTA record and the database can be
    # huge, so hoist the lookup into a set (O(1) per test instead of O(n)
    # on a list/Index).
    wanted = set(gene_names)
    ret = {}
    for rec in SeqIO.parse(db_fasta, 'fasta'):
        if rec.id in wanted:
            ret[rec.id] = str(rec.seq)
            # Stop scanning as soon as every requested gene was found.
            if len(ret) == len(wanted):
                break
    Utils.Write(cache_f, ret)
    return ret
def million_codes():
    """Create one million genetic-code permutations and compile their risks.

    Runs codon_risk in 100 batches (each producing permutations plus the
    standard code as element 0), then concatenates all batch files into a
    single serialized dict, keeping the standard code only once.

    :return: path of the compiled Utils-serialized result file.
    """
    # This creates one million permutations of the genetic code
    aas, _, _ = get_codon_table()
    counts_df = read_pickle(
        join(General.Basepath, 'All_4_60_mutation_codon_counts.df'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for batch in range(100):
        codon_risk(counts_df, aas, 'All_{:02d}'.format(batch), True,
                   subdir='Million')
    compiled_f = join(CodeAnalysis.CodonsDir, 'Codon_risk_compiled.dat')
    compiled = defaultdict(list)
    batch_files = glob(join(CodeAnalysis.CodonsDir, 'Million', '*.dat'))
    for idx, fname in enumerate(batch_files):
        chunk = Utils.Load(fname)
        keep_standard = idx == 0  # element 0 (standard code) only once
        for metric in ['n+_risk', 'c+_risk', 'o+_risk', 'hyd_risk',
                       'PR_risk']:
            values = chunk[metric]
            compiled[metric].extend(values if keep_standard else values[1:])
        print(idx)
    Utils.Write(compiled_f, compiled)
    return compiled_f
def filter_db(dbdct, analysisclass, mingenes):
    """Keep only gene groups with >= *mingenes* genes that have metrics.

    Builds (or loads from cache) the set of genes that appear in any of the
    per-gene pN/pS or pi-within output files, then drops every group in
    *dbdct* whose overlap with that set is below *mingenes*.

    :param dbdct: dict mapping group name -> list of gene ids.
    :param analysisclass: config object; its OutDir holds the metric files.
    :param mingenes: minimal number of covered genes for a group to be kept.
    :return: filtered dict with the same structure as *dbdct*.
    """
    indir = analysisclass.OutDir
    pnps_piwig_f = join(indir, 'PNPSPiWiGenes.dat')
    if exists(pnps_piwig_f):
        pnpsgenes = Utils.Load(pnps_piwig_f)
    else:
        genes_in_use = []
        for fname in glob(join(indir, '*pnps.df')) \
                + glob(join(indir, '*pi_wit.df')):
            print(fname)
            genes_in_use.extend(
                list(read_pickle(fname).index.get_level_values(0).unique()))
        pnpsgenes = list(set(genes_in_use))
        # Cache the deduplicated list; the original wrote the raw
        # genes_in_use list, persisting every duplicate for no benefit.
        Utils.Write(pnps_piwig_f, pnpsgenes)
    pnpsgenes = set(pnpsgenes)  # O(1) membership for the intersections below
    ret = {}
    for k, v in dbdct.items():
        if len(set(v).intersection(pnpsgenes)) >= mingenes:
            ret[k] = v
    return ret
def square_vs_diag(codon_permutation_f, outdir):
    """Compare n+ risk of 'square' vs 'diagonal' vs other code permutations.

    Replicates the analysis presented in Fig. 5B: loads the compiled
    permutation results, partitions the permuted codes (element 0, the
    standard code, is excluded throughout via [1:]) into squares, diagonals
    and the rest, draws a three-group boxplot and prints Mann-Whitney U
    comparisons between the groups.

    :param codon_permutation_f: Utils-serialized dict with 'n+_risk' values
        and the matching 'code' permutation list.
    :param outdir: directory receiving 'Squares_diags.png'.
    """
    # Replicates the analysis presented in Fig. 5B and creates plot
    ret = Utils.Load(codon_permutation_f)
    npr = np.array(ret['n+_risk'])
    codes = ret['code']
    # Boolean masks over the permutations only (codes[1:]).
    squares = np.array([issquare(i) for i in codes[1:]])
    diags = np.array([isdiag(i) for i in codes[1:]])
    print('n diags: {}'.format(sum(diags)))
    print('n squares: {}'.format(sum(squares)))
    _, ax = plt.subplots(1, figsize=(4.7, 5.2), dpi=144)
    # Groups: squares, neither-square-nor-diagonal, diagonals.
    grps = [npr[1:][squares], npr[1:][~squares & ~diags], npr[1:][diags]]
    # NOTE(review): medianprops color '' looks like an invalid matplotlib
    # color spec -- possibly meant 'k' (or left empty to hide the median
    # line); confirm before touching, since it changes the figure.
    ax.boxplot(grps, showfliers=False, whis=(5, 95),
               flierprops={'color': 'k', 'marker': 'x', 'markersize': 2},
               boxprops={'color': 'k', 'lw': 0.6},
               capprops={'color': 'k', 'lw': 0.6},
               whiskerprops={'color': 'k', 'lw': 0.6},
               medianprops={'color': '', 'lw': 1.2})
    ax.set_ylim(0.15, 0.31)
    ax.set_yticks([0.15, 0.2, 0.25, 0.3])
    print('squares vs all: {}'.format(mannwhitneyu(grps[0], grps[1])))
    print('squares vs diags: {}'.format(mannwhitneyu(grps[0], grps[2])))
    print('diags vs all: {}'.format(mannwhitneyu(grps[2], grps[1])))
    plt.savefig(join(outdir, 'Squares_diags.png'), dpi=144)
def multi_organism_analyze():
    """Per-organism genetic-code risk analysis across ti/tv mutation rates.

    Replicates the analysis presented in Fig. 4: for every model organism
    with enough codon counts, computes code risks under a range of
    transition-transversion ratios, then collates the empirical p-values of
    the standard code into a single pickled DataFrame.
    """
    # This replicates the analysis presented in Fig. 4
    # Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5581930/
    codons_all = read_pickle('./resource/ModelOrganisms.df').set_index('Taxid')
    # Take only the organisms with more than 50K codons in the calculation
    # (columns from index 11 on hold the per-codon counts).
    codons_all = codons_all.loc[codons_all.iloc[:, 11:].sum(1) >= 50000]
    aas, _, _ = get_codon_table()
    # Create alternative codes for each organism and transition-transversion rate
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for taxid, row in codons_all.iterrows():
        codons = row[11:].astype(float)
        for titv in [0.2, 0.25, 0.333, 0.5, 0.667, 1, 1.5, 2, 3, 4, 5]:
            # Convert the ti/tv ratio to a transition probability; the pair
            # (ti, 1 - ti) is passed as the external rate weighting.
            ti = (2 * titv) / (1 + 2 * titv)
            codon_risk(None, aas, 'Tax_{}_Rate_{}'.format(taxid, titv),
                       all_mutations=False, external_counts=codons,
                       external_titv=(ti, 1 - ti), subdir='MultiOrg')
    # Collate the results in one table
    proc_stats = {}
    for fnm in glob(join(CodeAnalysis.CodonsDir, 'MultiOrg/*.dat')):
        # Taxid and rate are recovered from the result filename.
        tax = float(basename(fnm).split('Tax_')[-1].split('_')[0])
        rate = float(basename(fnm).split('Rate_')[-1].split('_')[0])
        ret = Utils.Load(fnm)
        npr = np.array(ret['n+_risk'])
        cpr = np.array(ret['c+_risk'])
        # Element 0 is the standard code; elements 1: are the permutations,
        # assumed to number 10,000 here -- hence the fixed denominator.
        # NOTE(review): 'ncpr_p' is not divided by 10000. like its siblings,
        # so despite the _p suffix it is a raw count -- confirm whether the
        # normalization was intentionally omitted.
        proc_stats[(tax, rate)] = {
            'cpr_p': sum(cpr[1:] <= cpr[0]) / 10000.,
            'npr_p': sum(npr[1:] <= npr[0]) / 10000.,
            'ncpr_p': sum((cpr[1:] <= cpr[0]) & (npr[1:] <= npr[0])),
            'cpr': cpr[0],
            'npr': npr[0]
        }
    DataFrame(proc_stats).to_pickle(
        join(CodeAnalysis.CodonsDir, 'MultiOrg_rates.df'))