Example #1
class OM_RGC:
    IndexFasta = Mapping.OM_RGC.IndexFasta
    IndexFaa = IndexFasta.replace('fasta', 'faa')
    SplitDir = mkdirifnotexists(join(Mapping.OM_RGC.IndexDir, 'split'))
    AnnotDir = mkdirifnotexists(join(Mapping.OM_RGC.IndexDir, 'annot'))
    EggnogDir = mkdirifnotexists(join(Mapping.OM_RGC.IndexDir, 'eggnog'))
    # TODO: change these to the directory and python file of eggnog-mapper
    EmapperDir = '~/eggnog/eggnog-mapper'
    EmapperPy = join(EmapperDir, 'emapper.py')
    # TODO: change this to the python2.7 path
    Py27 = '~/anaconda3/envs/py27/bin/python'
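A sketch of how these settings could be used to launch eggNOG mapper on one of the split fasta chunks. The `run_emapper` wrapper and its flags are assumptions, not the repository's actual driver; verify the options against `emapper.py --help` for your eggnog-mapper version:

import os
import subprocess

def run_emapper(fasta_chunk, out_prefix, cpus=8):
    # Hypothetical wrapper: -i / -o / --cpu are common emapper.py options,
    # but confirm them for the version you installed
    subprocess.check_call([os.path.expanduser(OM_RGC.Py27),
                           os.path.expanduser(OM_RGC.EmapperPy),
                           '-i', fasta_chunk, '-o', out_prefix, '--cpu', str(cpus)])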
Example #2
class General:
    #TODO: replace with the filesystem base (the project requires ~35TB of storage for all intermediates)
    Scratch = '~' 
    #TODO: replace with output path for finished dataframes
    Basepath = mkdirifnotexists(join(Scratch, 'Analyses/2019-Oceans/DFOut')) 
    #TODO: replace with output path for intermediates
    Tmppath = mkdirifnotexists(join(Scratch, 'Analyses/2019-Oceans/tmp'))
    #TODO: replace with where you place your data, i.e., samples and databases
    Data = join(Scratch, 'Data')
    #TODO: replace with where you place your samples (e.g. Tara)
    Samples = join(Data, 'Samples')
    #TODO: replace with where you place your sample metadata 
    Metadata = join(Data, 'Metadata')
    #TODO: replace with where you place your databases (e.g. OM-RGC) 
    Databases = join(Data, 'Databases')
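Every snippet on this page leans on a `mkdirifnotexists` helper that the repository defines elsewhere. A minimal sketch of the behavior its call sites imply (create the directory if missing, then return the path so it can be assigned inline); note that `os.makedirs` does not expand the literal '~' placeholder above, which is why the TODOs ask you to replace it:

import os

def mkdirifnotexists(path):
    # Sketch only: the repository's own helper may differ in details
    if not os.path.exists(path):
        os.makedirs(path)
    return path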
Example #3
def do_one_group_ffdeg_piwit(nm, f_prefix, genegroup, analysisclass, mingenes=3):
    ret = []
    retlens = []
    for prefix in set([g[:-4] for g in genegroup]):
        f_in = join(analysisclass.OutDir, prefix + '.ffdeg_pi_wit.df')
        if not exists(f_in):
            continue
        try:
            ldf = read_pickle(f_in)
            ldf = ldf.loc[[g for g in genegroup if g in ldf.index]]
            llensdf = read_pickle(f_in.replace('_pi_wit','_poss'))
            llensdf = llensdf.loc[[g for g in genegroup if g in llensdf.index]]
        except KeyError:
            continue
        if ldf.shape[0] > 0:
            ret.append(ldf)
            retlens.append(llensdf)
    if len(ret) == 0:
        return
    outdf = concat(ret, sort=False).dropna(how='all').dropna(how='all', axis=1)
    if outdf.shape[0] < mingenes:
        return
    outdf_lens = concat(retlens)
    outdf_lens.name = 'Length'
    ret = {}
    for col in outdf:
        coldf = outdf[[col]].multiply(outdf_lens,axis=0).join(outdf_lens).dropna().sum()
        ret[col] = {'pi':coldf[col] / coldf['Length'], 'length':coldf['Length'], 
                    'num_genes':len(outdf[col].dropna())}
    outpath = mkdirifnotexists(join(analysisclass.OutDirCollate, 'ffdeg'))
    DataFrame(ret).to_pickle(join(outpath, f_prefix + '_' + nm + '.ffdeg_pi_wit.df'))
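The per-sample aggregation above is a length-weighted mean pi: each gene's pi is weighted by its length, and the sum is divided by the total length. A toy illustration with hypothetical numbers:

from pandas import DataFrame, Series

# Two genes observed in one sample (values are made up for illustration)
outdf = DataFrame({'sampleA': [0.010, 0.020]}, index=['gene1', 'gene2'])
outdf_lens = Series([300, 100], index=['gene1', 'gene2'], name='Length')

coldf = outdf[['sampleA']].multiply(outdf_lens, axis=0).join(outdf_lens).dropna().sum()
print(coldf['sampleA'] / coldf['Length'])  # (0.010*300 + 0.020*100) / 400 = 0.0125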
Example #4
def do_collate_pnpn(f_prefixes, minpos, minperc, mingenes, minsamples, minsamples_gene):
    tmpdir = mkdirifnotexists(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 'tmpfiles'))
    for fname in glob(f_prefixes + '*.pnpn.df'):
        _collate_pnpn_inner(fname, mingenes, minsamples, minsamples_gene, tmpdir)
    ret = []
    ret_g1 = []
    ret_g2 = []
    for fname in glob(join(tmpdir, '*.tmp.df')):
        ret.append(read_pickle(fname).T)
    for fname in glob(join(tmpdir, '*.tmp.g1.df')):
        ret_g1.append(read_pickle(fname).T)
    for fname in glob(join(tmpdir, '*.tmp.g2.df')):
        ret_g2.append(read_pickle(fname).T)
    outdir = join(SNP.OM_RGC.OutDirCollate, 'pnpn')
    with open(join(outdir, 'pNpNCases.txt'), 'w') as ftxt:
        ftxt.write('Conditions for pN groups in this analysis\n')
        ftxt.write('Always pN(G1)/pN(G2) so invert if G1 is more conservative\n\n')
        bigdf = concat(ret, sort=False)
        bigg1 = concat(ret_g1, sort=False)
        bigg2 = concat(ret_g2, sort=False)
        for j, col in enumerate(bigdf.index.get_level_values(0).unique()):
            ftxt.write('Case {}: {}\n'.format(j,col))
            bigdf.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 
                                            'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.csv'\
                                  .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
            bigg1.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 
                                            'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.g1.csv'\
                                  .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
            bigg2.loc[col].T.to_csv(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 
                                            'pNpN_Case_{}_{}_{}_{}_{}_{}_{}.g2.csv'\
                                  .format(j, f_prefixes.split('/')[-1], minpos, minperc, mingenes, 
                                          minsamples, minsamples_gene)))
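A hypothetical invocation of this collation step; the prefix path is illustrative, the thresholds echo the SNP analysis configuration in Example #5, and minsamples_gene is a guess:

do_collate_pnpn(join(SNP.OM_RGC.OutDirCollate, 'pnpn', 'eggNOG_OGs'),
                minpos=4, minperc=60, mingenes=5, minsamples=20, minsamples_gene=10)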
Example #5
class OM_RGC:
    InputDir = Calling.OM_RGC.CallDir
    GeneDFDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_GeneDFs'))
    OutDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_SNPAnalysis'))
    OutDirCollate = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Collate'))
    OutDirCodons = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Codons'))
    # Replace this with the path where you wrote the eggNOG mapper output for OM-RGC
    eggNOGMapper_outpath = join(Mapping.OM_RGC.IndexDir, 'eggnog')
    GeneGroupCollateDBs = [join(eggNOGMapper_outpath, d) for d in
                           ['KEGG_ko.dat', 'eggNOG_OGs.dat']]
    CacheDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Cache'))
    GeneLengths = join(Mapping.OM_RGC.IndexDir, 'OM-RGC_seq.lengths')
    OnlySNPs = True         # Only SNPs, or also indels
    QualThresh = 30         # Compound quality threshold
    MinSamples = 20         # Minimum number of samples
    MinVariants = 1         # Minimum number of variants per gene
    MinTotalVarSupport = 3  # Minimum total support for a variant call
    MinMaf = 0.01           # Minimum minor allele frequency
    MinPosReads = 4         # Minimum read coverage per position to call a SNP
    MinPercPoss = 60        # Minimal percent of positions meeting the above coverage
    MinGenes = 5            # Minimal number of genes per KEGG KO / eggNOG OG
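MinPosReads and MinPercPoss act together as a coverage gate on gene/sample pairs. The repository's actual filter lives elsewhere; an illustrative version of the rule these two settings describe:

def passes_coverage(position_depths, min_pos_reads=4, min_perc_poss=60):
    # Keep a gene in a sample only if at least min_perc_poss percent of its
    # positions are covered by at least min_pos_reads reads
    covered = sum(1 for d in position_depths if d >= min_pos_reads)
    return 100.0 * covered / len(position_depths) >= min_perc_poss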
Example #6
def do_one_group_pnpn(nm, f_prefix, genegroup, analysisclass, mingenes=3):
    ret = []
    for prefix in set([g[:-4] for g in genegroup]):
        f_in = join(analysisclass.OutDir, prefix + '.pnpn.df')
        if not exists(f_in):
            continue
        try:
            ldf = read_pickle(f_in)
            ldf = ldf.loc[[g for g in genegroup if g in ldf.index]] 
        except KeyError:
            continue
        if ldf.shape[0] > 0:
            ret.append(ldf)
    if len(ret) == 0:
        return
    outdf = concat(ret, sort=False)
    outpath = mkdirifnotexists(join(analysisclass.OutDirCollate, 'pnpn'))
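    # groupby(level=0).first() collapses the concatenated frame to one row per
    # gene, so the mingenes threshold counts distinct genes rather than rows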
    if outdf.groupby(level=0).first().shape[0] < mingenes:
        return
    outdf.to_pickle(join(outpath, f_prefix + '_' + nm + '.pnpn.df'))
Example #7
class OM_RGC:
    IndexDir = join(General.Databases, 'OM-RGC')
    IndexFile = join(IndexDir, 'OM-RGC')
    IndexFasta = join(IndexDir, 'OM-RGC_seq.fasta')
    LengthsFile = join(IndexDir, 'OM-RGC_seq.lens')
    MapDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Mapping'))
    MapParams = dict(preset=MapPreset.SENSITIVE, report_alns=20, minins=0, maxins=500,
                     no_mixed=False, no_discordant=False, dovetail=False, no_contain=False,
                     no_overlap=False)
    ICRAParams = dict(max_mismatch=12, consider_lengths=True, epsilon=1e-6,
                      max_iterations=30, min_bins=4, max_bins=100, min_reads=10,
                      dense_region_coverage=60, length_minimum=300,
                      length_maximum=2e5, use_theta=False, average_read_length=None,
                      force_save_delta=True)
    ICRAUsage = OCEAN_GENES
    RemoveUnmapped = True
    RemoveNotDelta = False
    DeltaThresh = 0.999
    DeletePMP = True
    DeleteOldBAM = True
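MapParams reads like a bowtie2 paired-end configuration. Assuming bowtie2 is indeed the mapper behind the mapping step in Example #8, a sketch of how such a dict could translate into command-line flags (flag names are bowtie2's own; the repository's wrapper may do this differently):

def mapparams_to_bowtie2_args(p):
    args = ['--sensitive']                 # assuming MapPreset.SENSITIVE selects this preset
    args += ['-k', str(p['report_alns'])]  # report up to k alignments per read
    args += ['-I', str(p['minins']), '-X', str(p['maxins'])]  # insert-size bounds
    for flag in ('no_mixed', 'no_discordant', 'dovetail', 'no_contain', 'no_overlap'):
        if p[flag]:
            args.append('--' + flag.replace('_', '-'))
    return args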
Example #8
def map_sample(force_rerun=False):
    # Single-run operation to create a lengths database
    _create_length_db()
    indexf = Mapping.OM_RGC.IndexFile
    os.chdir(mkdirifnotexists(join(Mapping.OM_RGC.MapDir, 'tmp')))
    all_files = glob(join(RawFastq.ALOHA_BATS.FastqDir, '*.fastq.gz')) + \
                glob(join(RawFastq.TARA.FastqDir, '*.fastq.gz')) + \
                glob(join(RawFastq.bioGEOTRACES.FastqDir, '*.fastq.gz'))
    flist = []
    for fq in all_files:
        prefix = basename(fq.replace('.fastq.gz', ''))
        prefix = join(Mapping.OM_RGC.MapDir, prefix)
        flist.append((fq, prefix))

    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part >10,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
    for fq, prefix in flist:
        if exists(prefix + '.icrabamdone') and not force_rerun:
            print('{} pipeline completed, no need to rerun'.format(prefix))
            continue
        args = (fq, None, prefix, indexf)
        mapkwargs = dict(threads=16, map_param_dict=Mapping.OM_RGC.MapParams)
        if not exists(prefix + '.pmpdone') or force_rerun:
            do_single_mappmp(*args, **mapkwargs)

    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part >25,000 hours
    for fq, prefix in flist:
        args = (fq, None, prefix, indexf)
        icrakwargs = dict(icra_usage=Mapping.OM_RGC.ICRAUsage,
                          icra_param_dict=Mapping.OM_RGC.ICRAParams,
                          remove_unmapped=Mapping.OM_RGC.RemoveUnmapped,
                          remove_not_delta=Mapping.OM_RGC.RemoveNotDelta,
                          delta_thresh=Mapping.OM_RGC.DeltaThresh,
                          delete_pmp=Mapping.OM_RGC.DeletePMP,
                          delete_old_bam=Mapping.OM_RGC.DeleteOldBAM)
        if exists(prefix + '.icrabamdone') and not force_rerun:
            continue
        # NOTE: this process may be memory intensive, especially for bam files larger than 25GB.
        # To avoid problems, process these files only on nodes with > 256GB memory
        do_single_icrabam(args, icrakwargs)
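One way to satisfy the "wrap this for loop with your hpc job submission pipeline" TODOs above, assuming a SLURM cluster; the resource numbers and the way each (fq, prefix) pair becomes a shell command are illustrative, not the authors' setup:

import subprocess

def submit_slurm(cmd, jobname, cpus=16, mem='64G'):
    # sbatch accepts a batch script on stdin, so no temporary file is needed
    script = ('#!/bin/bash\n'
              '#SBATCH -J {}\n#SBATCH -c {}\n#SBATCH --mem={}\n'
              '{}\n').format(jobname, cpus, mem, cmd)
    subprocess.run(['sbatch'], input=script.encode(), check=True)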
Example #9
def main():
    os.chdir(mkdirifnotexists(join(Calling.OM_RGC.CallDir, 'tmp')))
    bioG_m = read_pickle(Biodata.bioGEOTRACES.metadataDF)
    ALOHA_m = read_pickle(Biodata.ALOHA_BATS.metadataDF)
    TARA_m = read_pickle(Biodata.TARA.metadataDF)
    allbams = concat([TARA_m, ALOHA_m, bioG_m],
                     sort=False)[['ICRABAM_1', 'ICRABAM_2']]
    dirnames = sorted(list(set([ref[:-4] for ref \
                        in pysam.AlignmentFile(allbams.iloc[-1]['ICRABAM_1']).header.references])))
    # This is set to process 80 genes (each with all samples) at a time.
    # Increasing it will make everything run faster on an HPC system, but will
    # take up more memory and space for intermediate files
    dirnamegrps = [dirnames[i:i + 80] for i in range(0, len(dirnames), 80)]
    for reference_list in dirnamegrps:
        # TODO: IMPORTANT! Wrap the loops in the called method with your hpc job submission pipeline
        # Also IMPORTANT! Make sure each loop runs synchronously with the next (wait for one to
        # finish before you start the next)
        # Estimated total CPU time for this part >25,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
        do_references(allbams, Calling.OM_RGC.CallDir, reference_list,
                      Calling.OM_RGC.FilterThreshold, Calling.OM_RGC.DbFasta,
                      Calling.OM_RGC.mpileupParams, THREADS)
Example #10
def splitbam(bam_fname, outbasedir, reference_list):
    # Relies on the input BAM being sorted (reads grouped by reference)
    with pysam.AlignmentFile(bam_fname) as af_in:  # @UndefinedVariable
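        # _dirfromrec (defined elsewhere in the repo) presumably derives the
        # per-gene output directory from a record's reference name, cf. the
        # ref[:-4] convention in Example #9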
        for dirnm, grp in groupby(af_in, _dirfromrec):
            if reference_list is not None and dirnm not in reference_list:
                continue
            lgrp = list(grp)
            if len(lgrp) == 0: continue
            reference_names = [
                af_in.get_reference_name(i)
                for i in range(lgrp[-1].reference_id + 1)
            ]
            reference_lengths = [
                af_in.header.get_reference_length(nm) for nm in reference_names
            ]
            outsplitdir = mkdirifnotexists(join(outbasedir, dirnm))
            with pysam.AlignmentFile(
                    join(outsplitdir, basename(bam_fname.replace(
                        '.s.filt', ''))),  # @UndefinedVariable
                    'wb',
                    reference_names=reference_names,
                    reference_lengths=reference_lengths) as af_out:
                for rec in lgrp:
                    af_out.write(rec)
Example #11
                       positions=[1, 2, 3, 4, 6, 7, 8])
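    # The snippet starts mid-call: `bp` is presumably the dict returned by
    # ax.violinplot(...), whose 'bodies' are the violin patches and whose
    # 'cbars'/'cmins'/'cmaxes' entries are the LineCollections styled below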
    for partname in ('cbars', 'cmins', 'cmaxes'):
        vp = bp[partname]
        vp.set_edgecolor('k')
        vp.set_linewidth(1)

    for m in bp['bodies'][:4]:
        m.set_color('#0d4c7c')
    for m in bp['bodies'][-3:]:
        m.set_color('#891919')
    ax.set_ylim(0, 1.)
    ax.set_xticks(range(1, 9))
    plt.savefig(join(outdir, 'Codon_usage.png'), dpi=144)


if __name__ == '__main__':
    create_codon_trans_matrix(
        SNP.OM_RGC.OutDir, mkdirifnotexists(join(SNP.OM_RGC.OutDir, 'play')),
        'All')

    # Replicates analysis presented in Fig. 2, 3
    # Replace outdir with a desired output directory
    compiled_f = million_codes()
    plot_code_histograms(compiled_f, outdir='.')

    # Replicates analysis presented in Fig. 4 and saves it as a table
    multi_organism_analyze()

    # Replicates analysis in Fig. 5A and creates plot
    # Replace outdir with a desired output directory
    codon_bias(outdir='.')

    # Replicates analysis in Fig. 5B and creates plot
Example #12
def codon_risk(df,
               aas,
               prefix,
               all_mutations=True,
               external_counts=None,
               external_titv=None,
               subdir=None):
    stops = ['TAA', 'TAG', 'TGA']
    if external_counts is None:
        df = df.sum(1).to_frame('all_muts')
        dfnostop = df.loc[[(i, j) for i, j in df.index
                           if i not in stops and j not in stops]]
        codonabuns = df.groupby(level=0).sum()
    else:
        codonabuns = external_counts.to_frame('all_muts')
    codonabuns['AAs'] = [aas[i] for i in codonabuns.index]
    meanaas = codonabuns.groupby('AAs').median()
    codonabuns = codonabuns.join(meanaas, on='AAs',
                                 lsuffix='_1').drop(['all_muts_1', 'AAs'],
                                                    axis=1)
    codonabuns = codonabuns.truediv(
        codonabuns.sum()).rename(columns={'all_muts': 'codon_abuns'})
    ret = defaultdict(list)
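    # Iteration 0 scores the real genetic code; the other 10,000 iterations
    # presumably score codes with codon blocks reassigned between amino acids
    # (scramble_codons), yielding the empirical p-values printed at the end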
    for it in range(10001):
        if it == 0:
            newaas = aas
            newcodonabuns = codonabuns
        else:
            # To maintain the abundances of the codons coding for the same amino acids
            codonaas = codonabuns.copy()
            codonaas['AAs'] = [aas[i] for i in codonabuns.index]
            codonaas = codonaas.reset_index().groupby('AAs').apply(lambda x:x.reset_index())\
                        .drop(['level_0','AAs'], axis=1)
            newaas = scramble_codons(aas)
            codon_shuf = newaas.reset_index().groupby('AAs').apply(lambda x:x.reset_index())\
                        .drop(['level_0','AAs'], axis=1)
            newcodonabuns = codonaas.join(
                codon_shuf, lsuffix='_1').set_index('index')['codon_abuns']
        # Estimate the abundance of mutations using fourfold degenerate synonymous mutations
        if external_titv is None:
            mutabuns = getmuts(filter_nonsyn(dfnostop, aas, True), all_mutations)\
                        .drop('mutation_pos', axis=1).groupby('mutation_type').mean().drop('None')
            mutabuns = mutabuns.truediv(
                mutabuns.sum()).rename(columns={'all_muts': 'mut_abuns'})
        else:
            mutabuns = DataFrame({
                'mut_abuns': {
                    'Transition': external_titv[0],
                    'Transversion': external_titv[1]
                }
            })
            mutabuns.index.name = 'mutation_type'
        mut_costs = get_mut_costs(newaas)
        newstops = newaas[newaas == '*'].index
        allabuns = getmuts(mut_costs, all_mutations).join(mutabuns, on='mutation_type').reset_index()\
                    .join(newcodonabuns, on='aa_start').set_index(['aa_start','aa_end'])\
                    .drop(['mutation_pos','mutation_type'], axis=1)
        allabuns = allabuns.loc[[(i,j) for i,j in allabuns.index \
                                 if i not in newstops and j not in newstops]]

        def applyfunc(x, col):
            return x[col] * x.mut_abuns * x.codon_abuns

        ret['hyd_risk'].append(
            allabuns.apply(lambda x: applyfunc(x, col='Hyd_d'), axis=1).sum())
        ret['PR_risk'].append(
            allabuns.apply(lambda x: applyfunc(x, col='PR_d'), axis=1).sum())
        ret['n+_risk'].append(allabuns[allabuns.N_d > 0].apply(
            lambda x: applyfunc(x, col='N_d'), axis=1).sum())
        ret['c+_risk'].append(allabuns[allabuns.C_d > 0].apply(
            lambda x: applyfunc(x, col='C_d'), axis=1).sum())
        ret['o+_risk'].append(allabuns[allabuns.O_d > 0].apply(
            lambda x: applyfunc(x, col='O_d'), axis=1).sum())
        ret['code'].append(newaas)
    if subdir is not None:
        outfol = mkdirifnotexists(join(CodeAnalysis.CodonsDir, subdir))
    else:
        outfol = CodeAnalysis.CodonsDir
    outfname = 'Codon_risk_{}_{}_pstop_medaa.dat'.format(
        'allmut' if all_mutations else 'TiTv', prefix)
    print(outfname)
    Utils.Write(join(outfol, outfname), ret)
    for k in ret:
        if k != 'code':
            # Empirical p-value: fraction of shuffled codes scoring below the real code
            print('{}: {}'.format(k, sum(v < ret[k][0] for v in ret[k]) / 10000.))
Example #13
class CodeAnalysis:
    CodonsDir = mkdirifnotexists(join(General.Tmppath, 'Codons'))
Example #14
class OM_RGC:
    CallDir = mkdirifnotexists(join(General.Tmppath, 'OM-RGC_Call'))
    DbFasta = join(Mapping.OM_RGC.IndexDir, 'OM-RGC_seq.fasta')
    FilterThreshold = 0.9
    mpileupParams = dict(min_base_qual=15, max_depth_indel=1000,
                         max_depth=100000, min_iReads=2)
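These mpileupParams look like they map onto bcftools mpileup options. Assuming bcftools is the underlying caller, an illustrative translation (verify the flags against your bcftools version):

def mpileup_args(p):
    return ['-Q', str(p['min_base_qual']),    # --min-BQ: minimum base quality considered
            '-L', str(p['max_depth_indel']),  # --max-idepth: depth cap for indel calling
            '-d', str(p['max_depth']),        # --max-depth: per-file raw depth cap
            '-m', str(p['min_iReads'])]       # --min-ireads: gapped reads needed for indel candidates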