Example #1
def compare_sigs(sag_id, sag_file, mhr_path, sig_path, mg_sig_list,
                 jacc_threshold):
    sag_subcontigs = s_utils.get_seqs(sag_file)
    if isfile(o_join(mhr_path, sag_id + '.mhr_recruits.tsv')):
        logging.info('[SABer]: Loading %s and MetaG signature recruit list\n' %
                     sag_id)
        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'),
                  'r') as mhr_in:
            pass_list = [
                x.rstrip('\n').split('\t') for x in mhr_in.readlines()
            ]
    else:
        # Calculate/Load MinHash Signatures with SourMash for SAG subseqs
        if isfile(o_join(sig_path, sag_id + '.SAG.sig')):
            logging.info('[SABer]: Loading Signature for %s\n' % sag_id)
            sag_sig = sourmash.signature.load_one_signature(
                o_join(sig_path, sag_id + '.SAG.sig'))
        else:
            logging.info('[SABer]: Building Signature for %s\n' % sag_id)
            sag_minhash = sourmash.MinHash(n=0, ksize=51, scaled=100)
            for sg_head in sag_subcontigs:
                sag_subseq = str(sag_subcontigs[sg_head].seq)
                sag_minhash.add_sequence(sag_subseq, force=True)
            sag_sig = sourmash.SourmashSignature(sag_minhash, name=sag_id)
            with open(o_join(sig_path, sag_id + '.SAG.sig'), 'w') as sags_out:
                sourmash.signature.save_signatures([sag_sig], fp=sags_out)
        logging.info('[SABer]: Comparing %s and MetaG signature\n' % sag_id)
        pass_list = []
        for mg_sig in mg_sig_list:
            jacc_sim = mg_sig.similarity(sag_sig)
            mg_nm = mg_sig.name()
            if jacc_sim >= jacc_threshold:
                pass_list.append([sag_id, mg_nm, mg_nm.rsplit('_', 1)[0]])

        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'),
                  'w') as mhr_out:
            mhr_out.write('\n'.join(['\t'.join(x) for x in pass_list]))
    pass_list = tuple(pass_list)

    return pass_list
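
A minimal driver sketch for compare_sigs, assuming the module-level imports used in the function body (sourmash, s_utils, os.path.isfile as isfile, os.path.join as o_join) and a pre-built MetaG signature file; the IDs, paths, and the 0.95 Jaccard cutoff are all illustrative. Note that Example #4 dispatches this function via compare_sigs.remote(...), which suggests the original carries a @ray.remote decorator; as reproduced here it can be called directly.

import sourmash

# Hypothetical inputs: one SAG FASTA plus pre-built MetaG subcontig signatures.
mg_sig_list = tuple(
    sourmash.signature.load_signatures('signatures/metaG.metaG.sig'))
pass_list = compare_sigs('SAG042', 'SAG042.fasta', 'minhash_recruits',
                         'signatures', mg_sig_list, jacc_threshold=0.95)
for sag_id, subcontig_id, contig_id in pass_list:
    print(sag_id, subcontig_id, contig_id)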
Example #2
def run_tetra_recruiter(tra_path, sag_sub_files, mg_sub_file, rpkm_max_df,
                        gmm_per_pass):
    # TODO: 1. Think about using Minimum Description Length (MDL) instead of AIC/BIC
    #        2. [Normalized Maximum Likelihood or Fisher Information Approximation]
    #        3. Can TetraNuc Hz be calc'ed for each sample? Does that improve things?
    #            (think about http://merenlab.org/2020/01/02/visualizing-metagenomic-bins/#introduction)

    mg_id = mg_sub_file[0]
    mg_subcontigs = s_utils.get_seqs(mg_sub_file[1])
    mg_headers = tuple(mg_subcontigs.keys())

    mg_subs = tuple([r.seq for r in mg_subcontigs])
    #mg_id, mg_headers, mg_subs = mg_subcontigs

    # Build/Load tetramers for SAGs and MG subset by abundance recruits
    if isfile(o_join(tra_path, mg_id + '.tetras.tsv')):
        logging.info('[SABer]: Loading tetramer Hz matrix for %s\n' % mg_id)
        mg_tetra_df = pd.read_csv(o_join(tra_path, mg_id + '.tetras.tsv'),
                                  sep='\t',
                                  index_col=0,
                                  header=0)
    else:
        logging.info('[SABer]: Calculating tetramer Hz matrix for %s\n' %
                     mg_id)
        mg_tetra_df = pd.DataFrame.from_dict(s_utils.tetra_cnt(mg_subs))
        mg_tetra_df['contig_id'] = mg_headers
        mg_tetra_df.set_index('contig_id', inplace=True)
        mg_tetra_df.to_csv(o_join(tra_path, mg_id + '.tetras.tsv'), sep='\t')

    gmm_total_pass_list = []
    svm_total_pass_list = []
    iso_total_pass_list = []
    comb_total_pass_list = []
    #for sag_id, sag_headers, sag_subs in sag_subcontigs:
    for sag_rec in sag_sub_files:
        sag_id, sag_file = sag_rec
        sag_subcontigs = s_utils.get_seqs(sag_file)
        sag_headers = tuple(sag_subcontigs.keys())
        sag_subs = tuple([r.seq for r in sag_subcontigs])  # was .itervalues(), a Python 2 remnant

        if (isfile(o_join(tra_path, sag_id + '.gmm_recruits.tsv'))
                and isfile(o_join(tra_path, sag_id + '.svm_recruits.tsv'))
                and isfile(o_join(tra_path, sag_id + '.iso_recruits.tsv'))
                and isfile(o_join(tra_path, sag_id + '.comb_recruits.tsv'))):
            logging.info('[SABer]: Loading %s tetramer Hz recruit list\n' %
                         sag_id)
            with open(o_join(tra_path, sag_id + '.gmm_recruits.tsv'),
                      'r') as tra_in:
                gmm_pass_list = [
                    x.rstrip('\n').split('\t') for x in tra_in.readlines()
                ]
            with open(o_join(tra_path, sag_id + '.svm_recruits.tsv'),
                      'r') as tra_in:
                svm_pass_list = [
                    x.rstrip('\n').split('\t') for x in tra_in.readlines()
                ]
            with open(o_join(tra_path, sag_id + '.iso_recruits.tsv'),
                      'r') as tra_in:
                iso_pass_list = [
                    x.rstrip('\n').split('\t') for x in tra_in.readlines()
                ]
            with open(o_join(tra_path, sag_id + '.comb_recruits.tsv'),
                      'r') as tra_in:
                comb_pass_list = [
                    x.rstrip('\n').split('\t') for x in tra_in.readlines()
                ]
        else:
            if isfile(o_join(tra_path, sag_id + '.tetras.tsv')):
                logging.info('[SABer]: Loading tetramer Hz matrix for %s\n' %
                             sag_id)
                sag_tetra_df = pd.read_csv(o_join(tra_path,
                                                  sag_id + '.tetras.tsv'),
                                           sep='\t',
                                           index_col=0,
                                           header=0)
            else:
                logging.info(
                    '[SABer]: Calculating tetramer Hz matrix for %s\n' %
                    sag_id)
                sag_tetra_df = pd.DataFrame.from_dict(
                    s_utils.tetra_cnt(sag_subs))
                sag_tetra_df['contig_id'] = sag_headers
                sag_tetra_df.set_index('contig_id', inplace=True)
                sag_tetra_df.to_csv(o_join(tra_path, sag_id + '.tetras.tsv'),
                                    sep='\t')

            # Concat SAGs and MG for GMM
            mg_rpkm_contig_list = list(rpkm_max_df.loc[
                rpkm_max_df['sag_id'] == sag_id]['subcontig_id'].values)

            mg_tetra_filter_df = mg_tetra_df.loc[mg_tetra_df.index.isin(
                mg_rpkm_contig_list)]
            #concat_tetra_df = pd.concat([sag_tetra_df, mg_tetra_filter_df])
            '''
            normed_tetra_df = concat_tetra_df
            sag_normed_tetra_df = normed_tetra_df[
                normed_tetra_df.index.isin(sag_tetra_df.index)
            ]
            mg_normed_tetra_df = normed_tetra_df.loc[
                normed_tetra_df.index.isin(mg_tetra_filter_df.index)
            ]

            # UMAP for Dimension reduction of tetras
            sag_features = sag_normed_tetra_df.values
            sag_targets = sag_normed_tetra_df.index.values
            mg_features = mg_normed_tetra_df.values
            mg_targets = mg_normed_tetra_df.index.values
            normed_features = normed_tetra_df.values
            normed_targets = normed_tetra_df.index.values

            logging.info('[SABer]: Dimension reduction of tetras with UMAP\n')
            umap_trans = umap.UMAP(n_neighbors=2, min_dist=0.0,
                                   n_components=num_components, metric='manhattan',
                                   random_state=42
                                   ).fit_transform(normed_features)
            pc_col_names = ['pc' + str(x) for x in range(1, num_components + 1)]
            umap_df = pd.DataFrame(umap_trans, columns=pc_col_names, index=normed_targets)

            sag_umap_df = umap_df.loc[umap_df.index.isin(sag_tetra_df.index)]
            mg_umap_df = umap_df.loc[umap_df.index.isin(mg_tetra_filter_df.index)]

            sag_tetra_df = concat_tetra_df.loc[
                                    concat_tetra_df.index.isin(sag_tetra_df.index)
                                    ]
            mg_tetra_df = concat_tetra_df.loc[
                                    concat_tetra_df.index.isin(mg_tetra_filter_df.index)
                                    ]
            '''

            logging.info('[SABer]: Calculating AIC/BIC for GMM components\n')
            sag_train_vals = [1 for x in sag_tetra_df.index]
            n_components = np.arange(1, 5, 1)
            models = [GMM(n, random_state=42) for n in n_components]
            bics = []
            aics = []
            for i, model in enumerate(models):
                n_comp = n_components[i]
                try:
                    bic = model.fit(sag_tetra_df.values,
                                    sag_train_vals).bic(sag_tetra_df.values)
                    bics.append(bic)
                except Exception:
                    logging.info('[WARNING]: BIC failed with %s components\n' %
                                 n_comp)
                try:
                    aic = model.fit(sag_tetra_df.values,
                                    sag_train_vals).aic(sag_tetra_df.values)
                    aics.append(aic)
                except Exception:
                    logging.info('[WARNING]: AIC failed with %s components\n' %
                                 n_comp)

            min_bic_comp = n_components[bics.index(min(bics))]
            min_aic_comp = n_components[aics.index(min(aics))]
            logging.info('[SABer]: Min AIC/BIC at %s/%s, respectively\n' %
                         (min_aic_comp, min_bic_comp))
            logging.info('[SABer]: Using BIC as guide for GMM components\n')
            logging.info('[SABer]: Training GMM on SAG tetras\n')
            gmm = GMM(n_components=min_bic_comp,
                      random_state=42).fit(sag_tetra_df.values)
            logging.info('[SABer]: GMM Converged: %s\n' % gmm.converged_)
            try:  # TODO: add predict and predict_proba to this and output all to table
                sag_scores = gmm.score_samples(sag_tetra_df.values)
                sag_scores_df = pd.DataFrame(data=sag_scores,
                                             index=sag_tetra_df.index.values)
                sag_scores_df.columns = ['wLogProb']
                sag_score_min = min(sag_scores_df.values)[0]
                sag_score_max = max(sag_scores_df.values)[0]
                mg_scores = gmm.score_samples(mg_tetra_filter_df.values)
                mg_scores_df = pd.DataFrame(
                    data=mg_scores, index=mg_tetra_filter_df.index.values)
                mg_scores_df.columns = ['wLogProb']
                gmm_pass_df = mg_scores_df.loc[
                    (mg_scores_df['wLogProb'] >= sag_score_min)
                    & (mg_scores_df['wLogProb'] <= sag_score_max)]
                # And it has to be from the RPKM pass list
                gmm_pass_df = gmm_pass_df.loc[gmm_pass_df.index.isin(
                    mg_rpkm_contig_list)]
                gmm_pass_list = []
                for md_nm in gmm_pass_df.index.values:
                    gmm_pass_list.append(
                        [sag_id, md_nm, md_nm.rsplit('_', 1)[0]])
            except Exception:
                logging.info('[SABer]: Warning: No recruits found...\n')
                gmm_pass_list = []

            logging.info('[SABer]: Training OCSVM on SAG tetras\n')
            # fit OCSVM
            clf = svm.OneClassSVM()
            clf.fit(sag_tetra_df.values)
            sag_pred = clf.predict(sag_tetra_df.values)
            #sag_pred_df = pd.DataFrame(data=sag_pred, index=sag_tetra_df.index.values)
            mg_pred = clf.predict(mg_tetra_filter_df.values)
            mg_pred_df = pd.DataFrame(data=mg_pred,
                                      index=mg_tetra_filter_df.index.values)
            svm_pass_df = mg_pred_df.loc[mg_pred_df[0] != -1]
            # And it has to be from the RPKM pass list
            svm_pass_df = svm_pass_df.loc[svm_pass_df.index.isin(
                mg_rpkm_contig_list)]
            svm_pass_list = []
            for md_nm in svm_pass_df.index.values:
                svm_pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])

            logging.info('[SABer]: Training Isolation Forest on SAG tetras\n')
            # fit IsoForest
            clf = IsolationForest(random_state=42)
            clf.fit(sag_tetra_df.values)
            #clf.set_params(n_estimators=20)  # add 10 more trees
            #clf.fit(sag_tetra_df.values)  # fit the added trees
            mg_pred = clf.predict(mg_tetra_filter_df.values)
            mg_pred_df = pd.DataFrame(data=mg_pred,
                                      index=mg_tetra_filter_df.index.values)
            iso_pass_df = mg_pred_df.loc[mg_pred_df[0] != -1]
            # And it has to be from the RPKM pass list
            iso_pass_df = iso_pass_df.loc[iso_pass_df.index.isin(
                mg_rpkm_contig_list)]
            iso_pass_list = []
            for md_nm in iso_pass_df.index.values:
                iso_pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])

            gmm_id_list = [x[1] for x in gmm_pass_list]
            svm_id_list = [x[1] for x in svm_pass_list]
            iso_id_list = [x[1] for x in iso_pass_list]
            comb_set_list = list(
                set(gmm_id_list) & set(svm_id_list) & set(iso_id_list))
            #comb_set_list = list(set(gmm_id_list) & set(svm_id_list))
            comb_pass_list = []
            for md_nm in comb_set_list:
                comb_pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])

            logging.info('[SABer]: Recruited %s subcontigs to %s with GMM\n' %
                         (len(gmm_pass_list), sag_id))
            logging.info('[SABer]: Recruited %s subcontigs to %s with SVM\n' %
                         (len(svm_pass_list), sag_id))
            logging.info(
                '[SABer]: Recruited %s subcontigs to %s with Isolation Forest\n'
                % (len(iso_pass_list), sag_id))
            logging.info(
                '[SABer]: Recruited %s subcontigs to %s with combined methods\n'
                % (len(comb_pass_list), sag_id))

            with open(o_join(tra_path, sag_id + '.gmm_recruits.tsv'),
                      'w') as tra_out:
                tra_out.write('\n'.join(['\t'.join(x) for x in gmm_pass_list]))
            with open(o_join(tra_path, sag_id + '.svm_recruits.tsv'),
                      'w') as tra_out:
                tra_out.write('\n'.join(['\t'.join(x) for x in svm_pass_list]))
            with open(o_join(tra_path, sag_id + '.iso_recruits.tsv'),
                      'w') as tra_out:
                tra_out.write('\n'.join(['\t'.join(x) for x in iso_pass_list]))
            with open(o_join(tra_path, sag_id + '.comb_recruits.tsv'),
                      'w') as tra_out:
                tra_out.write('\n'.join(['\t'.join(x)
                                         for x in comb_pass_list]))

        gmm_total_pass_list.extend(gmm_pass_list)
        svm_total_pass_list.extend(svm_pass_list)
        iso_total_pass_list.extend(iso_pass_list)
        comb_total_pass_list.extend(comb_pass_list)

    gmm_df = pd.DataFrame(gmm_total_pass_list,
                          columns=['sag_id', 'subcontig_id', 'contig_id'])
    svm_df = pd.DataFrame(svm_total_pass_list,
                          columns=['sag_id', 'subcontig_id', 'contig_id'])
    iso_df = pd.DataFrame(iso_total_pass_list,
                          columns=['sag_id', 'subcontig_id', 'contig_id'])
    comb_df = pd.DataFrame(comb_total_pass_list,
                           columns=['sag_id', 'subcontig_id', 'contig_id'])

    tetra_df_dict = {
        'gmm': gmm_df,
        'svm': svm_df,
        'iso': iso_df,
        'comb': comb_df
    }
    #tetra_df_dict = {'gmm':gmm_df, 'svm':svm_df, 'comb':comb_df}

    for tetra_id in tetra_df_dict:
        tetra_df = tetra_df_dict[tetra_id]
        #mg_id, mg_headers, mg_subs = mg_subcontigs

        # Count # of subcontigs recruited to each SAG
        tetra_cnt_df = tetra_df.groupby(
            ['sag_id', 'contig_id']).count().reset_index()
        tetra_cnt_df.columns = ['sag_id', 'contig_id', 'subcontig_recruits']
        # Build subcontig count for each MG contig
        mg_contig_list = [x.rsplit('_', 1)[0] for x in mg_headers]
        mg_tot_df = pd.DataFrame(zip(mg_contig_list, mg_headers),
                                 columns=['contig_id', 'subcontig_id'])
        mg_tot_cnt_df = mg_tot_df.groupby(['contig_id']).count().reset_index()
        mg_tot_cnt_df.columns = ['contig_id', 'subcontig_total']
        mg_recruit_df = tetra_cnt_df.merge(mg_tot_cnt_df,
                                           how='left',
                                           on='contig_id')
        mg_recruit_df['percent_recruited'] = mg_recruit_df['subcontig_recruits'] / \
                                             mg_recruit_df['subcontig_total']
        mg_recruit_df.sort_values(by='percent_recruited',
                                  ascending=False,
                                  inplace=True)
        # Only pass contigs that have the majority of subcontigs recruited (>= gmm_per_pass)
        mg_recruit_filter_df = mg_recruit_df.loc[
            mg_recruit_df['percent_recruited'] >= float(gmm_per_pass)]
        mg_contig_per_max_df = mg_recruit_filter_df.groupby(
            ['contig_id'])['percent_recruited'].max().reset_index()
        mg_contig_per_max_df.columns = ['contig_id', 'percent_max']
        mg_recruit_max_df = mg_recruit_filter_df.merge(mg_contig_per_max_df,
                                                       how='left',
                                                       on='contig_id')
        # Now pass contigs that have the maximum recruit % of subcontigs
        mg_max_only_df = mg_recruit_max_df.loc[
            mg_recruit_max_df['percent_recruited'] >=
            mg_recruit_max_df['percent_max']]
        mg_max_only_df.to_csv(o_join(
            tra_path, mg_id + '.' + tetra_id + '.tra_trimmed_recruits.tsv'),
                              sep='\t',
                              index=False)

        tetra_df_dict[tetra_id] = mg_max_only_df

    return tetra_df_dict
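
A hedged wiring sketch for run_tetra_recruiter; the directory and file names and the 0.51 majority threshold are illustrative. rpkm_max_df is assumed to be the trimmed abundance-recruit table written by run_abund_recruiter (Example #5), reloaded here from its TSV output.

import pandas as pd

# Hypothetical inputs; tra_path must already exist.
sag_sub_files = [('SAG042', 'subcontigs/SAG042.subcontigs.fasta')]
mg_sub_file = ('metaG', 'subcontigs/metaG.subcontigs.fasta')
rpkm_max_df = pd.read_csv('abund_recruits/metaG.abr_trimmed_recruits.tsv',
                          sep='\t', header=0)
tetra_df_dict = run_tetra_recruiter('tetra_recruits', sag_sub_files,
                                    mg_sub_file, rpkm_max_df,
                                    gmm_per_pass=0.51)
# One trimmed-recruit DataFrame per model: 'gmm', 'svm', 'iso', 'comb'.
print(tetra_df_dict['comb'].head())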
Example #3
def run_combine_recruits(final_path, ext_path, asm_path, check_path, mg_file,
                         tetra_df_dict, minhash_df,
                         sag_list):  # TODO: use logging instead of print

    mg_contigs = tuple([(r.name, r.seq)  # .itervalues() was a Python 2 remnant
                        for r in s_utils.get_seqs(mg_file)])

    for tetra_id in tetra_df_dict:
        tetra_df = tetra_df_dict[tetra_id]
        # TODO: Use full contigs instead of subcontigs for co-asm, reduces asm time for Minimus2? CISA?
        # TODO: check for co-asm files before running
        # build SAG id to SAG path dict
        sag2path_dict = {}
        for sag_path in sag_list:
            base = basename(sag_path)
            sag_id = base.rsplit('.', 1)[0]
            sag2path_dict[sag_id] = sag_path

        # Merge MinHash and GMM Tetra (passed first by ABR)
        mh_gmm_merge_df = minhash_df[['sag_id', 'contig_id']].merge(
            tetra_df[['sag_id', 'contig_id']],
            how='outer',
            on=['sag_id', 'contig_id']).drop_duplicates()

        mh_gmm_merge_df.to_csv(o_join(final_path,
                                      tetra_id + '.final_recruits.tsv'),
                               sep='\t',
                               index=True)
        mg_contigs_df = pd.DataFrame(mg_contigs, columns=['contig_id', 'seq'])
        sag_de_df_list = []
        for sag_id in set(mh_gmm_merge_df['sag_id']):
            final_rec = o_join(
                final_path, sag_id + '.' + tetra_id + '.final_recruits.fasta')

            sub_merge_df = mh_gmm_merge_df.loc[mh_gmm_merge_df['sag_id'] ==
                                               sag_id]
            print('[SABer]: Recruited %s contigs from entire analysis for %s' %
                  (sub_merge_df.shape[0], sag_id))
            with open(
                    o_join(final_path,
                           sag_id + '.' + tetra_id + '.final_recruits.fasta'),
                    'w') as final_out:
                mg_sub_filter_df = mg_contigs_df.loc[
                    mg_contigs_df['contig_id'].isin(sub_merge_df['contig_id'])]
                final_mgsubs_list = [
                    '\n'.join(['>' + x[0], x[1]]) for x in zip(
                        mg_sub_filter_df['contig_id'], mg_sub_filter_df['seq'])
                ]
                final_out.write('\n'.join(final_mgsubs_list))
            '''
            # Combine SAG and final recruits # TODO: is this actually needed if MinHash is so good? I think not :)
            ext_SAG = o_join(ext_path, sag_id + '.extend_SAG.fasta')
            with open(ext_SAG, 'w') as cat_file:
                data = []
                with open(sag_file, 'r') as sag_in:
                    data.extend(sag_in.readlines())
                with open(o_join(final_path, sag_id + '.' + tetra_id + '.final_recruits.fasta'), 'r') as \
                        recruits_in:
                    data.extend(recruits_in.readlines())
                join_data = '\n'.join(data).replace('\n\n', '\n')
                cat_file.write(join_data)
            '''

            # Use BBTools dedupe.sh to deduplicate the extended SAG file
            dedupe_SAG = o_join(
                ext_path, sag_id + '.' + tetra_id + '.extended_SAG.fasta')
            dedupe_cmd = [
                'dedupe.sh', 'in=' + final_rec, 'out=' + dedupe_SAG,
                'threads=8', 'minidentity=97', 'overwrite=true'
            ]
            run_dedupe = Popen(dedupe_cmd, stdout=PIPE)
            print(run_dedupe.communicate()[0].decode())
            de_header_list = []
            with open(dedupe_SAG, 'r') as de_file:
                data = de_file.readlines()
                for line in data:
                    if '>' in line:
                        de_header_list.append(line.strip('>').strip('\n'))
            de_sag_df = pd.DataFrame(de_header_list, columns=['contig_id'])
            de_sag_df['sag_id'] = sag_id
            de_sag_df['tetra_id'] = tetra_id
            sag_de_df_list.append(de_sag_df)
        sag_de_df = pd.concat(sag_de_df_list)
        sag_de_df.to_csv(o_join(ext_path, tetra_id + '.extended_SAGs.tsv'),
                         sep='\t',
                         index=True)
        '''
            # Use minimus2 to merge the SAG and the recruits into one assembly
            toAmos_cmd = ['/home/rmclaughlin/bin/amos-3.1.0/bin/toAmos', '-s',
                            ext_SAG, '-o', o_join(asm_path, sag_id + '.afg')
                            ]
            run_toAmos = Popen(toAmos_cmd, stdout=PIPE)
            print(run_toAmos.communicate()[0].decode())
            minimus_cmd = ['/home/rmclaughlin/bin/amos-3.1.0/bin/minimus2',
                            o_join(asm_path, sag_id),
                            '-D', 'REFCOUNT=0', '-D', 'OVERLAP=200', '-D', 'MINID=95'
                            ]
            run_minimus = Popen(minimus_cmd, stdout=PIPE)
            print(run_minimus.communicate()[0].decode())
            if isfile(o_join(asm_path, sag_id + '.fasta')):
                filenames = [o_join(asm_path, sag_id + '.fasta'), o_join(asm_path, sag_id + '.singletons.seq')]
                with open(o_join(asm_path, sag_id + '.minimus2.asm.fasta'), 'w') as outfile:
                    for fname in filenames:
                        with open(fname) as infile:
                            for line in infile:
                                outfile.write(line)
                move_cmd = ['mv', o_join(asm_path, sag_id + '.fasta'),
                            o_join(asm_path, sag_id + '.minimus2_no_singles.asm.fasta')
                            ]

            run_move = Popen(move_cmd, stdout=PIPE)
            clean_cmd = ['rm', '-r', o_join(asm_path, sag_id + '.runAmos.log'),
                            o_join(asm_path, sag_id + '.afg'),
                            o_join(asm_path, sag_id + '.OVL'),
                            o_join(asm_path, sag_id + '.singletons'),
                            o_join(asm_path, sag_id + '.singletons.seq'),
                            o_join(asm_path, sag_id + '.contig'),
                            o_join(asm_path, sag_id + '.ovl'),
                            o_join(asm_path, sag_id + '.coords'),
                            o_join(asm_path, sag_id + '.qry.seq'),
                            o_join(asm_path, sag_id + '.delta'),
                            o_join(asm_path, sag_id + '.bnk'),
                            o_join(asm_path, sag_id + '.ref.seq')
                            ]
            run_clean = Popen(clean_cmd, stdout=PIPE)
        '''

    # Run CheckM on all new rebuilt/updated SAGs
    print('[SABer]: Checking all new SAG quality using CheckM')
    checkm_cmd = [
        'checkm', 'lineage_wf', '--tab_table', '-x', 'fasta', '--threads', '8',
        '--pplacer_threads', '8', '-f',
        o_join(check_path, 'checkM_stdout.tsv'), ext_path, check_path
    ]
    run_checkm = Popen(checkm_cmd, stdout=PIPE)
    print(run_checkm.communicate()[0].decode())
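
A usage sketch for run_combine_recruits, assuming dedupe.sh (BBTools) and checkm are on PATH and that the four output directories already exist; every path is illustrative. The inputs are rebuilt from the TSV outputs of Examples #2 and #4 so the sketch stands alone.

import pandas as pd

minhash_df = pd.read_csv('minhash_recruits/metaG.mhr_trimmed_recruits.tsv',
                         sep='\t', header=0)
tetra_df_dict = {
    model: pd.read_csv('tetra_recruits/metaG.' + model +
                       '.tra_trimmed_recruits.tsv', sep='\t', header=0)
    for model in ('gmm', 'svm', 'iso', 'comb')
}
# sag_list holds the original SAG FASTA paths.
run_combine_recruits('final_recruits', 'extended_SAGs', 'assemblies',
                     'checkM', 'metaG.fasta', tetra_df_dict, minhash_df,
                     ['SAGs/SAG042.fasta'])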
Example #4
def run_minhash_recruiter(sig_path, mhr_path, sag_sub_files, mg_sub_file,
                          jacc_threshold, mh_per_pass, nthreads):
    logging.info('[SABer]: MinHash Recruitment Algorithm\n')
    # Calculate/Load MinHash Signatures with SourMash for MG subseqs
    mg_id = mg_sub_file[0]
    logging.info('[SABer]: Loading subcontigs for %s\n' % mg_id)
    mg_subcontigs = s_utils.get_seqs(mg_sub_file[1])
    mg_headers = tuple(mg_subcontigs.keys())
    #mg_id, mg_headers, mg_subs = mg_subcontigs
    # TODO: MG should only be loaded if required
    if isfile(o_join(sig_path, mg_id + '.metaG.sig')):
        logging.info('[SABer]: Loading %s Signatures\n' % mg_id)
        mg_sig_list = tuple(
            sourmash.signature.load_signatures(
                o_join(sig_path, mg_id + '.metaG.sig')))
    else:
        logging.info('[SABer]: Building Signatures for %s\n' % mg_id)
        pool = multiprocessing.Pool(processes=nthreads)  # TODO: convert to Ray
        arg_list = []
        for i, mg_head in enumerate(mg_subcontigs):
            logging.info('\r[SABer]: Building multi-pool: {0:.0%} done'.format(
                i / len(mg_subcontigs)))
            arg_list.append([mg_head, str(mg_subcontigs[mg_head].seq)])
        logging.info('\n')
        results = pool.imap_unordered(build_signature, arg_list)
        logging.info('\r[SABer]: Executing multi-pool:')
        mg_sig_list = []
        for i, mg_sig in enumerate(results):
            logging.info(
                '\r[SABer]: Executing multi-pool: {0:.0%} done'.format(
                    i / len(arg_list)))
            mg_sig_list.append(mg_sig)
        logging.info('\n')
        pool.close()
        pool.join()

        with open(o_join(sig_path, mg_id + '.metaG.sig'), 'w') as mg_out:
            sourmash.signature.save_signatures(mg_sig_list, fp=mg_out)
        mg_sig_list = tuple(mg_sig_list)

    # Load comparisons OR Compare SAG sigs to MG sigs to find containment
    logging.info('[SABer]: Comparing Signatures of SAGs to MetaG contigs\n')
    loaded = 0
    built = 0
    build_list = []
    minhash_pass_list = []
    for i, sag_rec in enumerate(sag_sub_files):
        sag_id, sag_file = sag_rec
        if isfile(o_join(mhr_path, sag_id + '.mhr_recruits.tsv')):
            #logging.info('[SABer]: Loading %s and MetaG signature recruit list\n' % sag_id)
            with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'),
                      'r') as mhr_in:
                pass_list = tuple(
                    [x.rstrip('\n').split('\t') for x in mhr_in.readlines()])
                minhash_pass_list.extend(pass_list)
            loaded += 1
        else:
            build_list.append(sag_rec)
            built += 1
        logging.info(
            '\r[SABer]: Loading/Comparing SAG and MetaG signatures: {}/{} done'
            .format(loaded, built))
    logging.info('\n')
    if built != 0:
        logging.info(
            '[SABer]: Building/Comparing {} SAG signatures\n'.format(built))
        ray.init(num_cpus=nthreads)
        r_mg_sig_list = ray.put(mg_sig_list)
        r_mhr_path = ray.put(mhr_path)
        r_jacc_threshold = ray.put(jacc_threshold)
        futures = []
        for i, sag_rec in enumerate(build_list):
            sag_id, sag_file = sag_rec
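            # compare_sigs (Example #1) is assumed to be declared as a @ray.remote task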
            futures.append(
                compare_sigs.remote(sag_id, sag_file, r_mhr_path, sig_path,
                                    r_mg_sig_list, r_jacc_threshold))
            logging.info('\r[SABer]: Comparison {0:.0%} complete'.format(
                i / len(build_list)))
        logging.info('\n')

        ray_results = [v for r_list in ray.get(futures) for v in r_list]
        minhash_pass_list.extend(ray_results)

    minhash_df = pd.DataFrame(minhash_pass_list,
                              columns=['sag_id', 'subcontig_id', 'contig_id'])
    logging.info('[SABer]: Compiling all MinHash Recruits\n')
    # Count # of subcontigs recruited to each SAG via MinHash
    mh_cnt_df = minhash_df.groupby(['sag_id',
                                    'contig_id']).count().reset_index()
    mh_cnt_df.columns = ['sag_id', 'contig_id', 'subcontig_recruits']
    # Build subcontig count for each MG contig
    mg_contig_list = [x.rsplit('_', 1)[0] for x in mg_headers]
    mg_tot_df = pd.DataFrame(zip(mg_contig_list, mg_headers),
                             columns=['contig_id', 'subcontig_id'])
    mg_tot_cnt_df = mg_tot_df.groupby(['contig_id']).count().reset_index()
    mg_tot_cnt_df.columns = ['contig_id', 'subcontig_total']
    mh_recruit_df = mh_cnt_df.merge(mg_tot_cnt_df, how='left', on='contig_id')
    mh_recruit_df['percent_recruited'] = mh_recruit_df['subcontig_recruits'] / \
                                           mh_recruit_df['subcontig_total']
    mh_recruit_df.sort_values(by='percent_recruited',
                              ascending=False,
                              inplace=True)
    # Only pass contigs that have the majority of subcontigs recruited (>= mh_per_pass)
    mh_recruit_filter_df = mh_recruit_df.loc[
        mh_recruit_df['percent_recruited'] >= float(mh_per_pass)]
    mg_contig_per_max_df = mh_recruit_filter_df.groupby(
        ['contig_id'])['percent_recruited'].max().reset_index()
    mg_contig_per_max_df.columns = ['contig_id', 'percent_max']
    mh_recruit_max_df = mh_recruit_filter_df.merge(mg_contig_per_max_df,
                                                   how='left',
                                                   on='contig_id')
    # Now pass contigs that have the maximum recruit % of subcontigs
    mh_max_only_df = mh_recruit_max_df.loc[
        mh_recruit_max_df['percent_recruited'] >=
        mh_recruit_max_df['percent_max']]
    mh_max_df = minhash_df[minhash_df['contig_id'].isin(
        list(mh_max_only_df['contig_id']))]

    mh_max_df.to_csv(o_join(mhr_path, mg_id + '.mhr_trimmed_recruits.tsv'),
                     sep='\t',
                     index=False)
    logging.info('[SABer]: MinHash Recruitment Algorithm Complete\n')

    return mh_max_df
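
A sketch of how the MinHash recruiter might be invoked; all IDs, paths, and thresholds are illustrative. jacc_threshold is passed through to compare_sigs (Example #1), and mh_per_pass is the fraction of a contig's subcontigs that must be recruited before the whole contig passes.

# Hypothetical inputs; sig_path and mhr_path must already exist.
sag_sub_files = [('SAG042', 'subcontigs/SAG042.subcontigs.fasta')]
mg_sub_file = ('metaG', 'subcontigs/metaG.subcontigs.fasta')
minhash_df = run_minhash_recruiter('signatures', 'minhash_recruits',
                                   sag_sub_files, mg_sub_file,
                                   jacc_threshold=0.95, mh_per_pass=0.51,
                                   nthreads=8)
print(minhash_df.head())  # columns: sag_id, subcontig_id, contig_id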
Example #5
def run_abund_recruiter(subcontig_path, abr_path, mg_sub_file,
                        mg_raw_file_list, minhash_df, ss_per_pass, nthreads):

    mg_id = mg_sub_file[0]
    mg_subcontigs = s_utils.get_seqs(mg_sub_file[1])
    mg_headers = tuple(mg_subcontigs.keys())

    #mg_id, mg_headers = mg_subcontigs[0], mg_subcontigs[1]

    logging.info('[SABer]: Starting Abundance Recruitment Algorithm\n')
    logging.info('[SABer]: Checking for abundance table for %s\n' % mg_id)
    if isfile(o_join(abr_path, mg_id + '.samsum_merged.tsv')):
        logging.info('[SABer]: Loading %s abundance table\n' % mg_id)
        mg_ss_df = pd.read_csv(o_join(abr_path, mg_id + '.samsum_merged.tsv'),
                               sep='\t',
                               header=0)
    else:
        logging.info('[SABer]: Building %s abundance table\n' % mg_id)
        mg_sub_path = o_join(subcontig_path, mg_id + '.subcontigs.fasta')
        # is it indexed?
        index_ext_list = ['amb', 'ann', 'bwt', 'pac', 'sa']
        check_ind_list = ['.'.join([mg_sub_path, x]) for x in index_ext_list]
        if not all(isfile(f) for f in check_ind_list):
            # Use BWA to build an index for metagenome assembly
            logging.info('[SABer]: Creating index with BWA\n')
            bwa_cmd = ['bwa', 'index', '-b', '500000000', mg_sub_path
                       ]  #TODO: how to get install path for executables?
            with open(o_join(abr_path, mg_id + '.stdout.txt'),
                      'w') as stdout_file:
                with open(o_join(abr_path, mg_id + '.stderr.txt'),
                          'w') as stderr_file:
                    run_bwa = Popen(bwa_cmd,
                                    stdout=stdout_file,
                                    stderr=stderr_file)
                    run_bwa.communicate()

        # Process raw metagenomes to calculate abundances
        with open(mg_raw_file_list, 'r') as raw_fa_in:
            raw_data = raw_fa_in.readlines()
        ss_output_list = []
        for line in raw_data:
            split_line = line.strip('\n').split('\t')
            if len(split_line) == 2:
                logging.info('[SABer]: Raw reads in FWD and REV file...\n')
                pe1 = split_line[0]
                pe2 = split_line[1]
                mem_cmd = [
                    'bwa', 'mem', '-t',
                    str(nthreads),
                    o_join(subcontig_path, mg_id + '.subcontigs.fasta'), pe1,
                    pe2
                ]  # note: bwa mem ignores the mates file when -p is set, so -p is dropped here
            else:  # if the fastq is interleaved
                logging.info('[SABer]: Raw reads in interleaved file...\n')
                pe1 = split_line[0]
                mem_cmd = [
                    'bwa', 'mem', '-t',
                    str(nthreads), '-p',
                    o_join(subcontig_path, mg_id + '.subcontigs.fasta'), pe1
                ]  #TODO: how to get install path for executables?
            pe_basename = basename(pe1)
            pe_id = pe_basename.split('.')[0]
            # BWA sam file exists?
            mg_sam_out = o_join(abr_path, pe_id + '.sam')
            if not isfile(mg_sam_out):
                logging.info('[SABer]: Running BWA mem on %s\n' % pe_id)
                with open(mg_sam_out, 'w') as sam_file:
                    with open(o_join(abr_path, pe_id + '.stderr.txt'),
                              'w') as stderr_file:
                        run_mem = Popen(mem_cmd,
                                        stdout=sam_file,
                                        stderr=stderr_file)
                        run_mem.communicate()

            logging.info('[SABer]: Calculating TPM with samsum for %s\n' %
                         pe_id)
            mg_input = o_join(subcontig_path, mg_id + '.subcontigs.fasta')
            sam_input = o_join(abr_path, pe_id + '.sam')
            # samsum API
            ref_seq_abunds = commands.ref_sequence_abundances(
                aln_file=sam_input, seq_file=mg_input, multireads=True)
            ss_output_list.append(ref_seq_abunds)

        logging.info('[SABer]: Merging results for all samsum output\n')
        # Merge API output for each raw sample file
        refseq_header_list = ss_output_list[0].keys()
        refseq_merge_list = []
        for refseq_header in refseq_header_list:
            for i, refseq_dict in enumerate(ss_output_list):
                refseq_obj = refseq_dict[refseq_header]
                rso_name = refseq_obj.name
                rso_length = refseq_obj.length
                rso_reads_mapped = refseq_obj.reads_mapped
                rso_weight_total = refseq_obj.weight_total
                rso_fpkm = refseq_obj.fpkm
                rso_tpm = refseq_obj.tpm
                rso_sample_index = i
                refseq_merge_list.append([
                    rso_name, rso_sample_index, rso_length, rso_reads_mapped,
                    rso_weight_total, rso_fpkm, rso_tpm
                ])
        mg_ss_df = pd.DataFrame(refseq_merge_list,
                                columns=[
                                    'subcontig_id', 'sample_index', 'length',
                                    'reads_mapped', 'weight_total', 'fpkm',
                                    'tpm'
                                ])
        mg_ss_df.to_csv(o_join(abr_path, mg_id + '.samsum_merged.tsv'),
                        sep='\t',
                        index=False)

    # extract TPM and pivot for MG
    mg_ss_trim_df = mg_ss_df[['subcontig_id', 'sample_index',
                              'tpm']].dropna(how='any')
    mg_ss_piv_df = pd.pivot_table(mg_ss_trim_df,
                                  values='tpm',
                                  index='subcontig_id',
                                  columns='sample_index')
    normed_ss_df = pd.DataFrame(normalize(mg_ss_piv_df.values),
                                columns=mg_ss_piv_df.columns,
                                index=mg_ss_piv_df.index)
    normed_ss_df.to_csv(o_join(abr_path, mg_id + '.samsum_normed.tsv'),
                        sep='\t')
    # get MinHash "passed" mg ss
    ss_pass_list = []
    for sag_id in set(minhash_df['sag_id']):
        logging.info('[SABer]: Calculating/Loading abundance stats for %s\n' %
                     sag_id)
        if isfile(o_join(abr_path, sag_id + '.abr_recruits.tsv')):
            with open(o_join(abr_path, sag_id + '.abr_recruits.tsv'),
                      'r') as abr_in:
                pass_list = tuple(
                    [x.rstrip('\n').split('\t') for x in abr_in.readlines()])
        else:
            sag_mh_pass_df = minhash_df[minhash_df['sag_id'] == sag_id]
            mh_cntg_pass_list = set(sag_mh_pass_df['subcontig_id'])
            mg_ss_pass_df = mg_ss_piv_df[mg_ss_piv_df.index.isin(
                mh_cntg_pass_list)]
            mg_ss_test_df = mg_ss_piv_df[~mg_ss_piv_df.index.
                                         isin(mh_cntg_pass_list)]

            mg_ss_pass_stat_df = mg_ss_pass_df.mean().reset_index()
            mg_ss_pass_stat_df.columns = ['sample_id', 'mean']
            mg_ss_pass_stat_df['std'] = tuple(mg_ss_pass_df.std())
            mg_ss_pass_stat_df['var'] = tuple(mg_ss_pass_df.var())
            mg_ss_pass_stat_df['skew'] = tuple(mg_ss_pass_df.skew())
            mg_ss_pass_stat_df['kurt'] = tuple(mg_ss_pass_df.kurt())
            mg_ss_pass_stat_df['IQ_25'] = tuple(mg_ss_pass_df.quantile(0.25))
            mg_ss_pass_stat_df['IQ_75'] = tuple(mg_ss_pass_df.quantile(0.75))
            mg_ss_pass_stat_df['IQ_10'] = tuple(mg_ss_pass_df.quantile(0.10))
            mg_ss_pass_stat_df['IQ_90'] = tuple(mg_ss_pass_df.quantile(0.90))
            mg_ss_pass_stat_df['IQ_05'] = tuple(mg_ss_pass_df.quantile(0.05))
            mg_ss_pass_stat_df['IQ_95'] = tuple(mg_ss_pass_df.quantile(0.95))
            mg_ss_pass_stat_df['IQ_01'] = tuple(mg_ss_pass_df.quantile(0.01))
            mg_ss_pass_stat_df['IQ_99'] = tuple(mg_ss_pass_df.quantile(0.99))
            mg_ss_pass_stat_df['IQR'] = mg_ss_pass_stat_df['IQ_75'] - \
                                          mg_ss_pass_stat_df['IQ_25']
            # calc Tukey Fences
            mg_ss_pass_stat_df['upper_bound'] = mg_ss_pass_stat_df['IQ_75'] + \
                                                  (1.5 * mg_ss_pass_stat_df['IQR'])
            mg_ss_pass_stat_df['lower_bound'] = mg_ss_pass_stat_df['IQ_25'] - \
                                                  (1.5 * mg_ss_pass_stat_df['IQR'])

            mg_ss_pass_stat_df.to_csv(o_join(abr_path,
                                             sag_id + '.passed_ss_stats.tsv'),
                                      sep='\t')

            # Use passed MG from MHR to recruit more seqs
            iqr_pass_df = mg_ss_test_df.copy()
            for i, col_nm in enumerate(mg_ss_test_df.columns):
                pass_stats = mg_ss_pass_stat_df.iloc[[i]]
                pass_max = pass_stats['upper_bound'].values[0]
                pass_min = pass_stats['lower_bound'].values[0]
                iqr_pass_df = iqr_pass_df.loc[
                    (iqr_pass_df[col_nm] >= pass_min)
                    & (iqr_pass_df[col_nm] <= pass_max)]

            pass_list = []
            join_ss_recruits = set(
                tuple(iqr_pass_df.index) + tuple(mh_cntg_pass_list))
            for md_nm in join_ss_recruits:
                pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])
            with open(o_join(abr_path, sag_id + '.abr_recruits.tsv'),
                      'w') as abr_out:
                abr_out.write('\n'.join(['\t'.join(x) for x in pass_list]))
        logging.info('[SABer]: Recruited %s subcontigs to %s\n' %
                     (len(pass_list), sag_id))
        ss_pass_list.extend(tuple(pass_list))

    ss_df = pd.DataFrame(ss_pass_list,
                         columns=['sag_id', 'subcontig_id', 'contig_id'])
    # Count # of subcontigs recruited to each SAG via samsum
    ss_cnt_df = ss_df.groupby(['sag_id', 'contig_id']).count().reset_index()
    ss_cnt_df.columns = ['sag_id', 'contig_id', 'subcontig_recruits']
    # Build subcontig count for each MG contig
    mg_contig_list = [x.rsplit('_', 1)[0] for x in mg_headers]
    mg_tot_df = pd.DataFrame(zip(mg_contig_list, mg_headers),
                             columns=['contig_id', 'subcontig_id'])
    mg_tot_cnt_df = mg_tot_df.groupby(['contig_id']).count().reset_index()
    mg_tot_cnt_df.columns = ['contig_id', 'subcontig_total']
    ss_recruit_df = ss_cnt_df.merge(mg_tot_cnt_df, how='left', on='contig_id')
    ss_recruit_df['percent_recruited'] = ss_recruit_df['subcontig_recruits'] / \
                                           ss_recruit_df['subcontig_total']
    ss_recruit_df.sort_values(by='percent_recruited',
                              ascending=False,
                              inplace=True)
    # Only pass contigs that have the majority of subcontigs recruited (>= ss_per_pass)
    ss_recruit_filter_df = ss_recruit_df.loc[
        ss_recruit_df['percent_recruited'] >= float(ss_per_pass)]
    mg_contig_per_max_df = ss_recruit_filter_df.groupby(
        ['contig_id'])['percent_recruited'].max().reset_index()
    mg_contig_per_max_df.columns = ['contig_id', 'percent_max']
    ss_recruit_max_df = ss_recruit_filter_df.merge(mg_contig_per_max_df,
                                                   how='left',
                                                   on='contig_id')
    # Now pass contigs that have the maximum recruit % of subcontigs
    ss_max_only_df = ss_recruit_max_df.loc[
        ss_recruit_max_df['percent_recruited'] >=
        ss_recruit_max_df['percent_max']]
    ss_max_df = ss_df[ss_df['contig_id'].isin(
        tuple(ss_max_only_df['contig_id']))]

    ss_max_df.to_csv(o_join(abr_path, mg_id + '.abr_trimmed_recruits.tsv'),
                     sep='\t',
                     index=False)

    return ss_max_df
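
Taken together, the signatures in these examples suggest a pipeline order, sketched below with illustrative paths: MinHash recruitment (Example #4) feeds the abundance recruiter, whose trimmed recruits are assumed to serve as the rpkm_max_df input of the tetranucleotide recruiter (Example #2), and run_combine_recruits (Example #3) merges and quality-checks the results. bwa must be on PATH, and 'raw_reads.tsv' stands in for a tab-separated list of raw read files, one sample per line.

# Hypothetical end-to-end wiring; every path and threshold is illustrative.
sag_sub_files = [('SAG042', 'subcontigs/SAG042.subcontigs.fasta')]
mg_sub_file = ('metaG', 'subcontigs/metaG.subcontigs.fasta')
minhash_df = run_minhash_recruiter('signatures', 'minhash_recruits',
                                   sag_sub_files, mg_sub_file,
                                   jacc_threshold=0.95, mh_per_pass=0.51,
                                   nthreads=8)
ss_max_df = run_abund_recruiter('subcontigs', 'abund_recruits', mg_sub_file,
                                'raw_reads.tsv', minhash_df,
                                ss_per_pass=0.51, nthreads=8)
tetra_df_dict = run_tetra_recruiter('tetra_recruits', sag_sub_files,
                                    mg_sub_file, ss_max_df,
                                    gmm_per_pass=0.51)
run_combine_recruits('final_recruits', 'extended_SAGs', 'assemblies',
                     'checkM', 'metaG.fasta', tetra_df_dict, minhash_df,
                     ['SAGs/SAG042.fasta'])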