# Imports inferred from the code in this module; the import paths for the
# project helper module (s_utils) and the samsum API are assumptions.
import logging
import multiprocessing
from os.path import basename, isfile
from os.path import join as o_join
from subprocess import PIPE, Popen

import numpy as np
import pandas as pd
import ray
import sourmash
from samsum import commands  # samsum API (import path assumed)
from sklearn import svm
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture as GMM  # assumed target of the GMM alias used below
from sklearn.preprocessing import normalize

import saber.utilities as s_utils  # project-internal helpers (import path assumed)


@ray.remote  # required: invoked as compare_sigs.remote() in run_minhash_recruiter()
def compare_sigs(sag_id, sag_file, mhr_path, sig_path, mg_sig_list, jacc_threshold):
    sag_subcontigs = s_utils.get_seqs(sag_file)
    if isfile(o_join(mhr_path, sag_id + '.mhr_recruits.tsv')):
        logging.info('[SABer]: Loading %s and MetaG signature recruit list\n' % sag_id)
        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'), 'r') as mhr_in:
            pass_list = [x.rstrip('\n').split('\t') for x in mhr_in.readlines()]
    else:
        # Calculate/Load MinHash Signatures with SourMash for SAG subseqs
        if isfile(o_join(sig_path, sag_id + '.SAG.sig')):
            logging.info('[SABer]: Loading Signature for %s\n' % sag_id)
            sag_sig = sourmash.signature.load_one_signature(
                o_join(sig_path, sag_id + '.SAG.sig'))
        else:
            logging.info('[SABer]: Building Signature for %s\n' % sag_id)
            sag_minhash = sourmash.MinHash(n=0, ksize=51, scaled=100)
            for sg_head in sag_subcontigs:
                sag_subseq = str(sag_subcontigs[sg_head].seq)
                sag_minhash.add_sequence(sag_subseq, force=True)
            sag_sig = sourmash.SourmashSignature(sag_minhash, name=sag_id)
            with open(o_join(sig_path, sag_id + '.SAG.sig'), 'w') as sags_out:
                sourmash.signature.save_signatures([sag_sig], fp=sags_out)
        logging.info('[SABer]: Comparing %s and MetaG signature\n' % sag_id)
        pass_list = []
        for mg_sig in mg_sig_list:
            jacc_sim = mg_sig.similarity(sag_sig)
            mg_nm = mg_sig.name()
            if jacc_sim >= jacc_threshold:
                pass_list.append([sag_id, mg_nm, mg_nm.rsplit('_', 1)[0]])
        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'), 'w') as mhr_out:
            mhr_out.write('\n'.join(['\t'.join(x) for x in pass_list]))
    pass_list = tuple(pass_list)

    return pass_list
def run_tetra_recruiter(tra_path, sag_sub_files, mg_sub_file, rpkm_max_df, gmm_per_pass):
    # TODO: 1. Think about using Minimum Description Length (MDL) instead of AIC/BIC
    #       2. [Normalized Maximum Likelihood or Fisher Information Approximation]
    #       3. Can TetraNuc Hz be calc'ed for each sample? Does that improve things?
    #          (think about http://merenlab.org/2020/01/02/visualizing-metagenomic-bins/#introduction)
    mg_id = mg_sub_file[0]
    mg_subcontigs = s_utils.get_seqs(mg_sub_file[1])
    mg_headers = tuple(mg_subcontigs.keys())
    mg_subs = tuple([r.seq for r in mg_subcontigs])
    #mg_id, mg_headers, mg_subs = mg_subcontigs

    # Build/Load tetramers for SAGs and MG subset by abundance (ABR) recruits
    if isfile(o_join(tra_path, mg_id + '.tetras.tsv')):
        logging.info('[SABer]: Loading tetramer Hz matrix for %s\n' % mg_id)
        mg_tetra_df = pd.read_csv(o_join(tra_path, mg_id + '.tetras.tsv'),
                                  sep='\t', index_col=0, header=0)
    else:
        logging.info('[SABer]: Calculating tetramer Hz matrix for %s\n' % mg_id)
        mg_tetra_df = pd.DataFrame.from_dict(s_utils.tetra_cnt(mg_subs))
        mg_tetra_df['contig_id'] = mg_headers
        mg_tetra_df.set_index('contig_id', inplace=True)
        mg_tetra_df.to_csv(o_join(tra_path, mg_id + '.tetras.tsv'), sep='\t')

    gmm_total_pass_list = []
    svm_total_pass_list = []
    iso_total_pass_list = []
    comb_total_pass_list = []
    #for sag_id, sag_headers, sag_subs in sag_subcontigs:
    for i, sag_rec in enumerate(sag_sub_files):
        sag_id, sag_file = sag_rec
        sag_subcontigs = s_utils.get_seqs(sag_file)
        sag_headers = tuple(sag_subcontigs.keys())
        # iterate records directly, matching the MG pattern above (.itervalues() is not Python 3)
        sag_subs = tuple([r.seq for r in sag_subcontigs])
        if (isfile(o_join(tra_path, sag_id + '.gmm_recruits.tsv')) and
                isfile(o_join(tra_path, sag_id + '.svm_recruits.tsv')) and
                isfile(o_join(tra_path, sag_id + '.iso_recruits.tsv')) and
                isfile(o_join(tra_path, sag_id + '.comb_recruits.tsv'))):
            logging.info('[SABer]: Loading %s tetramer Hz recruit list\n' % sag_id)
            with open(o_join(tra_path, sag_id + '.gmm_recruits.tsv'), 'r') as tra_in:
                gmm_pass_list = [x.rstrip('\n').split('\t') for x in tra_in.readlines()]
            with open(o_join(tra_path, sag_id + '.svm_recruits.tsv'), 'r') as tra_in:
                svm_pass_list = [x.rstrip('\n').split('\t') for x in tra_in.readlines()]
            with open(o_join(tra_path, sag_id + '.iso_recruits.tsv'), 'r') as tra_in:
                iso_pass_list = [x.rstrip('\n').split('\t') for x in tra_in.readlines()]
            with open(o_join(tra_path, sag_id + '.comb_recruits.tsv'), 'r') as tra_in:
                comb_pass_list = [x.rstrip('\n').split('\t') for x in tra_in.readlines()]
        else:
            if isfile(o_join(tra_path, sag_id + '.tetras.tsv')):
                logging.info('[SABer]: Loading tetramer Hz matrix for %s\n' % sag_id)
                sag_tetra_df = pd.read_csv(o_join(tra_path, sag_id + '.tetras.tsv'),
                                           sep='\t', index_col=0, header=0)
            else:
                logging.info('[SABer]: Calculating tetramer Hz matrix for %s\n' % sag_id)
                sag_tetra_df = pd.DataFrame.from_dict(s_utils.tetra_cnt(sag_subs))
                sag_tetra_df['contig_id'] = sag_headers
                sag_tetra_df.set_index('contig_id', inplace=True)
                sag_tetra_df.to_csv(o_join(tra_path, sag_id + '.tetras.tsv'), sep='\t')

            # Concat SAGs and MG for GMM
            mg_rpkm_contig_list = list(rpkm_max_df.loc[
                rpkm_max_df['sag_id'] == sag_id]['subcontig_id'].values)
            mg_tetra_filter_df = mg_tetra_df.loc[mg_tetra_df.index.isin(mg_rpkm_contig_list)]
            #concat_tetra_df = pd.concat([sag_tetra_df, mg_tetra_filter_df])
            '''
            normed_tetra_df = concat_tetra_df
            sag_normed_tetra_df = normed_tetra_df[
                normed_tetra_df.index.isin(sag_tetra_df.index)
                ]
            mg_normed_tetra_df = normed_tetra_df.loc[
                normed_tetra_df.index.isin(mg_tetra_filter_df.index)
                ]
            # UMAP for Dimension reduction of tetras
            sag_features = sag_normed_tetra_df.values
            sag_targets = sag_normed_tetra_df.index.values
            mg_features = mg_normed_tetra_df.values
            mg_targets = mg_normed_tetra_df.index.values
            normed_features = normed_tetra_df.values
            normed_targets = normed_tetra_df.index.values
            logging.info('[SABer]: Dimension reduction of tetras with UMAP\n')
            umap_trans = umap.UMAP(n_neighbors=2, min_dist=0.0,
                                   n_components=num_components, metric='manhattan',
                                   random_state=42
                                   ).fit_transform(normed_features)
            pc_col_names = ['pc' + str(x) for x in range(1, num_components + 1)]
            umap_df = pd.DataFrame(umap_trans, columns=pc_col_names, index=normed_targets)
            sag_umap_df = umap_df.loc[umap_df.index.isin(sag_tetra_df.index)]
            mg_umap_df = umap_df.loc[umap_df.index.isin(mg_tetra_filter_df.index)]
            sag_tetra_df = concat_tetra_df.loc[
                concat_tetra_df.index.isin(sag_tetra_df.index)
                ]
            mg_tetra_df = concat_tetra_df.loc[
                concat_tetra_df.index.isin(mg_tetra_filter_df.index)
                ]
            '''
            logging.info('[SABer]: Calculating AIC/BIC for GMM components\n')
            sag_train_vals = [1 for x in sag_tetra_df.index]
            n_components = np.arange(1, 5, 1)
            models = [GMM(n, random_state=42) for n in n_components]
            bics = []
            aics = []
            for i, model in enumerate(models):
                n_comp = n_components[i]
                try:
                    bic = model.fit(sag_tetra_df.values,
                                    sag_train_vals).bic(sag_tetra_df.values)
                    bics.append(bic)
                except Exception:
                    logging.info('[WARNING]: BIC failed with %s components\n' % n_comp)
                try:
                    aic = model.fit(sag_tetra_df.values,
                                    sag_train_vals).aic(sag_tetra_df.values)
                    aics.append(aic)
                except Exception:
                    logging.info('[WARNING]: AIC failed with %s components\n' % n_comp)
            min_bic_comp = n_components[bics.index(min(bics))]
            min_aic_comp = n_components[aics.index(min(aics))]
            logging.info('[SABer]: Min AIC/BIC at %s/%s, respectively\n' %
                         (min_aic_comp, min_bic_comp))
            logging.info('[SABer]: Using BIC as guide for GMM components\n')
            logging.info('[SABer]: Training GMM on SAG tetras\n')
            gmm = GMM(n_components=min_bic_comp, random_state=42).fit(sag_tetra_df.values)
            logging.info('[SABer]: GMM Converged: %s\n' % gmm.converged_)
            try:
                # TODO: add predict and predict_proba to this and output all to table
                sag_scores = gmm.score_samples(sag_tetra_df.values)
                sag_scores_df = pd.DataFrame(data=sag_scores, index=sag_tetra_df.index.values)
                sag_scores_df.columns = ['wLogProb']
                sag_score_min = min(sag_scores_df.values)[0]
                sag_score_max = max(sag_scores_df.values)[0]
                mg_scores = gmm.score_samples(mg_tetra_filter_df.values)
                mg_scores_df = pd.DataFrame(data=mg_scores,
                                            index=mg_tetra_filter_df.index.values)
                mg_scores_df.columns = ['wLogProb']
                gmm_pass_df = mg_scores_df.loc[
                    (mg_scores_df['wLogProb'] >= sag_score_min) &
                    (mg_scores_df['wLogProb'] <= sag_score_max)
                    ]
                # And it has to be from the RPKM pass list
                gmm_pass_df = gmm_pass_df.loc[gmm_pass_df.index.isin(mg_rpkm_contig_list)]
                gmm_pass_list = []
                for md_nm in gmm_pass_df.index.values:
                    gmm_pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])
            except Exception:
                logging.info('[SABer]: Warning: No recruits found...\n')
                gmm_pass_list = []

            logging.info('[SABer]: Training OCSVM on SAG tetras\n')
            # fit OCSVM
            clf = svm.OneClassSVM()
            clf.fit(sag_tetra_df.values)
            sag_pred = clf.predict(sag_tetra_df.values)
            #sag_pred_df = pd.DataFrame(data=sag_pred, index=sag_tetra_df.index.values)
            mg_pred = clf.predict(mg_tetra_filter_df.values)
            mg_pred_df = pd.DataFrame(data=mg_pred, index=mg_tetra_filter_df.index.values)
            svm_pass_df = mg_pred_df.loc[mg_pred_df[0] != -1]
            # And it has to be from the RPKM pass list
            svm_pass_df = svm_pass_df.loc[svm_pass_df.index.isin(mg_rpkm_contig_list)]
            svm_pass_list = []
            for md_nm in svm_pass_df.index.values:
                svm_pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])

            logging.info('[SABer]: Training Isolation Forest on SAG tetras\n')
            # fit IsoForest
            clf = IsolationForest(random_state=42)
            clf.fit(sag_tetra_df.values)
            #clf.set_params(n_estimators=20)  # add 10 more trees
            #clf.fit(sag_tetra_df.values)  # fit the added trees
            mg_pred = clf.predict(mg_tetra_filter_df.values)
            mg_pred_df = pd.DataFrame(data=mg_pred, index=mg_tetra_filter_df.index.values)
            iso_pass_df = mg_pred_df.loc[mg_pred_df[0] != -1]
            # And it has to be from the RPKM pass list
            iso_pass_df = iso_pass_df.loc[iso_pass_df.index.isin(mg_rpkm_contig_list)]
            iso_pass_list = []
            for md_nm in iso_pass_df.index.values:
                iso_pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])

            gmm_id_list = [x[1] for x in gmm_pass_list]
            svm_id_list = [x[1] for x in svm_pass_list]
            iso_id_list = [x[1] for x in iso_pass_list]
            comb_set_list = list(set(gmm_id_list) & set(svm_id_list) & set(iso_id_list))
            #comb_set_list = list(set(gmm_id_list) & set(svm_id_list))
            comb_pass_list = []
            for md_nm in comb_set_list:
                comb_pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])

            logging.info('[SABer]: Recruited %s subcontigs to %s with GMM\n' %
                         (len(gmm_pass_list), sag_id))
            logging.info('[SABer]: Recruited %s subcontigs to %s with SVM\n' %
                         (len(svm_pass_list), sag_id))
            logging.info('[SABer]: Recruited %s subcontigs to %s with Isolation Forest\n' %
                         (len(iso_pass_list), sag_id))
            logging.info('[SABer]: Recruited %s subcontigs to %s with combined methods\n' %
                         (len(comb_pass_list), sag_id))

            with open(o_join(tra_path, sag_id + '.gmm_recruits.tsv'), 'w') as tra_out:
                tra_out.write('\n'.join(['\t'.join(x) for x in gmm_pass_list]))
            with open(o_join(tra_path, sag_id + '.svm_recruits.tsv'), 'w') as tra_out:
                tra_out.write('\n'.join(['\t'.join(x) for x in svm_pass_list]))
            with open(o_join(tra_path, sag_id + '.iso_recruits.tsv'), 'w') as tra_out:
                tra_out.write('\n'.join(['\t'.join(x) for x in iso_pass_list]))
            with open(o_join(tra_path, sag_id + '.comb_recruits.tsv'), 'w') as tra_out:
                tra_out.write('\n'.join(['\t'.join(x) for x in comb_pass_list]))

        gmm_total_pass_list.extend(gmm_pass_list)
        svm_total_pass_list.extend(svm_pass_list)
        iso_total_pass_list.extend(iso_pass_list)
        comb_total_pass_list.extend(comb_pass_list)

    gmm_df = pd.DataFrame(gmm_total_pass_list,
                          columns=['sag_id', 'subcontig_id', 'contig_id'])
    svm_df = pd.DataFrame(svm_total_pass_list,
                          columns=['sag_id', 'subcontig_id', 'contig_id'])
    iso_df = pd.DataFrame(iso_total_pass_list,
                          columns=['sag_id', 'subcontig_id', 'contig_id'])
    comb_df = pd.DataFrame(comb_total_pass_list,
                           columns=['sag_id', 'subcontig_id', 'contig_id'])
    tetra_df_dict = {'gmm': gmm_df, 'svm': svm_df, 'iso': iso_df, 'comb': comb_df}
    #tetra_df_dict = {'gmm': gmm_df, 'svm': svm_df, 'comb': comb_df}

    for tetra_id in tetra_df_dict:
        tetra_df = tetra_df_dict[tetra_id]
        #mg_id, mg_headers, mg_subs = mg_subcontigs
        # Count # of subcontigs recruited to each SAG
        gmm_cnt_df = tetra_df.groupby(['sag_id', 'contig_id']).count().reset_index()
        gmm_cnt_df.columns = ['sag_id', 'contig_id', 'subcontig_recruits']
        # Build subcontig count for each MG contig
        mg_contig_list = [x.rsplit('_', 1)[0] for x in mg_headers]
        mg_tot_df = pd.DataFrame(zip(mg_contig_list, mg_headers),
                                 columns=['contig_id', 'subcontig_id'])
        mg_tot_cnt_df = mg_tot_df.groupby(['contig_id']).count().reset_index()
        mg_tot_cnt_df.columns = ['contig_id', 'subcontig_total']
        mg_recruit_df = gmm_cnt_df.merge(mg_tot_cnt_df, how='left', on='contig_id')
        mg_recruit_df['percent_recruited'] = mg_recruit_df['subcontig_recruits'] / \
                                             mg_recruit_df['subcontig_total']
        mg_recruit_df.sort_values(by='percent_recruited', ascending=False, inplace=True)
        # Only pass contigs that have the majority of subcontigs recruited (>= N%)
        mg_recruit_filter_df = mg_recruit_df.loc[
            mg_recruit_df['percent_recruited'] >= float(gmm_per_pass)]
        mg_contig_per_max_df = mg_recruit_filter_df.groupby(
            ['contig_id'])['percent_recruited'].max().reset_index()
        mg_contig_per_max_df.columns = ['contig_id', 'percent_max']
        mg_recruit_max_df = mg_recruit_filter_df.merge(mg_contig_per_max_df,
                                                       how='left', on='contig_id')
        # Now pass contigs that have the maximum recruit % of subcontigs
        mg_max_only_df = mg_recruit_max_df.loc[
            mg_recruit_max_df['percent_recruited'] >= mg_recruit_max_df['percent_max']]
        mg_max_only_df.to_csv(o_join(tra_path,
                                     mg_id + '.' + tetra_id + '.tra_trimmed_recruits.tsv'),
                              sep='\t', index=False)
        tetra_df_dict[tetra_id] = mg_max_only_df

    return tetra_df_dict
def run_combine_recruits(final_path, ext_path, asm_path, check_path, mg_file,
                         tetra_df_dict, minhash_df, sag_list):
    # TODO: use logging instead of print
    # iterate records directly (.itervalues() is not available in Python 3)
    mg_contigs = tuple([(r.name, r.seq) for r in s_utils.get_seqs(mg_file)])
    for tetra_id in tetra_df_dict:
        tetra_df = tetra_df_dict[tetra_id]
        # TODO: Use full contigs instead of subcontigs for co-asm, reduces asm time for Minimus2? CISA?
        # TODO: check for co-asm files before running
        # build SAG id to SAG path dict
        sag2path_dict = {}
        for sag_path in sag_list:
            base = basename(sag_path)
            sag_id = base.rsplit('.', 1)[0]
            sag2path_dict[sag_id] = sag_path
        # Merge MinHash and GMM Tetra (passed first by ABR)
        mh_gmm_merge_df = minhash_df[['sag_id', 'contig_id']].merge(
            tetra_df[['sag_id', 'contig_id']], how='outer',
            on=['sag_id', 'contig_id']).drop_duplicates()
        mh_gmm_merge_df.to_csv(o_join(final_path, tetra_id + '.final_recruits.tsv'),
                               sep='\t', index=True)
        mg_contigs_df = pd.DataFrame(mg_contigs, columns=['contig_id', 'seq'])
        sag_de_df_list = []
        for sag_id in set(mh_gmm_merge_df['sag_id']):
            final_rec = o_join(final_path, sag_id + '.' + tetra_id + '.final_recruits.fasta')
            sub_merge_df = mh_gmm_merge_df.loc[mh_gmm_merge_df['sag_id'] == sag_id]
            print('[SABer]: Recruited %s contigs from entire analysis for %s' %
                  (sub_merge_df.shape[0], sag_id))
            with open(final_rec, 'w') as final_out:
                mg_sub_filter_df = mg_contigs_df.loc[
                    mg_contigs_df['contig_id'].isin(sub_merge_df['contig_id'])]
                final_mgsubs_list = ['\n'.join(['>' + x[0], x[1]]) for x in
                                     zip(mg_sub_filter_df['contig_id'],
                                         mg_sub_filter_df['seq'])
                                     ]
                final_out.write('\n'.join(final_mgsubs_list))
            '''
            # Combine SAG and final recruits
            # TODO: is this actually needed if MinHash is so good? I think not :)
            ext_SAG = o_join(ext_path, sag_id + '.extend_SAG.fasta')
            with open(ext_SAG, 'w') as cat_file:
                data = []
                with open(sag_file, 'r') as sag_in:
                    data.extend(sag_in.readlines())
                with open(o_join(final_path, sag_id + '.' + tetra_id + '.final_recruits.fasta'), 'r') as \
                        recruits_in:
                    data.extend(recruits_in.readlines())
                join_data = '\n'.join(data).replace('\n\n', '\n')
                cat_file.write(join_data)
            '''
            # Use BBTools dedupe.sh to deduplicate the extended SAG file
            dedupe_SAG = o_join(ext_path, sag_id + '.' + tetra_id + '.extended_SAG.fasta')
            dedupe_cmd = ['dedupe.sh', 'in=' + final_rec, 'out=' + dedupe_SAG,
                          'threads=8', 'minidentity=97', 'overwrite=true'
                          ]
            run_dedupe = Popen(dedupe_cmd, stdout=PIPE)
            print(run_dedupe.communicate()[0].decode())
            de_header_list = []
            with open(dedupe_SAG, 'r') as de_file:
                data = de_file.readlines()
                for line in data:
                    if '>' in line:
                        de_header_list.append(line.strip('>').strip('\n'))
            de_sag_df = pd.DataFrame(de_header_list, columns=['contig_id'])
            de_sag_df['sag_id'] = sag_id
            de_sag_df['tetra_id'] = tetra_id
            sag_de_df_list.append(de_sag_df)
        sag_de_df = pd.concat(sag_de_df_list)
        sag_de_df.to_csv(o_join(ext_path, tetra_id + '.extended_SAGs.tsv'),
                         sep='\t', index=True)
        '''
        # Use minimus2 to merge the SAG and the recruits into one assembly
        toAmos_cmd = ['/home/rmclaughlin/bin/amos-3.1.0/bin/toAmos', '-s', ext_SAG,
                      '-o', o_join(asm_path, sag_id + '.afg')
                      ]
        run_toAmos = Popen(toAmos_cmd, stdout=PIPE)
        print(run_toAmos.communicate()[0].decode())
        minimus_cmd = ['/home/rmclaughlin/bin/amos-3.1.0/bin/minimus2',
                       o_join(asm_path, sag_id),
                       '-D', 'REFCOUNT=0', '-D', 'OVERLAP=200', '-D', 'MINID=95'
                       ]
        run_minimus = Popen(minimus_cmd, stdout=PIPE)
        print(run_minimus.communicate()[0].decode())
        if isfile(o_join(asm_path, sag_id + '.fasta')):
            filenames = [o_join(asm_path, sag_id + '.fasta'),
                         o_join(asm_path, sag_id + '.singletons.seq')]
            with open(o_join(asm_path, sag_id + '.minimus2.asm.fasta'), 'w') as outfile:
                for fname in filenames:
                    with open(fname) as infile:
                        for line in infile:
                            outfile.write(line)
            move_cmd = ['mv', o_join(asm_path, sag_id + '.fasta'),
                        o_join(asm_path, sag_id + '.minimus2_no_singles.asm.fasta')
                        ]
            run_move = Popen(move_cmd, stdout=PIPE)
        clean_cmd = ['rm', '-r',
                     o_join(asm_path, sag_id + '.runAmos.log'),
                     o_join(asm_path, sag_id + '.afg'),
                     o_join(asm_path, sag_id + '.OVL'),
                     o_join(asm_path, sag_id + '.singletons'),
                     o_join(asm_path, sag_id + '.singletons.seq'),
                     o_join(asm_path, sag_id + '.contig'),
                     o_join(asm_path, sag_id + '.ovl'),
                     o_join(asm_path, sag_id + '.coords'),
                     o_join(asm_path, sag_id + '.qry.seq'),
                     o_join(asm_path, sag_id + '.delta'),
                     o_join(asm_path, sag_id + '.bnk'),
                     o_join(asm_path, sag_id + '.ref.seq')
                     ]
        run_clean = Popen(clean_cmd, stdout=PIPE)
        '''
    # Run CheckM on all new rebuilt/updated SAGs
    print('[SABer]: Checking all new SAG quality using CheckM')
    checkm_cmd = ['checkm', 'lineage_wf', '--tab_table', '-x', 'fasta',
                  '--threads', '8', '--pplacer_threads', '8',
                  '-f', o_join(check_path, 'checkM_stdout.tsv'),
                  ext_path, check_path
                  ]
    run_checkm = Popen(checkm_cmd, stdout=PIPE)
    print(run_checkm.communicate()[0].decode())
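

# NOTE: build_signature() is mapped over the metagenome subcontigs by
# run_minhash_recruiter() below but is not defined in this section. The
# following is a minimal sketch, assuming each work item is a
# [header, sequence] pair and that the k-mer/scaled settings mirror those
# used for the SAG signatures in compare_sigs().
def build_signature(arg_list):
    mg_head, mg_seq = arg_list
    mg_minhash = sourmash.MinHash(n=0, ksize=51, scaled=100)
    mg_minhash.add_sequence(str(mg_seq), force=True)
    mg_sig = sourmash.SourmashSignature(mg_minhash, name=mg_head)

    return mg_sig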
def run_minhash_recruiter(sig_path, mhr_path, sag_sub_files, mg_sub_file,
                          jacc_threshold, mh_per_pass, nthreads):
    logging.info('[SABer]: MinHash Recruitment Algorithm\n')
    # Calculate/Load MinHash Signatures with SourMash for MG subseqs
    mg_id = mg_sub_file[0]
    logging.info('[SABer]: Loading subcontigs for %s\n' % mg_id)
    mg_subcontigs = s_utils.get_seqs(mg_sub_file[1])
    mg_headers = tuple(mg_subcontigs.keys())
    #mg_id, mg_headers, mg_subs = mg_subcontigs
    if isfile(o_join(sig_path, mg_id + '.metaG.sig')):
        # TODO: MG should only be loaded if required
        logging.info('[SABer]: Loading %s Signatures\n' % mg_id)
        mg_sig_list = tuple(sourmash.signature.load_signatures(
            o_join(sig_path, mg_id + '.metaG.sig')))
    else:
        logging.info('[SABer]: Building Signatures for %s\n' % mg_id)
        pool = multiprocessing.Pool(processes=nthreads)  # TODO: convert to Ray
        arg_list = []
        for i, mg_head in enumerate(mg_subcontigs):
            logging.info('\r[SABer]: Building multi-pool: {0:.0%} done'.format(
                i / len(mg_subcontigs)))
            arg_list.append([mg_head, str(mg_subcontigs[mg_head].seq)])
        logging.info('\n')
        results = pool.imap_unordered(build_signature, arg_list)
        logging.info('\r[SABer]: Executing multi-pool:')
        mg_sig_list = []
        for i, mg_sig in enumerate(results):
            logging.info('\r[SABer]: Executing multi-pool: {0:.0%} done'.format(
                i / len(arg_list)))
            mg_sig_list.append(mg_sig)
        logging.info('\n')
        pool.close()
        pool.join()
        with open(o_join(sig_path, mg_id + '.metaG.sig'), 'w') as mg_out:
            sourmash.signature.save_signatures(mg_sig_list, fp=mg_out)
        mg_sig_list = tuple(mg_sig_list)

    # Load comparisons OR compare SAG sigs to MG sigs to find containment
    logging.info('[SABer]: Comparing Signatures of SAGs to MetaG contigs\n')
    l = 0
    b = 0
    build_list = []
    minhash_pass_list = []
    for i, sag_rec in enumerate(sag_sub_files):
        sag_id, sag_file = sag_rec
        if isfile(o_join(mhr_path, sag_id + '.mhr_recruits.tsv')):
            #logging.info('[SABer]: Loading %s and MetaG signature recruit list\n' % sag_id)
            with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'), 'r') as mhr_in:
                pass_list = tuple([x.rstrip('\n').split('\t') for x in mhr_in.readlines()])
            minhash_pass_list.extend(pass_list)
            l += 1
        else:
            build_list.append(sag_rec)
            b += 1
        logging.info('\r[SABer]: Loading/Comparing SAG and MetaG signatures: '
                     '{}/{} done'.format(l, b))
    logging.info('\n')
    if b != 0:
        logging.info('[SABer]: Building/Comparing {} SAG signatures\n'.format(b))
        ray.init(num_cpus=nthreads)
        r_mg_sig_list = ray.put(mg_sig_list)
        r_mhr_path = ray.put(mhr_path)
        r_jacc_threshold = ray.put(jacc_threshold)
        futures = []
        for i, sag_rec in enumerate(build_list):
            sag_id, sag_file = sag_rec
            futures.append(compare_sigs.remote(sag_id, sag_file, r_mhr_path, sig_path,
                                               r_mg_sig_list, r_jacc_threshold))
            logging.info('\r[SABer]: Comparison {0:.0%} complete'.format(
                i / len(build_list)))
        logging.info('\n')
        ray_results = [v for r_list in ray.get(futures) for v in r_list]
        minhash_pass_list.extend(ray_results)

    minhash_df = pd.DataFrame(minhash_pass_list,
                              columns=['sag_id', 'subcontig_id', 'contig_id'])
    logging.info('[SABer]: Compiling all MinHash Recruits\n')
    # Count # of subcontigs recruited to each SAG
    mh_cnt_df = minhash_df.groupby(['sag_id', 'contig_id']).count().reset_index()
    mh_cnt_df.columns = ['sag_id', 'contig_id', 'subcontig_recruits']
    # Build subcontig count for each MG contig
    mg_contig_list = [x.rsplit('_', 1)[0] for x in mg_headers]
    mg_tot_df = pd.DataFrame(zip(mg_contig_list, mg_headers),
                             columns=['contig_id', 'subcontig_id'])
    mg_tot_cnt_df = mg_tot_df.groupby(['contig_id']).count().reset_index()
    mg_tot_cnt_df.columns = ['contig_id', 'subcontig_total']
    mh_recruit_df = mh_cnt_df.merge(mg_tot_cnt_df, how='left', on='contig_id')
    mh_recruit_df['percent_recruited'] = mh_recruit_df['subcontig_recruits'] / \
                                         mh_recruit_df['subcontig_total']
    mh_recruit_df.sort_values(by='percent_recruited', ascending=False, inplace=True)
    # Only pass contigs that have the majority of subcontigs recruited (>= 51%)
    mh_recruit_filter_df = mh_recruit_df.loc[
        mh_recruit_df['percent_recruited'] >= float(mh_per_pass)]
    mg_contig_per_max_df = mh_recruit_filter_df.groupby(
        ['contig_id'])['percent_recruited'].max().reset_index()
    mg_contig_per_max_df.columns = ['contig_id', 'percent_max']
    mh_recruit_max_df = mh_recruit_filter_df.merge(mg_contig_per_max_df,
                                                   how='left', on='contig_id')
    # Now pass contigs that have the maximum recruit % of subcontigs
    mh_max_only_df = mh_recruit_max_df.loc[
        mh_recruit_max_df['percent_recruited'] >= mh_recruit_max_df['percent_max']]
    mh_max_df = minhash_df[minhash_df['contig_id'].isin(
        list(mh_max_only_df['contig_id']))]
    mh_max_df.to_csv(o_join(mhr_path, mg_id + '.mhr_trimmed_recruits.tsv'),
                     sep='\t', index=False)
    logging.info('[SABer]: MinHash Recruitment Algorithm Complete\n')

    return mh_max_df
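

# The raw-read manifest parsed by run_abund_recruiter() below is a plain-text
# file with one sample per line: either a tab-separated FWD/REV pair of fastq
# paths or a single interleaved fastq path. A hedged illustration (file names
# are placeholders, not from the original source):
#
#   sample1_R1.fastq<TAB>sample1_R2.fastq
#   sample2_interleaved.fastq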
def run_abund_recruiter(subcontig_path, abr_path, mg_sub_file, mg_raw_file_list,
                        minhash_df, ss_per_pass, nthreads):
    mg_id = mg_sub_file[0]
    mg_subcontigs = s_utils.get_seqs(mg_sub_file[1])
    mg_headers = tuple(mg_subcontigs.keys())
    #mg_id, mg_headers = mg_subcontigs[0], mg_subcontigs[1]

    logging.info('[SABer]: Starting Abundance Recruitment Algorithm\n')
    logging.info('[SABer]: Checking for abundance table for %s\n' % mg_id)
    if isfile(o_join(abr_path, mg_id + '.samsum_merged.tsv')):
        logging.info('[SABer]: Loading %s abundance table\n' % mg_id)
        mg_ss_df = pd.read_csv(o_join(abr_path, mg_id + '.samsum_merged.tsv'),
                               sep='\t', header=0)
    else:
        logging.info('[SABer]: Building %s abundance table\n' % mg_id)
        mg_sub_path = o_join(subcontig_path, mg_id + '.subcontigs.fasta')
        # Is it already indexed?
        index_ext_list = ['amb', 'ann', 'bwt', 'pac', 'sa']
        check_ind_list = ['.'.join([mg_sub_path, x]) for x in index_ext_list]
        if False in (isfile(f) for f in check_ind_list):
            # Use BWA to build an index for the metagenome assembly
            logging.info('[SABer]: Creating index with BWA\n')
            bwa_cmd = ['bwa', 'index', '-b', '500000000', mg_sub_path
                       ]  # TODO: how to get install path for executables?
            with open(o_join(abr_path, mg_id + '.stdout.txt'), 'w') as stdout_file:
                with open(o_join(abr_path, mg_id + '.stderr.txt'), 'w') as stderr_file:
                    run_bwa = Popen(bwa_cmd, stdout=stdout_file, stderr=stderr_file)
                    run_bwa.communicate()

        # Process raw metagenomes to calculate abundances
        with open(mg_raw_file_list, 'r') as raw_fa_in:
            raw_data = raw_fa_in.readlines()
        ss_output_list = []
        for line in raw_data:
            split_line = line.strip('\n').split('\t')
            if len(split_line) == 2:
                logging.info('[SABer]: Raw reads in FWD and REV file...\n')
                pe1 = split_line[0]
                pe2 = split_line[1]
                mem_cmd = ['bwa', 'mem', '-t', str(nthreads), '-p',
                           o_join(subcontig_path, mg_id + '.subcontigs.fasta'), pe1, pe2
                           ]  # TODO: add support for specifying number of threads
            else:  # if the fastq is interleaved
                logging.info('[SABer]: Raw reads in interleaved file...\n')
                pe1 = split_line[0]
                mem_cmd = ['bwa', 'mem', '-t', str(nthreads), '-p',
                           o_join(subcontig_path, mg_id + '.subcontigs.fasta'), pe1
                           ]  # TODO: how to get install path for executables?
            pe_basename = basename(pe1)
            pe_id = pe_basename.split('.')[0]
            # Does the BWA sam file exist already?
            mg_sam_out = o_join(abr_path, pe_id + '.sam')
            if not isfile(mg_sam_out):
                logging.info('[SABer]: Running BWA mem on %s\n' % pe_id)
                with open(mg_sam_out, 'w') as sam_file:
                    with open(o_join(abr_path, pe_id + '.stderr.txt'), 'w') as stderr_file:
                        run_mem = Popen(mem_cmd, stdout=sam_file, stderr=stderr_file)
                        run_mem.communicate()

            logging.info('[SABer]: Calculating TPM with samsum for %s\n' % pe_id)
            mg_input = o_join(subcontig_path, mg_id + '.subcontigs.fasta')
            sam_input = o_join(abr_path, pe_id + '.sam')
            # samsum API
            ref_seq_abunds = commands.ref_sequence_abundances(aln_file=sam_input,
                                                              seq_file=mg_input,
                                                              multireads=True)
            ss_output_list.append(ref_seq_abunds)

        logging.info('[SABer]: Merging results for all samsum output\n')
        # Merge API output for each raw sample file
        refseq_header_list = ss_output_list[0].keys()
        refseq_merge_list = []
        for refseq_header in refseq_header_list:
            for i, refseq_dict in enumerate(ss_output_list):
                refseq_obj = refseq_dict[refseq_header]
                rso_name = refseq_obj.name
                rso_length = refseq_obj.length
                rso_reads_mapped = refseq_obj.reads_mapped
                rso_weight_total = refseq_obj.weight_total
                rso_fpkm = refseq_obj.fpkm
                rso_tpm = refseq_obj.tpm
                rso_sample_index = i
                refseq_merge_list.append([rso_name, rso_sample_index, rso_length,
                                          rso_reads_mapped, rso_weight_total,
                                          rso_fpkm, rso_tpm
                                          ])
        mg_ss_df = pd.DataFrame(refseq_merge_list,
                                columns=['subcontig_id', 'sample_index', 'length',
                                         'reads_mapped', 'weight_total', 'fpkm', 'tpm'
                                         ])
        mg_ss_df.to_csv(o_join(abr_path, mg_id + '.samsum_merged.tsv'),
                        sep='\t', index=False)

    # extract TPM and pivot for MG
    mg_ss_trim_df = mg_ss_df[['subcontig_id', 'sample_index', 'tpm']].dropna(how='any')
    mg_ss_piv_df = pd.pivot_table(mg_ss_trim_df, values='tpm',
                                  index='subcontig_id', columns='sample_index')
    normed_ss_df = pd.DataFrame(normalize(mg_ss_piv_df.values),
                                columns=mg_ss_piv_df.columns,
                                index=mg_ss_piv_df.index)
    normed_ss_df.to_csv(o_join(abr_path, mg_id + '.samsum_normmed.tsv'), sep='\t')

    # get MinHash "passed" MG subcontig abundances
    ss_pass_list = []
    for sag_id in set(minhash_df['sag_id']):
        logging.info('[SABer]: Calculating/Loading abundance stats for %s\n' % sag_id)
        if isfile(o_join(abr_path, sag_id + '.abr_recruits.tsv')):
            with open(o_join(abr_path, sag_id + '.abr_recruits.tsv'), 'r') as abr_in:
                pass_list = tuple([x.rstrip('\n').split('\t') for x in abr_in.readlines()])
        else:
            sag_mh_pass_df = minhash_df[minhash_df['sag_id'] == sag_id]
            mh_cntg_pass_list = set(sag_mh_pass_df['subcontig_id'])
            mg_ss_pass_df = mg_ss_piv_df[mg_ss_piv_df.index.isin(mh_cntg_pass_list)]
            mg_ss_test_df = mg_ss_piv_df[~mg_ss_piv_df.index.isin(mh_cntg_pass_list)]

            mg_ss_pass_stat_df = mg_ss_pass_df.mean().reset_index()
            mg_ss_pass_stat_df.columns = ['sample_id', 'mean']
            mg_ss_pass_stat_df['std'] = tuple(mg_ss_pass_df.std())
            mg_ss_pass_stat_df['var'] = tuple(mg_ss_pass_df.var())
            mg_ss_pass_stat_df['skew'] = tuple(mg_ss_pass_df.skew())
            mg_ss_pass_stat_df['kurt'] = tuple(mg_ss_pass_df.kurt())
            mg_ss_pass_stat_df['IQ_25'] = tuple(mg_ss_pass_df.quantile(0.25))
            mg_ss_pass_stat_df['IQ_75'] = tuple(mg_ss_pass_df.quantile(0.75))
            mg_ss_pass_stat_df['IQ_10'] = tuple(mg_ss_pass_df.quantile(0.10))
            mg_ss_pass_stat_df['IQ_90'] = tuple(mg_ss_pass_df.quantile(0.90))
            mg_ss_pass_stat_df['IQ_05'] = tuple(mg_ss_pass_df.quantile(0.05))
            mg_ss_pass_stat_df['IQ_95'] = tuple(mg_ss_pass_df.quantile(0.95))
            mg_ss_pass_stat_df['IQ_01'] = tuple(mg_ss_pass_df.quantile(0.01))
            mg_ss_pass_stat_df['IQ_99'] = tuple(mg_ss_pass_df.quantile(0.99))
            mg_ss_pass_stat_df['IQR'] = mg_ss_pass_stat_df['IQ_75'] - \
                                        mg_ss_pass_stat_df['IQ_25']
            # calc Tukey Fences
            mg_ss_pass_stat_df['upper_bound'] = mg_ss_pass_stat_df['IQ_75'] + \
                                                (1.5 * mg_ss_pass_stat_df['IQR'])
            mg_ss_pass_stat_df['lower_bound'] = mg_ss_pass_stat_df['IQ_25'] - \
                                                (1.5 * mg_ss_pass_stat_df['IQR'])
            mg_ss_pass_stat_df.to_csv(o_join(abr_path, sag_id + '.passed_ss_stats.tsv'),
                                      sep='\t')

            # Use passed MG from MHR to recruit more seqs
            iqr_pass_df = mg_ss_test_df.copy()
            for i, col_nm in enumerate(mg_ss_test_df.columns):
                pass_stats = mg_ss_pass_stat_df.iloc[[i]]
                pass_max = pass_stats['upper_bound'].values[0]
                pass_min = pass_stats['lower_bound'].values[0]
                iqr_pass_df = iqr_pass_df.loc[(iqr_pass_df[col_nm] >= pass_min) &
                                              (iqr_pass_df[col_nm] <= pass_max)]
            pass_list = []
            join_ss_recruits = set(tuple(iqr_pass_df.index) + tuple(mh_cntg_pass_list))
            for md_nm in join_ss_recruits:
                pass_list.append([sag_id, md_nm, md_nm.rsplit('_', 1)[0]])
            with open(o_join(abr_path, sag_id + '.abr_recruits.tsv'), 'w') as abr_out:
                abr_out.write('\n'.join(['\t'.join(x) for x in pass_list]))
        logging.info('[SABer]: Recruited %s subcontigs to %s\n' % (len(pass_list), sag_id))
        ss_pass_list.extend(tuple(pass_list))

    ss_df = pd.DataFrame(ss_pass_list, columns=['sag_id', 'subcontig_id', 'contig_id'])
    # Count # of subcontigs recruited to each SAG via samsum
    ss_cnt_df = ss_df.groupby(['sag_id', 'contig_id']).count().reset_index()
    ss_cnt_df.columns = ['sag_id', 'contig_id', 'subcontig_recruits']
    # Build subcontig count for each MG contig
    mg_contig_list = [x.rsplit('_', 1)[0] for x in mg_headers]
    mg_tot_df = pd.DataFrame(zip(mg_contig_list, mg_headers),
                             columns=['contig_id', 'subcontig_id'])
    mg_tot_cnt_df = mg_tot_df.groupby(['contig_id']).count().reset_index()
    mg_tot_cnt_df.columns = ['contig_id', 'subcontig_total']
    ss_recruit_df = ss_cnt_df.merge(mg_tot_cnt_df, how='left', on='contig_id')
    ss_recruit_df['percent_recruited'] = ss_recruit_df['subcontig_recruits'] / \
                                         ss_recruit_df['subcontig_total']
    ss_recruit_df.sort_values(by='percent_recruited', ascending=False, inplace=True)
    # Only pass contigs that have the majority of subcontigs recruited (>= 51%)
    ss_recruit_filter_df = ss_recruit_df.loc[
        ss_recruit_df['percent_recruited'] >= float(ss_per_pass)]
    mg_contig_per_max_df = ss_recruit_filter_df.groupby(
        ['contig_id'])['percent_recruited'].max().reset_index()
    mg_contig_per_max_df.columns = ['contig_id', 'percent_max']
    ss_recruit_max_df = ss_recruit_filter_df.merge(mg_contig_per_max_df,
                                                   how='left', on='contig_id')
    # Now pass contigs that have the maximum recruit % of subcontigs
    ss_max_only_df = ss_recruit_max_df.loc[
        ss_recruit_max_df['percent_recruited'] >= ss_recruit_max_df['percent_max']]
    ss_max_df = ss_df[ss_df['contig_id'].isin(tuple(ss_max_only_df['contig_id']))]
    ss_max_df.to_csv(o_join(abr_path, mg_id + '.abr_trimmed_recruits.tsv'),
                     sep='\t', index=False)

    return ss_max_df
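

# Example driver: a hedged sketch (not part of the original module) of the call
# order implied by the function signatures above, MinHash -> abundance ->
# tetranucleotide -> combine, with each stage's output feeding the next. All
# paths, file names, thresholds, and thread counts here are placeholder
# assumptions for illustration only.
if __name__ == '__main__':
    mg_sub = ('demo_MG', 'subcontigs/demo_MG.subcontigs.fasta')        # (mg_id, subcontig fasta)
    sag_subs = [('demo_SAG', 'subcontigs/demo_SAG.subcontigs.fasta')]  # [(sag_id, subcontig fasta)]
    mh_df = run_minhash_recruiter('sigs', 'minhash', sag_subs, mg_sub,
                                  jacc_threshold=0.95, mh_per_pass=0.51, nthreads=4)
    ss_df = run_abund_recruiter('subcontigs', 'abund', mg_sub, 'raw_reads.tsv',
                                mh_df, ss_per_pass=0.51, nthreads=4)
    tetra_dict = run_tetra_recruiter('tetra', sag_subs, mg_sub, ss_df,
                                     gmm_per_pass=0.51)
    run_combine_recruits('final', 'extend', 'assembly', 'checkm', 'demo_MG.fasta',
                         tetra_dict, mh_df, sag_list=['SAGs/demo_SAG.fasta'])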