def main():
    '''Command-line entry point for the metagenome pipeline.

    Parses the arguments, runs run_metagenome_pipeline and writes the
    unstratified table to "pred_metagenome_unstrat.tsv" in args.out_dir.
    The stratified table is written to "pred_metagenome_strat.tsv" only
    when args.strat_out is set.
    '''

    args = parser.parse_args()

    # Check that input files exist.
    required_inputs = [args.input, args.function, args.marker]
    check_files_exist(required_inputs)

    # Collect the pipeline options in one place, then get the predicted
    # functions stratified and unstratified by genomes.
    pipeline_options = dict(input_biom=args.input,
                            function=args.function,
                            marker=args.marker,
                            out_dir=args.out_dir,
                            max_nsti=args.max_nsti,
                            min_reads=args.min_reads,
                            min_samples=args.min_samples,
                            strat_out=args.strat_out,
                            proc=args.proc,
                            output_normfile=True)
    strat_pred, unstrat_pred = run_metagenome_pipeline(**pipeline_options)

    # The unstratified table is always written, labelled by function.
    unstrat_pred.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_unstrat.tsv"),
        sep="\t",
        index=True,
        index_label="function")

    # Nothing more to do unless stratified output was requested.
    if not args.strat_out:
        return

    strat_pred.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_strat.tsv"),
        sep="\t",
        index=True)
def test_full_pipeline_unstrat_tsv(self):
    '''Check the unstratified table returned by run_metagenome_pipeline
    for the tsv input seqtab against the expected table.'''

    with TemporaryDirectory() as scratch_dir:
        _, observed_unstrat = run_metagenome_pipeline(
            input_biom=seqtab_tsv,
            function=func_predict,
            marker=marker_predict,
            max_nsti=2,
            out_dir=scratch_dir)

    pd.testing.assert_frame_equal(observed_unstrat, exp_unstrat_in)
def test_full_pipeline_strat_biom(self):
    '''Check the stratified table returned by run_metagenome_pipeline
    for the biom input seqtab against the expected table.'''

    with TemporaryDirectory() as scratch_dir:
        observed_strat, _ = run_metagenome_pipeline(
            input_biom=seqtab_biom,
            function=func_predict,
            marker=marker_predict,
            max_nsti=2,
            out_dir=scratch_dir)

    # Index names are not present in the output files, so replace the
    # index with plain row numbers before comparing.
    observed_strat.index = range(30)

    pd.testing.assert_frame_equal(observed_strat, exp_strat_in)
def metagenome_pipeline_steps(input_table, func_infile, marker_infile,
                              func_output_dir, no_descrip, max_nsti,
                              min_reads, min_samples, stratified, threads,
                              func_map, verbose):
    '''Steps wrapping the metagenome pipeline, moved to a separate function
    to decrease memory usage.

    Runs run_metagenome_pipeline on the given sequence table, function
    table and marker table, optionally adds a description column to the
    resulting tables (when no_descrip is false and func_map is set), and
    writes them as tab-delimited files in func_output_dir.

    Returns a tuple (strat_outfile, unstrat_outfile) of the written paths.
    strat_outfile is None when stratified is false.
    '''

    # Infer metagenome abundances per-sample.
    # Pass arguments to key function and get predicted functions
    # stratified and unstratified by genomes.
    strat_pred, unstrat_pred = run_metagenome_pipeline(
        input_biom=input_table,
        function=func_infile,
        marker=marker_infile,
        out_dir=func_output_dir,
        max_nsti=max_nsti,
        min_reads=min_reads,
        min_samples=min_samples,
        strat_out=stratified,
        proc=threads,
        output_normfile=True)

    # Move the function ids out of the index into a "function" column so
    # they are written as regular data (tables are written with index=False).
    unstrat_pred.index.name = "function"
    unstrat_pred.reset_index(inplace=True)

    if not no_descrip and func_map:
        unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                       mapfile=func_map,
                                       in_df=True)

    # Prepare the stratified table only if that option was specified.
    if stratified:
        strat_pred.reset_index(inplace=True)

        if not no_descrip and func_map:
            strat_pred = add_descrip_col(inputfile=strat_pred,
                                         mapfile=func_map,
                                         in_df=True)

    if verbose:
        # The original message referenced a name ("func") that is not
        # defined in this function, which raised NameError whenever verbose
        # was set. Only the output directory is reported here.
        print("Writing metagenome output files to: " + func_output_dir)

    unstrat_outfile = path.join(func_output_dir,
                                "pred_metagenome_unstrat.tsv")
    unstrat_pred.to_csv(path_or_buf=unstrat_outfile, sep="\t", index=False)

    strat_outfile = None
    if stratified:
        strat_outfile = path.join(func_output_dir,
                                  "pred_metagenome_strat.tsv")
        strat_pred.to_csv(path_or_buf=strat_outfile, sep="\t", index=False)

    # Return output filenames.
    return (strat_outfile, unstrat_outfile)
def test_full_pipeline_unstrat_biom(self):
    '''Test that run_metagenome_pipeline creates the correct unstratified
    output on biom input seqtab.'''

    with TemporaryDirectory() as scratch_dir:
        _, observed_unstrat = run_metagenome_pipeline(
            input_seqabun=seqtab_biom,
            function=func_predict,
            marker=marker_predict,
            max_nsti=2.1,
            out_dir=scratch_dir,
            strat_out=False)

    pd.testing.assert_frame_equal(observed_unstrat,
                                  exp_unstrat_in,
                                  check_like=True)
def test_full_pipeline_unstrat_msf_when_no_strat(self):
    '''Test that run_metagenome_pipeline yields the expected unstratified
    table for the mothur shared file input seqtab when strat_out=False.'''

    with TemporaryDirectory() as scratch_dir:
        _, observed_unstrat = run_metagenome_pipeline(
            input_seqabun=seqtab_msf,
            function=func_predict,
            marker=marker_predict,
            max_nsti=2.1,
            out_dir=scratch_dir,
            strat_out=False)

    pd.testing.assert_frame_equal(observed_unstrat,
                                  exp_unstrat_in,
                                  check_like=True)
def test_full_pipeline_strat_wide_biom(self):
    '''Test that run_metagenome_pipeline yields the expected stratified
    output on biom input seqtab. The comparison is made against the
    wide-format expected table.'''

    with TemporaryDirectory() as scratch_dir:
        observed_strat, _ = run_metagenome_pipeline(
            input_seqabun=seqtab_biom,
            function=func_predict,
            marker=marker_predict,
            max_nsti=2.0,
            out_dir=scratch_dir,
            strat_out=True,
            wide_table=True)

    pd.testing.assert_frame_equal(observed_strat,
                                  exp_strat_wide_in,
                                  check_like=True)
def test_full_pipeline_strat_tsv_2proc(self):
    '''Test that run_metagenome_pipeline yields the expected stratified
    table on tsv input seqtab when called with proc=2.'''

    with TemporaryDirectory() as scratch_dir:
        observed_strat, _ = run_metagenome_pipeline(
            input_biom=seqtab_tsv,
            function=func_predict,
            marker=marker_predict,
            max_nsti=2,
            out_dir=scratch_dir,
            proc=2,
            strat_out=True)

    pd.testing.assert_frame_equal(observed_strat,
                                  exp_strat_in,
                                  check_like=True)
def test_full_pipeline_strat_tsv(self):
    '''Test that run_metagenome_pipeline yields the expected stratified
    (long-format) and unstratified tables on the simple tsv input
    seqtab.'''

    with TemporaryDirectory() as scratch_dir:
        observed_strat, observed_unstrat = run_metagenome_pipeline(
            input_seqabun=seqtab_tsv_simple,
            function=func_simple_in,
            marker=marker_simple_in,
            max_nsti=1.9,
            out_dir=scratch_dir,
            strat_out=True,
            wide_table=False)

    # Row labels are dropped on both sides before comparing the
    # stratified tables.
    observed_rows = observed_strat.reset_index(drop=True)
    expected_rows = exp_strat_simple_in.reset_index(drop=True)
    pd.testing.assert_frame_equal(observed_rows,
                                  expected_rows,
                                  check_like=True)

    pd.testing.assert_frame_equal(observed_unstrat,
                                  exp_unstrat_simple_in,
                                  check_like=True)
def test_full_pipeline_strat_wide_tsv(self):
    '''Test that run_metagenome_pipeline yields the expected tables on
    tsv input seqtab. The stratified result is compared with the
    wide-format expected table.'''

    with TemporaryDirectory() as scratch_dir:
        observed_strat, observed_unstrat = run_metagenome_pipeline(
            input_seqabun=seqtab_tsv,
            function=func_predict,
            marker=marker_predict,
            max_nsti=1.9,
            out_dir=scratch_dir,
            strat_out=True,
            wide_table=True)

    pd.testing.assert_frame_equal(observed_strat,
                                  exp_strat_wide_in,
                                  check_like=True)

    pd.testing.assert_frame_equal(observed_unstrat,
                                  exp_unstrat_in,
                                  check_like=True)
def test_full_pipeline_strat_rare_category_tsv(self):
    '''Test that run_metagenome_pipeline yields the expected tables on the
    simple tsv input seqtab when rare seqs are collapsed into the RARE
    category (min_reads=10, min_samples=2).'''

    with TemporaryDirectory() as scratch_dir:
        observed_strat, observed_unstrat = run_metagenome_pipeline(
            input_seqabun=seqtab_tsv_simple,
            function=func_simple_in,
            marker=marker_simple_in,
            max_nsti=2.1,
            min_reads=10,
            min_samples=2,
            out_dir=scratch_dir,
            strat_out=True,
            wide_table=False)

    # Row labels are dropped on both sides before comparing the
    # stratified tables.
    observed_rows = observed_strat.reset_index(drop=True)
    expected_rows = exp_strat_simple_rare_in.reset_index(drop=True)
    pd.testing.assert_frame_equal(observed_rows,
                                  expected_rows,
                                  check_like=True)

    pd.testing.assert_frame_equal(observed_unstrat,
                                  exp_unstrat_simple_in,
                                  check_like=True)
def main():
    '''Command-line entry point for the metagenome pipeline (gzipped output).

    Runs run_metagenome_pipeline with the parsed arguments and always writes
    the unstratified table to "pred_metagenome_unstrat.tsv.gz". When
    args.strat_out is set, the stratified result is written either to
    "pred_metagenome_strat.tsv.gz" (args.wide_table set, index kept) or to
    "pred_metagenome_contrib.tsv.gz" (otherwise, index dropped).
    '''

    args = parser.parse_args()

    check_files_exist([args.input, args.function])

    strat_pred, unstrat_pred = run_metagenome_pipeline(
        input_seqabun=args.input,
        function=args.function,
        max_nsti=args.max_nsti,
        marker=args.marker,
        out_dir=args.out_dir,
        min_reads=args.min_reads,
        min_samples=args.min_samples,
        strat_out=args.strat_out,
        wide_table=args.wide_table,
        skip_norm=args.skip_norm)

    unstrat_pred.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_unstrat.tsv.gz"),
        sep="\t",
        index=True,
        index_label="function",
        compression="gzip")

    if args.strat_out:
        if args.wide_table:
            # Wide-format stratified table: the index is written out.
            wide_outfile = path.join(args.out_dir,
                                     "pred_metagenome_strat.tsv.gz")
            strat_pred.to_csv(path_or_buf=wide_outfile,
                              sep="\t",
                              index=True,
                              compression="gzip")
        else:
            # Contribution table: the index is not written out.
            contrib_outfile = path.join(args.out_dir,
                                        "pred_metagenome_contrib.tsv.gz")
            strat_pred.to_csv(path_or_buf=contrib_outfile,
                              sep="\t",
                              index=False,
                              compression="gzip")
def test_full_pipeline_strat_tsv_rare_category(self):
    '''Test that run_metagenome_pipeline yields the expected tables on tsv
    input seqtab when rare seqs are collapsed into the RARE category
    (min_reads=4, min_samples=2).'''

    with TemporaryDirectory() as scratch_dir:
        observed_strat, observed_unstrat = run_metagenome_pipeline(
            input_seqabun=seqtab_tsv,
            function=func_predict,
            marker=marker_predict,
            max_nsti=2.1,
            min_reads=4,
            min_samples=2,
            out_dir=scratch_dir,
            strat_out=True)

    pd.testing.assert_frame_equal(observed_strat,
                                  exp_strat_in_rare,
                                  check_like=True)

    pd.testing.assert_frame_equal(observed_unstrat,
                                  exp_unstrat_in,
                                  check_like=True)
def main():
    '''Command-line entry point for the metagenome pipeline.

    Runs run_metagenome_pipeline with the parsed arguments and writes both
    returned tables as tab-delimited files in args.out_dir:
    "pred_metagenome_strat.tsv" (no index written) and
    "pred_metagenome_unstrat.tsv".
    '''

    args = parser.parse_args()

    # Check that input files exist.
    required_inputs = [args.input, args.function, args.marker]
    check_files_exist(required_inputs)

    # Get predicted functions stratified and unstratified by genomes.
    strat_pred, unstrat_pred = run_metagenome_pipeline(
        input_biom=args.input,
        function=args.function,
        marker=args.marker,
        out_dir=args.out_dir,
        max_nsti=args.max_nsti,
        proc=args.proc,
        output_normfile=True)

    # Note that no index labels are written for stratified output.
    strat_pred.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_strat.tsv"),
        sep="\t",
        index=False)

    unstrat_pred.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_unstrat.tsv"),
        sep="\t")
def main():
    '''Command-line entry point for the full PICRUSt2 pipeline.

    In order, this: places the study sequences onto the reference tree,
    runs hidden-state prediction for every requested trait table plus the
    marker table, runs the metagenome pipeline for each trait, and (unless
    args.no_pathways is set) infers pathway abundances and coverages.
    Output files are written beneath args.output and the elapsed time is
    printed at the end.
    '''

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    # Several steps below differ depending on whether the default trait
    # tables or user-supplied ones are in use.
    using_default_tables = args.custom_trait_tables is None

    if using_default_tables:
        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")

        unknown_func = next(
            (name for name in funcs if name not in FUNC_TRAIT_OPTIONS),
            None)
        if unknown_func is not None:
            sys.exit("Error - specified category " + unknown_func +
                     " is not one of the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"
        func_tables = default_tables
    else:
        # Each custom table is identified by its file name without the
        # extension; the first one is used for pathway inference.
        custom_paths = args.custom_trait_tables.split(",")
        funcs = [path.splitext(path.basename(table_path))[0]
                 for table_path in custom_paths]
        func_tables = dict(zip(funcs, custom_paths))
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = set(['emp_prob', 'mp'])
    ci_setting = bool(args.confidence and args.hsp_method in discrete_set)

    gap_fill_opt = not args.no_gap_fill

    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # Only output NSTI in 16S table.
        nsti_setting = bool(func == "marker" and args.calculate_NSTI)

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        # Add "_nsti" to filename if output.
        if nsti_setting:
            count_suffix = "_nsti_predicted.tsv"
        else:
            count_suffix = "_predicted.tsv"
        count_outfile = path.join(args.output, func + count_suffix)

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile,
                         index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile,
                            index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline. The marker
    # table is an input to every run rather than a trait of its own.
    for func in funcs:

        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Infer metagenome abundances per-sample.
        with TemporaryDirectory() as temp_dir:

            # Pass arguments to key function and get predicted functions
            # stratified and unstratified by genomes.
            strat_pred, unstrat_pred = run_metagenome_pipeline(
                input_biom=args.input,
                function=func_infile,
                marker=marker_infile,
                out_dir=func_output_dir,
                max_nsti=args.max_nsti,
                min_reads=args.min_reads,
                min_samples=args.min_samples,
                strat_out=args.stratified,
                proc=args.threads,
                output_normfile=True)

            print("Writing metagenome output files for " + func + " to: " +
                  func_output_dir)

            # Write the unstratified table, with the function ids moved
            # from the index into a "function" column.
            unstrat_outfile = path.join(func_output_dir,
                                        "pred_metagenome_unstrat.tsv")

            unstrat_pred.index.name = "function"
            unstrat_pred.reset_index(inplace=True)

            if using_default_tables:
                unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                               mapfile=default_map[func],
                                               in_df=True)

            unstrat_pred.to_csv(path_or_buf=unstrat_outfile,
                                sep="\t",
                                index=False)

            # Write out stratified table only if that option was specified.
            if args.stratified:
                strat_outfile = path.join(func_output_dir,
                                          "pred_metagenome_strat.tsv")

                strat_pred.reset_index(inplace=True)

                if using_default_tables:
                    strat_pred = add_descrip_col(inputfile=strat_pred,
                                                 mapfile=default_map[func],
                                                 in_df=True)

                strat_pred.to_csv(path_or_buf=strat_outfile,
                                  sep="\t",
                                  index=False)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        if args.stratified:
            metagenome_basename = "pred_metagenome_strat.tsv"
        else:
            metagenome_basename = "pred_metagenome_unstrat.tsv"

        in_metagenome = path.join(args.output,
                                  rxn_func + "_metagenome_out",
                                  metagenome_basename)

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
                inputfile=in_metagenome,
                mapfile=default_pathway_map,
                regroup_mapfile=default_regroup_map,
                proc=args.threads,
                out_dir=temp_dir,
                gap_fill=gap_fill_opt,
                per_sequence_contrib=args.per_sequence_contrib,
                print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        def _write_pathway_table(table, filename, reset_index):
            # Write one pathway table into pathways_out, adding the
            # description column when the default tables are in use.
            if reset_index:
                table.reset_index(inplace=True)

            if using_default_tables:
                table = add_descrip_col(inputfile=table,
                                        mapfile=default_map["METACYC"],
                                        in_df=True)

            table.to_csv(path_or_buf=path.join(pathways_out, filename),
                         sep="\t",
                         index=False)

        # Write output files.
        _write_pathway_table(unstrat_abun, "path_abun_unstrat.tsv", True)
        _write_pathway_table(unstrat_cov, "path_cov_unstrat.tsv", True)

        # Write stratified output only if something besides None was
        # returned.
        if strat_abun is not None:
            _write_pathway_table(strat_abun, "path_abun_strat.tsv", False)

        if strat_cov is not None:
            _write_pathway_table(strat_cov, "path_cov_strat.tsv", False)

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in " + "%.2f" % elapsed_time +
          " seconds.")