def test_basic_pipeline_2_proc(self):
    '''Run the full MinPath pipeline with proc=2 and check both returned
    tables against the expected pathway abundance files.'''

    with TemporaryDirectory() as temp_dir:
        test_unstrat, test_strat = run_minpath_pipeline(
            inputfile=in_metagenome_abun,
            mapfile=map_ec2path_prokaryotic,
            proc=2,
            out_dir=temp_dir)

    def _ordered(table):
        # Row order of stratified output can differ slightly between
        # versions, so sort on the key columns and renumber the rows.
        return table.sort_values(['pathway', 'sequence']).reset_index(drop=True)

    # Expected tables; the unstratified one is indexed by pathway.
    exp_path_abun_unstrat = pd.read_csv(exp_minpath_out_unstrat, sep="\t",
                                        index_col="pathway")
    exp_path_abun_strat = _ordered(pd.read_csv(exp_minpath_out_strat,
                                               sep="\t"))

    # Give the observed unstratified index the same name as the expected one.
    test_unstrat.index.name = "pathway"
    test_strat = _ordered(test_strat)

    pd.testing.assert_frame_equal(exp_path_abun_unstrat, test_unstrat,
                                  check_like=True)
    pd.testing.assert_frame_equal(exp_path_abun_strat, test_strat,
                                  check_like=True)
def test_unstrat_default_pipeline(self):
    '''Run the default pipeline on an unstratified input table and check
    the unstratified abundance and coverage tables.'''

    with TemporaryDirectory() as temp_dir:
        # The two stratified return values are not checked in this test.
        unstrat_path_abun_df, unstrat_path_cov_df, _, _ = run_minpath_pipeline(
            in_metagenome_unstrat,
            default_pathway_map,
            proc=1,
            out_dir=temp_dir,
            regroup_mapfile=default_regroup_map,
            gap_fill=True,
            per_sequence_contrib=False,
            print_cmds=False)

    # Read both expected tables up front, then compare each to its
    # predicted counterpart.
    expected = [pd.read_csv(exp_file, sep="\t", index_col="pathway")
                for exp_file in (exp_minpath_abun_unstrat,
                                 exp_minpath_cov_unstrat)]
    observed = [unstrat_path_abun_df, unstrat_path_cov_df]

    # NOTE(review): check_less_precise is deprecated in newer pandas in
    # favour of rtol/atol - confirm the pandas version this suite targets.
    for exp_df, obs_df in zip(expected, observed):
        pd.testing.assert_frame_equal(exp_df, obs_df, check_like=True,
                                      check_less_precise=True)
def main():
    '''Parse the command-line arguments, run the MinPath pipeline and write
    the unstratified and stratified pathway tables using args.out_prefix.'''

    args = parser.parse_args()

    # Both input files must exist before anything is run.
    check_files_exist([args.input, args.map])

    # Options that are the same wherever the intermediate files end up.
    shared_opts = dict(inputfile=args.input,
                       mapfile=args.map,
                       proc=args.proc,
                       print_cmds=args.print_cmds)

    if not args.intermediate:
        # No intermediate directory requested: use a throwaway one.
        with TemporaryDirectory() as temp_dir:
            unstrat_out, strat_out = run_minpath_pipeline(out_dir=temp_dir,
                                                          **shared_opts)
    else:
        # Create the requested directory and write intermediate files there.
        make_output_dir(args.intermediate)
        unstrat_out, strat_out = run_minpath_pipeline(
            out_dir=args.intermediate, **shared_opts)

    # Write output files.
    unstrat_out.to_csv(path_or_buf=args.out_prefix + "_unstrat_path.tsv",
                       sep="\t", index_label="pathway")
    strat_out.to_csv(path_or_buf=args.out_prefix + "_strat_path.tsv",
                     sep="\t", index=False)
def main():
    '''Parse the command-line arguments, run the MinPath pipeline and write
    pathway abundance and coverage tables into args.out_dir.

    The unstratified tables are always written; each stratified table is
    written only when the pipeline returned something other than None.'''

    args = parser.parse_args()

    # Both input files must exist before anything is run.
    check_files_exist([args.input, args.map])

    # The no-regroup flag overrides whatever regrouping mapfile was given.
    regroup_map = None if args.no_regroup else args.regroup_map

    # Options that are the same wherever the intermediate files end up.
    shared_opts = dict(inputfile=args.input,
                       mapfile=args.map,
                       regroup_mapfile=regroup_map,
                       proc=args.proc,
                       gap_fill=not args.no_gap_fill,
                       per_sequence_contrib=args.per_sequence_contrib,
                       print_cmds=args.print_cmds)

    if not args.intermediate:
        # No intermediate directory requested: use a throwaway one.
        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = \
                run_minpath_pipeline(out_dir=temp_dir, **shared_opts)
    else:
        # Create the requested directory and write intermediate files there.
        make_output_dir(args.intermediate)
        unstrat_abun, unstrat_cov, strat_abun, strat_cov = \
            run_minpath_pipeline(out_dir=args.intermediate, **shared_opts)

    make_output_dir(args.out_dir)

    # Unstratified tables keep their index, labelled "pathway".
    unstrat_abun.to_csv(
        path_or_buf=path.join(args.out_dir, "path_abun_unstrat.tsv"),
        sep="\t", index_label="pathway")
    unstrat_cov.to_csv(
        path_or_buf=path.join(args.out_dir, "path_cov_unstrat.tsv"),
        sep="\t", index_label="pathway")

    # Stratified tables are optional and written without the index.
    if strat_abun is not None:
        strat_abun.to_csv(
            path_or_buf=path.join(args.out_dir, "path_abun_strat.tsv"),
            sep="\t", index=False)

    if strat_cov is not None:
        strat_cov.to_csv(
            path_or_buf=path.join(args.out_dir, "path_cov_strat.tsv"),
            sep="\t", index=False)
def test_strat_default_pipeline(self):
    '''Run the default pipeline on stratified input (per_sequence_contrib
    off) and check the unstratified abundance/coverage tables as well as
    the community-wide stratified abundances.'''

    with TemporaryDirectory() as temp_dir:
        # The stratified coverage return value is not checked here.
        unstrat_path_abun_df, unstrat_path_cov_df, strat_path_abun_df, _ = \
            run_minpath_pipeline(in_metagenome_strat2,
                                 default_pathway_map,
                                 proc=1,
                                 out_dir=temp_dir,
                                 regroup_mapfile=default_regroup_map,
                                 gap_fill=True,
                                 per_sequence_contrib=False,
                                 print_cmds=False)

    def _ordered(table):
        # Row order of stratified output can differ slightly between
        # versions, so sort on the key columns and renumber the rows.
        return table.sort_values(['pathway', 'sequence']).reset_index(drop=True)

    # Expected tables.
    exp_abun_unstrat = pd.read_csv(exp_minpath_abun_unstrat, sep="\t",
                                   index_col="pathway")
    exp_cov_unstrat = pd.read_csv(exp_minpath_cov_unstrat, sep="\t",
                                  index_col="pathway")
    exp_abun_strat = _ordered(pd.read_csv(exp_minpath_abun_strat, sep="\t"))

    strat_path_abun_df = _ordered(strat_path_abun_df)

    # All comparisons use reduced precision; for the stratified table this
    # is needed because the HUMAnN2 output used as the expectation is not
    # rounded.
    comparisons = ((exp_abun_unstrat, unstrat_path_abun_df),
                   (exp_cov_unstrat, unstrat_path_cov_df),
                   (exp_abun_strat, strat_path_abun_df))

    for exp_df, obs_df in comparisons:
        pd.testing.assert_frame_equal(exp_df, obs_df, check_like=True,
                                      check_less_precise=True)
def test_strat_per_genome_pipeline(self):
    '''Run the default pipeline on stratified input with
    per_sequence_contrib set and check all four returned tables, including
    the per-genome stratified abundances and coverages.'''

    with TemporaryDirectory() as temp_dir:
        unstrat_path_abun_df, unstrat_path_cov_df, strat_path_abun_df, strat_path_cov_df = \
            run_minpath_pipeline(in_metagenome_strat,
                                 default_pathway_map,
                                 proc=1,
                                 out_dir=temp_dir,
                                 regroup_mapfile=default_regroup_map,
                                 gap_fill=True,
                                 per_sequence_contrib=True,
                                 print_cmds=False)

    def _ordered(table):
        # Row order of stratified output can differ slightly between
        # versions, so sort on the key columns and renumber the rows.
        return table.sort_values(['pathway', 'sequence']).reset_index(drop=True)

    # Expected tables.
    exp_abun_unstrat = pd.read_csv(exp_minpath_abun_unstrat, sep="\t",
                                   index_col="pathway")
    exp_cov_unstrat = pd.read_csv(exp_minpath_cov_unstrat, sep="\t",
                                  index_col="pathway")
    exp_abun_strat = _ordered(pd.read_csv(exp_minpath_abun_strat_per_genome,
                                          sep="\t"))
    exp_cov_strat = _ordered(pd.read_csv(exp_minpath_cov_strat_per_genome,
                                         sep="\t"))

    strat_path_abun_df = _ordered(strat_path_abun_df)
    strat_path_cov_df = _ordered(strat_path_cov_df)

    # Expected/observed pairs, compared in the same order as listed.
    comparisons = ((exp_abun_unstrat, unstrat_path_abun_df),
                   (exp_cov_unstrat, unstrat_path_cov_df),
                   (exp_abun_strat, strat_path_abun_df),
                   (exp_cov_strat, strat_path_cov_df))

    for exp_df, obs_df in comparisons:
        pd.testing.assert_frame_equal(exp_df, obs_df, check_like=True,
                                      check_less_precise=True)
def full_pipeline(study_fasta,
                  input_table,
                  output_folder,
                  threads,
                  ref_msa,
                  tree,
                  hmm,
                  in_traits,
                  custom_trait_tables,
                  marker_gene_table,
                  pathway_map,
                  no_pathways,
                  regroup_map,
                  no_regroup,
                  stratified,
                  alignment_tool,
                  max_nsti,
                  min_reads,
                  min_samples,
                  hsp_method,
                  calculate_NSTI,
                  confidence,
                  seed,
                  no_gap_fill,
                  per_sequence_contrib,
                  no_descrip,
                  verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    Steps, in order: place study sequences on the reference tree, run
    hidden-state prediction for every function plus the marker gene, run
    the metagenome pipeline for every function, and (unless no_pathways is
    set) infer pathway abundances and coverages.

    Exits through sys.exit if output_folder already exists or if a category
    in in_traits is not one of the default categories.

    Returns a tuple (func_output, pathway_outfiles):
      func_output - dictionary of function name to the stratified output
          of metagenome_pipeline_steps if stratified is set, otherwise to
          the unstratified output.
      pathway_outfiles - None if no_pathways is set; otherwise a dictionary
          with keys "unstrat_abun", "unstrat_cov", "strat_abun" and
          "strat_cov" giving the output paths (the stratified entries are
          None when the corresponding table was not returned).'''

    # Check that input files exist.
    check_files_exist([study_fasta, input_table])

    if path.exists(output_folder):
        sys.exit("Stopping - output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the marker entry added below must not leak into
        # the shared module-level default_tables mapping.
        func_tables = dict(default_tables)

    else:

        # Descriptions are never added for custom trait tables.
        no_descrip = True

        funcs = []
        func_tables = {}

        # Each custom table is keyed by its file basename without extension.
        for custom in custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table supplies the reactions used for pathways.
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}
    ci_setting = bool(confidence) and hsp_method in discrete_set

    gap_fill_opt = not no_gap_fill

    if verbose:
        print("Placing sequences onto reference tree.")

    # Define folders for intermediate files.
    intermediate_dir = path.join(output_folder, "intermediate")
    place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
    make_output_dir(intermediate_dir)
    make_output_dir(place_seqs_intermediate)

    place_seqs_pipeline(study_fasta=study_fasta,
                        ref_msa=ref_msa,
                        tree=tree,
                        hmm=hmm,
                        out_tree=out_tree,
                        alignment_tool=alignment_tool,
                        threads=threads,
                        out_dir=place_seqs_intermediate,
                        chunk_size=5000,
                        print_cmds=verbose)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of the
    # output file name of each for the next step of the pipeline.
    predicted_funcs = {}
    for func in funcs:
        predicted_funcs[func] = hsp_pipeline_steps(
            func=func,
            calculate_NSTI=calculate_NSTI,
            out_tree=out_tree,
            func_table_in=func_tables[func],
            hsp_method=hsp_method,
            ci_setting=ci_setting,
            threads=threads,
            seed=seed,
            output_folder=output_folder,
            verbose=verbose)

    marker_infile = predicted_funcs["marker"]

    # Initialize dictionary of function names to metagenome output to
    # return. Each value is the stratified output if stratified is set and
    # the unstratified output otherwise.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func)

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        # Only functions present in default_map get a mapfile.
        func_map = default_map[func] if func in default_map else None

        func_strat_out, func_unstrat_out = metagenome_pipeline_steps(
            input_table=input_table,
            func_infile=func_infile,
            marker_infile=marker_infile,
            func_output_dir=func_output_dir,
            no_descrip=no_descrip,
            max_nsti=max_nsti,
            min_reads=min_reads,
            min_samples=min_samples,
            stratified=stratified,
            threads=threads,
            func_map=func_map,
            verbose=verbose)

        func_output[func] = func_strat_out if stratified else func_unstrat_out

    pathway_outfiles = None

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not no_pathways:

        pathways_intermediate = path.join(intermediate_dir, "pathways")
        make_output_dir(pathways_intermediate)

        if verbose:
            print("Inferring pathways from predicted " + rxn_func)

        predicted_rxn = func_output[rxn_func]

        # Set regrouping mapfile to be empty if no_regroup set.
        if no_regroup:
            regroup_map = None

        unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
            inputfile=predicted_rxn,
            mapfile=pathway_map,
            regroup_mapfile=regroup_map,
            proc=threads,
            out_dir=pathways_intermediate,
            gap_fill=gap_fill_opt,
            per_sequence_contrib=per_sequence_contrib,
            print_cmds=verbose)

        pathways_out = path.join(output_folder, "pathways_out")

        # Turn the index into a regular "pathway" column so that a
        # description column can be added and the tables written without
        # an index.
        unstrat_abun.index.name = 'pathway'
        unstrat_cov.index.name = 'pathway'
        unstrat_abun.reset_index(inplace=True)
        unstrat_cov.reset_index(inplace=True)

        pathway_outfiles = {}

        if not no_descrip:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        if verbose:
            print("Writing predicted pathway abundances and coverages to " +
                  pathways_out)

        make_output_dir(pathways_out)

        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile, sep="\t",
                            index=False)
        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile, sep="\t",
                           index=False)

        pathway_outfiles["unstrat_abun"] = unstrat_abun_outfile
        pathway_outfiles["unstrat_cov"] = unstrat_cov_outfile

        strat_abun_outfile = None
        strat_cov_outfile = None

        # Write stratified output only if something besides None was
        # returned.
        if strat_abun is not None:

            if not no_descrip:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")
            strat_abun.to_csv(path_or_buf=strat_abun_outfile, sep="\t",
                              index=False)

        if strat_cov is not None:

            if not no_descrip:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")
            strat_cov.to_csv(path_or_buf=strat_cov_outfile, sep="\t",
                             index=False)

        pathway_outfiles["strat_abun"] = strat_abun_outfile
        pathway_outfiles["strat_cov"] = strat_cov_outfile

    return (func_output, pathway_outfiles)
def main():
    '''Run the full PICRUSt2 pipeline from the command line.

    Steps, in order: place study sequences on the reference tree, run
    hidden-state prediction for every function plus the marker gene, run
    the metagenome pipeline for every function, and (unless no_pathways is
    set) infer MetaCyc pathway abundances and coverages. All output is
    written under args.output and the elapsed time is printed at the end.

    Exits through sys.exit if a category in args.in_traits is not one of
    the default categories.'''

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    # Description columns are only added when the default tables are used.
    using_default_tables = args.custom_trait_tables is None

    if using_default_tables:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the marker entry added below must not leak into
        # the shared module-level default_tables mapping.
        func_tables = dict(default_tables)

    else:

        funcs = []
        func_tables = {}

        # Each custom table is keyed by its file basename without extension.
        for custom in args.custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table supplies the reactions used for pathways.
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}
    ci_setting = bool(args.confidence) and args.hsp_method in discrete_set

    gap_fill_opt = not args.no_gap_fill

    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # Only output NSTI in the marker table.
        nsti_setting = bool(func == "marker" and args.calculate_NSTI)

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        # Add "_nsti" to filename if NSTI is output.
        if nsti_setting:
            count_outfile = path.join(args.output,
                                      func + "_nsti_predicted.tsv")
        else:
            count_outfile = path.join(args.output, func + "_predicted.tsv")

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile, index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile, index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Infer metagenome abundances per-sample, stratified and
        # unstratified by genomes. No temporary directory is created here:
        # the call is given func_output_dir as its output directory.
        strat_pred, unstrat_pred = run_metagenome_pipeline(
            input_biom=args.input,
            function=func_infile,
            marker=marker_infile,
            out_dir=func_output_dir,
            max_nsti=args.max_nsti,
            min_reads=args.min_reads,
            min_samples=args.min_samples,
            strat_out=args.stratified,
            proc=args.threads,
            output_normfile=True)

        print("Writing metagenome output files for " + func + " to: " +
              func_output_dir)

        # Generate output table filepaths and write out pandas dataframe.
        unstrat_outfile = path.join(func_output_dir,
                                    "pred_metagenome_unstrat.tsv")

        # Turn the index into a regular "function" column.
        unstrat_pred.index.name = "function"
        unstrat_pred.reset_index(inplace=True)

        if using_default_tables:
            unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                           mapfile=default_map[func],
                                           in_df=True)

        unstrat_pred.to_csv(path_or_buf=unstrat_outfile, sep="\t",
                            index=False)

        # Write out stratified table only if that option was specified.
        if args.stratified:
            strat_outfile = path.join(func_output_dir,
                                      "pred_metagenome_strat.tsv")
            strat_pred.reset_index(inplace=True)

            if using_default_tables:
                strat_pred = add_descrip_col(inputfile=strat_pred,
                                             mapfile=default_map[func],
                                             in_df=True)

            strat_pred.to_csv(path_or_buf=strat_outfile, sep="\t",
                              index=False)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        # Use the metagenome table written above for the reaction function.
        if args.stratified:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_strat.tsv")
        else:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_unstrat.tsv")

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
                inputfile=in_metagenome,
                mapfile=default_pathway_map,
                regroup_mapfile=default_regroup_map,
                proc=args.threads,
                out_dir=temp_dir,
                gap_fill=gap_fill_opt,
                per_sequence_contrib=args.per_sequence_contrib,
                print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        # Write output files.
        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_abun.reset_index(inplace=True)

        if using_default_tables:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile, sep="\t",
                            index=False)

        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")
        unstrat_cov.reset_index(inplace=True)

        if using_default_tables:
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile, sep="\t",
                           index=False)

        # Write stratified output only if something besides None was
        # returned.
        if strat_abun is not None:
            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")

            if using_default_tables:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun.to_csv(path_or_buf=strat_abun_outfile, sep="\t",
                              index=False)

        if strat_cov is not None:
            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")

            if using_default_tables:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov.to_csv(path_or_buf=strat_cov_outfile, sep="\t",
                             index=False)

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in " + "%.2f" % elapsed_time +
          " seconds.")