def test_identify_ref_files(self):
    '''Check that identify_ref_files finds the default reference files and
    returns them in the order FASTA, tree, HMM, model.'''

    observed = identify_ref_files(default_ref_dir)

    # Order matters: the comparison below is positional.
    expected = [default_fasta, default_tree, default_hmm, default_model]

    self.assertEqual(expected, observed)
def full_pipeline(study_fasta, input_table, output_folder, processes, ref_dir,
                  in_traits, custom_trait_tables, marker_gene_table,
                  pathway_map, rxn_func, no_pathways, regroup_map, no_regroup,
                  stratified, max_nsti, min_reads, min_samples, hsp_method,
                  min_align, skip_nsti, skip_minpath, no_gap_fill, coverage,
                  per_sequence_contrib, wide_table, skip_norm,
                  remove_intermediate, verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    The steps are run in this order: place_seqs.py, hsp.py (once per function
    table), metagenome_pipeline.py (once per function table other than the
    marker table) and, unless no_pathways is set, pathway_pipeline.py.

    Returns a tuple (func_output, pathway_outfiles):
      * func_output maps each non-marker function id to a two-element list
        [unstratified outfile, stratified outfile]; the second element is
        None unless stratified is set.
      * pathway_outfiles is None when no_pathways is set. Otherwise it is a
        dictionary with keys "unstrat_abun", "unstrat_cov", "strat_abun" and
        "strat_cov" (the latter two are None when not applicable). Note that
        the "unstrat_cov" path is recorded whether or not coverage is set.

    Calls sys.exit if a category in in_traits is not one of the default
    categories or if output_folder already exists.'''

    # Throw warning if --per_sequence_contrib set but --stratified unset.
    if per_sequence_contrib and not stratified:
        print("\nThe option --per_sequence_contrib was set, but not the option "
              "--stratified. This means that a stratified pathway table will "
              "be output only (i.e. a stratified metagenome table will NOT "
              "be output).\n", file=sys.stderr)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func + " is not " +
                         "one of the default categories.")

        # Work on a copy: entries are added below (reaction function and
        # marker table) and the shared module-level default_tables must not
        # be altered by a pipeline run.
        func_tables = dict(default_tables)

    else:
        # Split paths to input custom trait tables and take the basename to be
        # the function id.
        funcs = []
        func_tables = {}

        for custom in custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

    # Add reaction function to be in set of gene families if it is not already
    # and as long as pathways are also to be predicted.
    if rxn_func not in funcs and not no_pathways:
        orig_rxn_func = rxn_func
        rxn_func = path.splitext(path.basename(rxn_func))[0]

        # rxn_func may have been given as a path whose id is already listed
        # (e.g. the same file is also one of the custom trait tables). Only
        # append when the id is new so each function is processed once.
        if rxn_func not in funcs:
            funcs.append(rxn_func)

        if rxn_func not in func_tables:
            func_tables[rxn_func] = orig_rxn_func

    if not skip_norm:
        # Append marker as well, since this also needs to be run.
        funcs.append("marker")
        func_tables["marker"] = marker_gene_table

    # Check that all input files exist.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)
    files2check = [study_fasta, input_table, ref_msa, tree, hmm,
                   model] + list(func_tables.values())

    if not no_pathways:
        files2check.append(pathway_map)

        # Throw warning if default pathway mapfile used with non-default
        # reference files.
        if pathway_map == default_pathway_map and ref_dir != default_ref_dir:
            print("Warning - non-default reference files specified with "
                  "default pathway mapfile of prokaryote-specific MetaCyc "
                  "pathways (--pathway_map option). This usage may be "
                  "unintended.", file=sys.stderr)

        # The regroup mapfile is only passed to pathway_pipeline.py, so it is
        # only required when pathways are being predicted.
        if not no_regroup:
            files2check.append(regroup_map)

    # This will throw an error if any input files are not found.
    check_files_exist(files2check)

    # Check that sequence names in FASTA overlap with input table.
    check_overlapping_seqs(study_fasta, input_table, verbose)

    if path.exists(output_folder):
        sys.exit("Stopping since output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    if verbose:
        print("Placing sequences onto reference tree", file=sys.stderr)

    # Define folders for intermediate files (unless --remove_intermediate set).
    if remove_intermediate:
        # Empty strings are passed on as the --intermediate value of the
        # downstream scripts.
        place_seqs_intermediate = ""
        pathways_intermediate = ""
    else:
        intermediate_dir = path.join(output_folder, "intermediate")
        make_output_dir(intermediate_dir)
        place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
        pathways_intermediate = path.join(intermediate_dir, "pathways")

    # Run place_seqs.py.
    place_seqs_cmd = ["place_seqs.py",
                      "--study_fasta", study_fasta,
                      "--ref_dir", ref_dir,
                      "--out_tree", out_tree,
                      "--processes", str(processes),
                      "--intermediate", place_seqs_intermediate,
                      "--min_align", str(min_align),
                      "--chunk_size", str(5000)]

    if verbose:
        place_seqs_cmd.append("--verbose")

    system_call_check(place_seqs_cmd, print_command=verbose,
                      print_stdout=verbose, print_stderr=True)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree,
              file=sys.stderr)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    if not skip_norm:
        # Make sure marker database is first in the list. This is because this
        # will be run on a single core and so will be easier to identify any
        # errors if the program exits when working on this function type.
        funcs.insert(0, funcs.pop(funcs.index("marker")))

    for func in funcs:

        # NSTI is calculated alongside the marker predictions, or alongside
        # every function when normalization (and so the marker) is skipped.
        calc_nsti = (func == "marker" and not skip_nsti) or \
                    (skip_norm and not skip_nsti)

        # Change output filename for NSTI and non-NSTI containing files.
        hsp_outfile = path.join(output_folder, func + "_predicted")

        if calc_nsti:
            hsp_outfile = hsp_outfile + "_and_nsti.tsv.gz"
        else:
            hsp_outfile = hsp_outfile + ".tsv.gz"

        # Keep track of output filename for next step of pipeline.
        predicted_funcs[func] = hsp_outfile

        # Run hsp.py for each function database.
        hsp_cmd = ["hsp.py",
                   "--tree", out_tree,
                   "--output", hsp_outfile,
                   "--observed_trait_table", func_tables[func],
                   "--hsp_method", hsp_method,
                   "--seed", "100"]

        # Add flags to command if specified.
        if calc_nsti:
            hsp_cmd.append("--calculate_NSTI")

        # Run marker on only 1 processor.
        if func == "marker":
            hsp_cmd += ["--processes", "1"]
        else:
            hsp_cmd += ["--processes", str(processes)]

        if verbose:
            hsp_cmd.append("--verbose")

        system_call_check(hsp_cmd, print_command=verbose,
                          print_stdout=verbose, print_stderr=True)

    # Now run metagenome pipeline commands.
    # Initialize dictionary of function names --> metagenome output files.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker predictions are only an input for normalization.
        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func, file=sys.stderr)

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        metagenome_pipeline_cmd = ["metagenome_pipeline.py",
                                   "--input", input_table,
                                   "--function", predicted_funcs[func],
                                   "--min_reads", str(min_reads),
                                   "--min_samples", str(min_samples),
                                   "--out_dir", func_output_dir]

        # Initialize two-element list as value for each function.
        # First value will be unstratified output and second will be
        # stratified output.
        func_output[func] = [None, None]

        func_output[func][0] = path.join(func_output_dir,
                                         "pred_metagenome_unstrat.tsv.gz")

        if wide_table:
            metagenome_pipeline_cmd.append("--wide_table")

        if not skip_nsti:
            metagenome_pipeline_cmd += ["--max_nsti", str(max_nsti)]

        if skip_norm:
            metagenome_pipeline_cmd.append("--skip_norm")
        else:
            metagenome_pipeline_cmd += ["--marker", predicted_funcs["marker"]]

        if stratified:
            metagenome_pipeline_cmd.append("--strat_out")

            # The stratified outfile name depends on the table format.
            if wide_table:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_strat.tsv.gz")
            else:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_contrib.tsv.gz")

        system_call_check(metagenome_pipeline_cmd, print_command=verbose,
                          print_stdout=verbose, print_stderr=True)

    # Now infer pathway abundances and coverages unless --no_pathways set.
    pathway_outfiles = None

    if not no_pathways:

        path_output_dir = path.join(output_folder, "pathways_out")

        if verbose:
            # Sent to stderr like every other progress message here.
            print("Inferring pathways from predicted " + rxn_func,
                  file=sys.stderr)

        # Determine whether stratified or unstratified table should be input.
        if not stratified or per_sequence_contrib:
            rxn_input_metagenome = func_output[rxn_func][0]
        else:
            rxn_input_metagenome = func_output[rxn_func][1]

        pathway_pipeline_cmd = ["pathway_pipeline.py",
                                "--input", rxn_input_metagenome,
                                "--out_dir", path_output_dir,
                                "--map", pathway_map,
                                "--intermediate", pathways_intermediate,
                                "--proc", str(processes)]

        if no_gap_fill:
            pathway_pipeline_cmd.append("--no_gap_fill")

        if skip_minpath:
            pathway_pipeline_cmd.append("--skip_minpath")

        if coverage:
            pathway_pipeline_cmd.append("--coverage")

        if no_regroup:
            pathway_pipeline_cmd.append("--no_regroup")
        else:
            pathway_pipeline_cmd += ["--regroup_map", regroup_map]

        if wide_table:
            pathway_pipeline_cmd.append("--wide_table")

        if per_sequence_contrib:
            pathway_pipeline_cmd.append("--per_sequence_contrib")

            # Without normalization the input table itself is passed as the
            # per-sequence abundance table.
            if skip_norm:
                norm_sequence_abun = input_table
            else:
                norm_sequence_abun = path.join(output_folder,
                                               rxn_func + "_metagenome_out",
                                               "seqtab_norm.tsv.gz")

            pathway_pipeline_cmd += ["--per_sequence_abun", norm_sequence_abun]

            pathway_pipeline_cmd += ["--per_sequence_function",
                                     predicted_funcs[rxn_func]]

        if verbose:
            pathway_pipeline_cmd.append("--verbose")

        system_call_check(pathway_pipeline_cmd, print_command=verbose,
                          print_stdout=False, print_stderr=True)

        if verbose:
            print("Wrote predicted pathway abundances and coverages to " +
                  path_output_dir, file=sys.stderr)

        # Keep track of output filenames if this function is being used in
        # a non-default way (e.g. with a QIIME2 plugin).
        pathway_outfiles = {}

        pathway_outfiles["unstrat_abun"] = path.join(
            path_output_dir, "path_abun_unstrat.tsv.gz")
        pathway_outfiles["unstrat_cov"] = path.join(
            path_output_dir, "path_cov_unstrat.tsv.gz")

        pathway_outfiles["strat_abun"] = None
        pathway_outfiles["strat_cov"] = None

        if stratified or per_sequence_contrib:

            if wide_table:
                pathway_outfiles["strat_abun"] = path.join(
                    path_output_dir, "path_abun_strat.tsv.gz")

                if per_sequence_contrib:
                    pathway_outfiles["strat_cov"] = path.join(
                        path_output_dir, "path_cov_strat.tsv.gz")

            else:
                pathway_outfiles["strat_abun"] = path.join(
                    path_output_dir, "path_abun_contrib.tsv.gz")

                if per_sequence_contrib:
                    pathway_outfiles["strat_cov"] = path.join(
                        path_output_dir, "path_cov_contrib.tsv.gz")

    return (func_output, pathway_outfiles)