def run_epa_ng(tree: str, ref_msa_fastafile: str, study_msa_fastafile: str,
               model: str, out_dir: str, chunk_size=5000, threads=1,
               print_cmds=False):
    '''Place study sequences onto the reference tree with EPA-NG.

    Creates out_dir, runs epa-ng on the given tree, reference MSA and study
    MSA (passing out_dir via -w), and then runs parse_jplace on
    "epa_result.jplace" to write "epa_result_parsed.jplace" in the same
    directory.'''

    make_output_dir(out_dir)

    # Assemble the argument vector one option at a time.
    epa_args = ["epa-ng"]
    epa_args += ["--tree", tree]
    epa_args += ["--ref-msa", ref_msa_fastafile]
    epa_args += ["--query", study_msa_fastafile]
    epa_args += ["--chunk-size", str(chunk_size)]
    epa_args += ["-T", str(threads)]
    epa_args += ["-m", model]
    epa_args += ["-w", out_dir]
    epa_args += ["--filter-acc-lwr", "0.99"]
    epa_args += ["--filter-max", "100"]

    system_call_check(epa_args, print_out=print_cmds)

    # Re-write the raw jplace file so that the output is reproducible.
    parse_jplace(path.join(out_dir, "epa_result.jplace"),
                 path.join(out_dir, "epa_result_parsed.jplace"))
def castor_nsti(tree_path, known_tips):
    '''Compute the distance from each study sequence to its nearest
    reference sequence by calling the castor_nsti.R script.

    tree_path is the path to the treefile. known_tips holds the reference
    genome ids (the rownames of the trait table) and must provide a
    numpy-style tofile() method. Returns the table written by the R script
    as a pandas DataFrame indexed by the "sequence" column.'''

    rscripts_dir = path.join(path.dirname(path.abspath(__file__)),
                             'Rscripts')
    nsti_script = path.join(rscripts_dir, 'castor_nsti.R')

    # All intermediate files live in a throw-away directory.
    with TemporaryDirectory() as work_dir:

        tips_file = path.join(work_dir, "known_tips.txt")
        nsti_file = path.join(work_dir, "nsti_out.txt")

        # One known tip name per line (known_tips is a numpy.ndarray).
        known_tips.tofile(tips_file, sep="\n")

        rscript_cmd = " ".join(["Rscript", nsti_script, tree_path,
                                tips_file, nsti_file])
        system_call_check(rscript_cmd)

        # Load the NSTI values produced by the R script.
        nsti_out = pd.read_csv(nsti_file, sep="\t", index_col="sequence")

        # NOTE(review): the exception below is constructed but never raised,
        # so this check currently has no effect. It also compares the number
        # of known (reference) tips against a table that the docstring
        # describes as per study sequence - confirm the intended condition
        # before turning this into a real "raise".
        if nsti_out.shape[0] != len(known_tips):
            ValueError("Number of rows in returned NSTI table is incorrect.")

    return nsti_out
def run_papara(tree: str, ref_msa: dict, study_fasta: str, out_dir: str,
               threads=1, print_cmds=False):
    '''Run PaPaRa to place study sequences into reference multiple-sequence
    alignment (MSA). Will return dictionary of the output MSA (sequence ids
    as keys). Expects path to tree and study FASTA as strings. Expects
    reference MSA as a dictionary output by read_fasta. This MSA will be
    converted to phylip format before running PaPaRa.

    The process working directory is temporarily switched to out_dir and is
    always restored, even if writing the phylip file or the PaPaRa call
    fails.'''

    # Resolve inputs before changing directory so relative paths stay valid.
    tree = path.abspath(tree)
    study_fasta = path.abspath(study_fasta)

    # PaPaRa writes into the current working directory, so run it from
    # out_dir (but keep track of where we started).
    orig_wd = getcwd()
    chdir(out_dir)

    try:
        # Convert ref sequences from MSA FASTA to phylip.
        write_phylip(ref_msa, "ref_seqs.phylip")

        # Make call to papara to place sequences (outputs phylip format).
        system_call_check("papara -t " + tree + " -s ref_seqs.phylip " +
                          "-q " + study_fasta + " -j " + str(threads) +
                          " -n out",
                          print_command=print_cmds, print_stdout=print_cmds,
                          print_stderr=print_cmds)
    finally:
        # Restore the caller's working directory no matter what happened.
        chdir(orig_wd)

    # Read in papara phylip output and return.
    return read_phylip(path.join(out_dir, "papara_alignment.out"),
                       check_input=True)
def test_full_pipeline_tsv(self):
    '''Run place_seqs.py, hsp.py (traits and marker), metagenome_pipeline.py
    and pathway_pipeline.py in sequence on a TSV sequence abundance table.
    The test passes as long as no command fails.'''

    with TemporaryDirectory() as temp_dir:

        out_tree = path.join(temp_dir, "out.tre")
        place_cmd = "".join(["place_seqs.py -s ", test_study_seqs,
                             " -r ", test_ref_dir,
                             " -o ", out_tree])
        system_call_check(place_cmd)

        traits_predict = path.join(temp_dir, "hsp_out.tsv.gz")
        marker_predict = path.join(temp_dir, "hsp_out_marker.tsv.gz")

        # Hidden-state prediction: trait table first, then the marker table.
        hsp_jobs = ((test_known_traits, traits_predict),
                    (test_known_marker, marker_predict))

        for observed_table, hsp_outfile in hsp_jobs:
            system_call_check("".join(["hsp.py -t ", out_tree,
                                       " --observed_trait_table ",
                                       observed_table,
                                       " -n -o ", hsp_outfile]))

        metagenome_out = path.join(temp_dir, "meta_out")
        metagenome_cmd = "".join(["metagenome_pipeline.py -i ",
                                  test_seq_abun_tsv,
                                  " -f ", traits_predict,
                                  " --strat_out ",
                                  " -m ", marker_predict,
                                  " -o ", metagenome_out])
        system_call_check(metagenome_cmd)

        metagenome_outfile = path.join(metagenome_out,
                                       "pred_metagenome_unstrat.tsv.gz")

        system_call_check("".join(["pathway_pipeline.py -i ",
                                   metagenome_outfile,
                                   " -o ", temp_dir]))
def test_picrust2_pipeline_script_per_seq_contrib_strat(self):
    '''Run picrust2_pipeline.py end-to-end with both --per_sequence_contrib
    and --stratified set. The test passes as long as the command does not
    fail.'''

    # Options that take no test-specific value, in command-line order.
    fixed_options = (" --max_nsti 1.9",
                     " --min_reads 2",
                     " --min_samples 2",
                     " --skip_minpath",
                     " --no_gap_fill",
                     " --coverage",
                     " --remove_intermediate",
                     " --stratified",
                     " --per_sequence_contrib",
                     " --verbose")

    with TemporaryDirectory() as temp_dir:

        out_dir = path.join(temp_dir, "pipeline_out")

        input_options = "".join(["picrust2_pipeline.py -s ", test_study_seqs,
                                 " -i ", test_seq_abun_tsv,
                                 " -o ", out_dir,
                                 " -r ", test_ref_dir,
                                 " -p 1",
                                 " --custom_trait_tables ", test_known_traits,
                                 " --marker_gene_table ", test_known_marker,
                                 " --reaction_func ", test_known_traits])

        system_call_check(input_options + "".join(fixed_options))
def minpath_wrapper(sample_id, unstrat_input, minpath_map, minpath_outdir,
                    print_opt=False, extra_str=""):
    '''Run MinPath based on gene abundances in a single sample.

    unstrat_input is a table indexed by reaction id with one column per
    sample; only the sample_id column is used. All MinPath input/output
    files are written to minpath_outdir and are named
    "<sample_id><extra_str>_minpath*". Returns the pathways called as
    present, as reported by identify_minpath_present.'''

    # Define MinPath input and output filenames.
    file_prefix = str(sample_id) + extra_str

    minpath_in = path.join(minpath_outdir, file_prefix + "_minpath_in.txt")
    minpath_report = path.join(minpath_outdir,
                               file_prefix + "_minpath_report.txt")
    minpath_details = path.join(minpath_outdir,
                                file_prefix + "_minpath_details.txt")
    minpath_mps = path.join(minpath_outdir, file_prefix + "_minpath.mps")

    # Use a context manager so the handle is closed (and flushed) even if
    # writing a row fails.
    with open(minpath_in, "w") as id_minpath_fh:

        # Loop over all reactions (which are the index labels in unstrat
        # table unless regrouped).
        for reaction_id in unstrat_input.index.values:

            reaction_count = unstrat_input.loc[reaction_id, sample_id]

            # Only reactions with non-zero abundance are written out.
            if reaction_count == 0:
                continue

            id_minpath_fh.write(reaction_id + "\t" + str(reaction_count) +
                                "\n")

    # Run MinPath on this sample.
    path2minpath = path.join(path.dirname(path.abspath(__file__)),
                             'MinPath', 'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
        minpath_map + " -report " + minpath_report +\
        " -details " + minpath_details + " -mps " + minpath_mps

    system_call_check(minpath_cmd, print_out=print_opt)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    return path_present
def castor_hsp_wrapper(tree_path, trait_tab, hsp_method, calc_ci=False,
                       check_input=False, ran_seed=None):
    '''Wrapper for making system calls to the castor_hsp.R Rscript.

    Returns a two-element list: the predicted counts table (DataFrame
    indexed by "sequence") and the confidence-interval table, which is None
    unless calc_ci is set. Raises ValueError if the predicted counts file
    written by the R script cannot be read.'''

    castor_hsp_script = path.join(get_picrust_project_dir(), 'picrust2',
                                  'Rscripts', 'castor_hsp.R')

    # Need to format boolean setting as string for R to read in as argument.
    calc_ci_setting = "TRUE" if calc_ci else "FALSE"
    check_input_setting = "TRUE" if check_input else "FALSE"

    # Create temporary directory for writing output files of castor_hsp.R
    with TemporaryDirectory() as temp_dir:

        output_count_path = path.join(temp_dir, "predicted_counts.txt")
        output_ci_path = path.join(temp_dir, "predicted_ci.txt")

        # ran_seed is passed through str(), so None is sent as "None".
        hsp_cmd = " ".join(["Rscript",
                            castor_hsp_script,
                            tree_path,
                            trait_tab,
                            hsp_method,
                            calc_ci_setting,
                            check_input_setting,
                            output_count_path,
                            output_ci_path,
                            str(ran_seed)])

        # Run castor_hsp.R
        system_call_check(hsp_cmd)

        # Load the output into Table objects. The tables must be read before
        # the temporary directory is removed.
        try:
            asr_table = pd.read_table(filepath_or_buffer=output_count_path,
                                      sep="\t", index_col="sequence")
        except IOError as err:
            # Report the file that actually failed to load (the counts
            # table, not the CI table).
            raise ValueError("Cannot read in expected output file " +
                             output_count_path) from err

        if calc_ci:
            asr_ci_table = pd.read_table(filepath_or_buffer=output_ci_path,
                                         sep="\t", index_col="sequence")
        else:
            asr_ci_table = None

    # Return list with predicted counts and CIs.
    return [asr_table, asr_ci_table]
def test_picrust2_pipeline_script(self):
    '''Run picrust2_pipeline.py end-to-end with custom trait and marker
    tables. The test passes as long as the command does not fail.'''

    with TemporaryDirectory() as temp_dir:

        # (option, value) pairs in command-line order.
        # NOTE(review): "-o" is given twice with the same value - the second
        # occurrence looks redundant; confirm before removing.
        cli_options = (("-s", test_study_seqs),
                       ("-i", test_seq_abun_tsv),
                       ("-o", temp_dir),
                       ("-r", test_msa),
                       ("-t", test_tree),
                       ("--custom_trait_tables", test_known_traits),
                       ("--marker_gene_table", test_known_marker),
                       ("-o", temp_dir))

        pipeline_cmd = "picrust2_pipeline.py"
        for option, value in cli_options:
            pipeline_cmd = pipeline_cmd + " " + option + " " + value

        system_call_check(pipeline_cmd)
def place_seqs_pipeline(study_fasta, ref_dir, out_tree, threads, out_dir,
                        chunk_size, print_cmds):
    '''Place study sequences onto the reference tree.

    Steps: align study sequences against the reference MSA with hmmalign,
    split the combined alignment into reference and study FASTA files, run
    EPA-NG on them, and convert the parsed jplace output into the newick
    file out_tree with gappa.'''

    # Locate the reference MSA, tree, HMM and model files.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # Align study sequences against the reference MSA.
    out_stockholm = path.join(out_dir, "query_align.stockholm")

    hmmalign_cmd = "".join(["hmmalign --trim --dna --mapali ", ref_msa,
                            " --informat FASTA -o ", out_stockholm,
                            " ", hmm,
                            " ", study_fasta])
    system_call_check(hmmalign_cmd, print_out=print_cmds)

    hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

    def _aligned_subset(seq_ids):
        '''Pull the aligned sequence for every id in seq_ids.'''
        return {seq_id: hmmalign_out[seq_id] for seq_id in seq_ids}

    # Work out which aligned sequences are reference and which are study.
    ref_ids = set(read_fasta(ref_msa).keys())
    study_ids = set(read_fasta(study_fasta).keys())

    ref_aligned = _aligned_subset(ref_ids)
    study_aligned = _aligned_subset(study_ids)

    # Write the two halves of the alignment to separate FASTA files.
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")

    write_fasta(ref_aligned, ref_msa_fastafile)
    write_fasta(study_aligned, study_msa_fastafile)

    # Run EPA-NG, which writes its jplace output into epa_out_dir.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    # Convert the parsed jplace file into the final newick tree.
    gappa_jplace_to_newick(
        jplace_file=path.join(epa_out_dir, "epa_result_parsed.jplace"),
        outfile=out_tree,
        print_cmds=print_cmds)
def gappa_jplace_to_newick(jplace_file: str, outfile: str, print_cmds=False):
    '''System call to gappa binary to convert jplace object to newick
    treefile (with specified filename).

    gappa is asked to write into the directory containing jplace_file (the
    current directory if jplace_file has no directory component). The newick
    file is expected to carry the jplace file's name with its extension
    swapped for ".newick", and is then moved to outfile.'''

    # Fall back to "." so that "--out-dir" always receives a value.
    gappa_out_dir = path.dirname(jplace_file) or "."

    # Run gappa to convert jplace to newick.
    system_call_check("gappa analyze graft --jplace-path " + jplace_file +
                      " --fully-resolve --out-dir " + gappa_out_dir,
                      print_out=print_cmds)

    # Expected name of output newick file. Only the file extension is
    # swapped; a ".jplace" substring elsewhere in the path (e.g. in a
    # directory name) is left untouched.
    newick_file = path.splitext(jplace_file)[0] + ".newick"

    # Rename newick file to be specified outfile.
    system_call_check("mv " + newick_file + " " + outfile,
                      print_out=print_cmds)
def run_epa_ng(tree: str, ref_msa_fastafile: str, study_msa_fastafile: str,
               out_dir: str, chunk_size=5000, threads=1, print_cmds=False):
    '''Run EPA-NG on the specified tree, reference MSA, and study sequence
    MSA. out_dir is created first and handed to epa-ng via -w, so the
    .jplace file is output there.'''

    make_output_dir(out_dir)

    # Build the command string piece by piece.
    epa_ng_cmd = "".join(["epa-ng --tree ", tree,
                          " --ref-msa ", ref_msa_fastafile,
                          " --query ", study_msa_fastafile,
                          " --chunk-size ", str(chunk_size),
                          " -T ", str(threads),
                          " -w ", out_dir])

    system_call_check(epa_ng_cmd, print_out=print_cmds)
def test_full_pipeline_biom(self):
    '''Run place_seqs.py, hsp.py (traits and marker), metagenome_pipeline.py
    and run_minpath.py in sequence on a BIOM sequence abundance table. The
    test passes as long as no command fails.'''

    with TemporaryDirectory() as temp_dir:

        out_tree = path.join(temp_dir, "out.tre")

        place_cmd = "".join(["place_seqs.py -s ", test_study_seqs,
                             " -r ", test_msa,
                             " -t ", test_tree,
                             " -o ", out_tree])
        system_call_check(place_cmd)

        hsp_out_prefix = path.join(temp_dir, "hsp_out")
        hsp_out_prefix_marker = path.join(temp_dir, "hsp_out_marker")

        # Hidden-state prediction: trait table first, then the marker table.
        hsp_jobs = ((test_known_traits, hsp_out_prefix),
                    (test_known_marker, hsp_out_prefix_marker))

        for observed_table, out_prefix in hsp_jobs:
            system_call_check("".join(["hsp.py -t ", out_tree,
                                       " --observed_trait_table ",
                                       observed_table,
                                       " -n -c ",
                                       "-o ", out_prefix]))

        # The prefixes already include temp_dir; they are joined onto
        # temp_dir again here exactly as before.
        traits_predict = path.join(temp_dir, hsp_out_prefix + ".tsv")
        marker_predict = path.join(temp_dir, hsp_out_prefix_marker + ".tsv")

        metagenome_out = path.join(temp_dir, "meta_out")

        metagenome_cmd = "".join(["metagenome_pipeline.py -i ",
                                  test_seq_abun_biom,
                                  " -f ", traits_predict,
                                  " -m ", marker_predict,
                                  " -o ", metagenome_out])
        system_call_check(metagenome_cmd)

        metagenome_outfile = path.join(metagenome_out,
                                       "pred_metagenome_strat.tsv")

        minpath_out = path.join(temp_dir, "minpath_out")

        system_call_check("".join(["run_minpath.py -i ", metagenome_outfile,
                                   " -m ", minpath_map,
                                   " -o ", minpath_out]))
def castor_hsp_loocv_wrapper(tree_path, trait_table_path, tips_path,
                             hsp_method, expected_out_path,
                             predicted_out_path, metrics_out_path,
                             num_cores=1):
    '''Call the castor_hsp_loocv.R Rscript, passing every argument through
    positionally. The three *_out_path arguments are handed to the script
    as its output locations; nothing is returned.'''

    loocv_script = path.join(get_picrust_project_dir(), 'picrust2',
                             'Rscripts', 'castor_hsp_loocv.R')

    # Positional arguments in the order the command line is built.
    script_args = (tree_path,
                   trait_table_path,
                   tips_path,
                   hsp_method,
                   expected_out_path,
                   predicted_out_path,
                   metrics_out_path,
                   str(num_cores))

    # Run castor_hsp_loocv.R here
    system_call_check(" ".join(("Rscript", loocv_script) + script_args))
def full_pipeline(study_fasta, input_table, output_folder, processes, ref_dir,
                  in_traits, custom_trait_tables, marker_gene_table,
                  pathway_map, rxn_func, no_pathways, regroup_map, no_regroup,
                  stratified, max_nsti, min_reads, min_samples, hsp_method,
                  min_align, skip_nsti, skip_minpath, no_gap_fill, coverage,
                  per_sequence_contrib, wide_table, skip_norm,
                  remove_intermediate, verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    Runs place_seqs.py, hsp.py (once per function table),
    metagenome_pipeline.py (once per non-marker function) and, unless
    no_pathways is set, pathway_pipeline.py.

    Returns a tuple (func_output, pathway_outfiles). func_output maps each
    function id to [unstratified outfile, stratified outfile or None].
    pathway_outfiles is None when no_pathways is set, otherwise a dict with
    keys "unstrat_abun", "unstrat_cov", "strat_abun" and "strat_cov".

    Exits via sys.exit if an unknown default category is requested or if
    output_folder already exists.'''

    # Throw warning if --per_sequence_contrib set but --stratified unset.
    if per_sequence_contrib and not stratified:
        print("\nThe option --per_sequence_contrib was set, but not the option "
              "--stratified. This means that a stratified pathway table will "
              "be output only (i.e. a stratified metagenome table will NOT "
              "be output).\n", file=sys.stderr)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func + " is not " +
                         "one of the default categories.")

        # Work on a copy: entries are added below (reaction function,
        # marker), and those must not leak into the module-level mapping
        # and affect later calls.
        func_tables = dict(default_tables)

    else:
        # Split paths to input custom trait tables and take the basename to
        # be the function id.
        funcs = []
        func_tables = {}

        for custom in custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

    # Add reaction function to be in set of gene families if it is not already
    # and as long as pathways are also to be predicted.
    if rxn_func not in funcs and not no_pathways:
        orig_rxn_func = rxn_func
        rxn_func = path.splitext(path.basename(rxn_func))[0]
        funcs.append(rxn_func)

        if rxn_func not in func_tables:
            func_tables[rxn_func] = orig_rxn_func

    if not skip_norm:
        # Append marker as well, since this also needs to be run.
        funcs.append("marker")
        func_tables["marker"] = marker_gene_table

    # Check that all input files exist.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)
    files2check = [study_fasta, input_table, ref_msa, tree, hmm,
                   model] + list(func_tables.values())

    if not no_pathways:
        files2check.append(pathway_map)

        # Throw warning if default pathway mapfile used with non-default
        # reference files.
        if pathway_map == default_pathway_map and ref_dir != default_ref_dir:
            print("Warning - non-default reference files specified with "
                  "default pathway mapfile of prokaryote-specific MetaCyc "
                  "pathways (--pathway_map option). This usage may be "
                  "unintended.", file=sys.stderr)

        if not no_regroup:
            files2check.append(regroup_map)

    # This will throw an error if any input files are not found.
    check_files_exist(files2check)

    # Check that sequence names in FASTA overlap with input table.
    check_overlapping_seqs(study_fasta, input_table, verbose)

    if path.exists(output_folder):
        sys.exit("Stopping since output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    if verbose:
        print("Placing sequences onto reference tree", file=sys.stderr)

    # Define folders for intermediate files (unless --remove_intermediate
    # set, in which case empty strings are passed on to the sub-commands).
    if remove_intermediate:
        place_seqs_intermediate = ""
        pathways_intermediate = ""
    else:
        intermediate_dir = path.join(output_folder, "intermediate")
        make_output_dir(intermediate_dir)
        place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
        pathways_intermediate = path.join(intermediate_dir, "pathways")

    # Run place_seqs.py.
    place_seqs_cmd = ["place_seqs.py",
                      "--study_fasta", study_fasta,
                      "--ref_dir", ref_dir,
                      "--out_tree", out_tree,
                      "--processes", str(processes),
                      "--intermediate", place_seqs_intermediate,
                      "--min_align", str(min_align),
                      "--chunk_size", str(5000)]

    if verbose:
        place_seqs_cmd.append("--verbose")

    system_call_check(place_seqs_cmd, print_command=verbose,
                      print_stdout=verbose, print_stderr=True)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree,
              file=sys.stderr)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    if not skip_norm:
        # Make sure marker database is first in the list. This is because
        # this will be run on a single core and so will be easier to
        # identify any errors if the program exits when working on this
        # function type.
        funcs.insert(0, funcs.pop(funcs.index("marker")))

    for func in funcs:

        # NSTI is calculated alongside the marker predictions, or alongside
        # every function when normalization is skipped (no marker run).
        calc_nsti = ((func == "marker" and not skip_nsti) or
                     (skip_norm and not skip_nsti))

        # Change output filename for NSTI and non-NSTI containing files.
        hsp_outfile = path.join(output_folder, func + "_predicted")

        if calc_nsti:
            hsp_outfile = hsp_outfile + "_and_nsti.tsv.gz"
        else:
            hsp_outfile = hsp_outfile + ".tsv.gz"

        # Keep track of output filename for next step of pipeline.
        predicted_funcs[func] = hsp_outfile

        # Run hsp.py for each function database.
        hsp_cmd = ["hsp.py",
                   "--tree", out_tree,
                   "--output", hsp_outfile,
                   "--observed_trait_table", func_tables[func],
                   "--hsp_method", hsp_method,
                   "--seed", "100"]

        # Add flags to command if specified.
        if calc_nsti:
            hsp_cmd.append("--calculate_NSTI")

        # Run marker on only 1 processor.
        if func == "marker":
            hsp_cmd += ["--processes", "1"]
        else:
            hsp_cmd += ["--processes", str(processes)]

        if verbose:
            hsp_cmd.append("--verbose")

        system_call_check(hsp_cmd, print_command=verbose,
                          print_stdout=verbose, print_stderr=True)

    # Now run metagenome pipeline commands.
    # Initialize dictionary of function names --> metagenome output files.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func, file=sys.stderr)

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        metagenome_pipeline_cmd = ["metagenome_pipeline.py",
                                   "--input", input_table,
                                   "--function", predicted_funcs[func],
                                   "--min_reads", str(min_reads),
                                   "--min_samples", str(min_samples),
                                   "--out_dir", func_output_dir]

        # Two-element list as value for each function: first value is the
        # unstratified output and second is the stratified output (None
        # unless --stratified was set).
        func_output[func] = [path.join(func_output_dir,
                                       "pred_metagenome_unstrat.tsv.gz"),
                             None]

        if wide_table:
            metagenome_pipeline_cmd.append("--wide_table")

        if not skip_nsti:
            metagenome_pipeline_cmd += ["--max_nsti", str(max_nsti)]

        if skip_norm:
            metagenome_pipeline_cmd.append("--skip_norm")
        else:
            metagenome_pipeline_cmd += ["--marker", predicted_funcs["marker"]]

        if stratified:
            metagenome_pipeline_cmd.append("--strat_out")

            if wide_table:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_strat.tsv.gz")
            else:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_contrib.tsv.gz")

        system_call_check(metagenome_pipeline_cmd, print_command=verbose,
                          print_stdout=verbose, print_stderr=True)

    # Now infer pathway abundances and coverages unless --no_pathways set.
    pathway_outfiles = None

    if not no_pathways:

        path_output_dir = path.join(output_folder, "pathways_out")

        if verbose:
            # Sent to stderr like every other progress message here.
            print("Inferring pathways from predicted " + rxn_func,
                  file=sys.stderr)

        # Determine whether stratified or unstratified table should be input.
        if not stratified or per_sequence_contrib:
            rxn_input_metagenome = func_output[rxn_func][0]
        else:
            rxn_input_metagenome = func_output[rxn_func][1]

        pathway_pipeline_cmd = ["pathway_pipeline.py",
                                "--input", rxn_input_metagenome,
                                "--out_dir", path_output_dir,
                                "--map", pathway_map,
                                "--intermediate", pathways_intermediate,
                                "--proc", str(processes)]

        if no_gap_fill:
            pathway_pipeline_cmd.append("--no_gap_fill")

        if skip_minpath:
            pathway_pipeline_cmd.append("--skip_minpath")

        if coverage:
            pathway_pipeline_cmd.append("--coverage")

        if no_regroup:
            pathway_pipeline_cmd.append("--no_regroup")
        else:
            pathway_pipeline_cmd += ["--regroup_map", regroup_map]

        if wide_table:
            pathway_pipeline_cmd.append("--wide_table")

        if per_sequence_contrib:

            pathway_pipeline_cmd.append("--per_sequence_contrib")

            # Use the raw input table when normalization was skipped,
            # otherwise the normalized table from the metagenome step.
            if skip_norm:
                norm_sequence_abun = input_table
            else:
                norm_sequence_abun = path.join(output_folder,
                                               rxn_func + "_metagenome_out",
                                               "seqtab_norm.tsv.gz")

            pathway_pipeline_cmd += ["--per_sequence_abun",
                                     norm_sequence_abun]

            pathway_pipeline_cmd += ["--per_sequence_function",
                                     predicted_funcs[rxn_func]]

        if verbose:
            pathway_pipeline_cmd.append("--verbose")

        system_call_check(pathway_pipeline_cmd, print_command=verbose,
                          print_stdout=False, print_stderr=True)

        if verbose:
            print("Wrote predicted pathway abundances and coverages to " +
                  path_output_dir, file=sys.stderr)

        # Keep track of output filenames if this function is being used in
        # a non-default way (e.g. with a QIIME2 plugin).
        pathway_outfiles = {
            "unstrat_abun": path.join(path_output_dir,
                                      "path_abun_unstrat.tsv.gz"),
            "unstrat_cov": path.join(path_output_dir,
                                     "path_cov_unstrat.tsv.gz"),
            "strat_abun": None,
            "strat_cov": None,
        }

        if stratified or per_sequence_contrib:

            # Wide tables use the "strat" suffix, long tables "contrib".
            strat_suffix = "strat" if wide_table else "contrib"

            pathway_outfiles["strat_abun"] = path.join(
                path_output_dir, "path_abun_" + strat_suffix + ".tsv.gz")

            if per_sequence_contrib:
                pathway_outfiles["strat_cov"] = path.join(
                    path_output_dir, "path_cov_" + strat_suffix + ".tsv.gz")

    return (func_output, pathway_outfiles)
def place_seqs_pipeline(study_fasta, ref_dir, out_tree, threads, out_dir,
                        min_align, chunk_size, verbose):
    '''Place study sequences onto the reference tree.

    Steps: align study sequences against the reference MSA with hmmalign,
    screen the study alignments with check_alignments (using min_align),
    write reference and study alignments to separate FASTA files, run
    EPA-ng on them, and convert the parsed JPLACE output into the newick
    file out_tree with gappa. Exits via sys.exit if the study FASTA path
    contains a space.'''

    # The hmmalign command is built as a plain string, so a space in the
    # study FASTA filepath cannot be supported.
    if " " in study_fasta:
        sys.exit("Stopping - remove the space from the input FASTA filepath.")

    # Locate the reference MSA, tree, HMM and model files.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # Align study sequences against the reference MSA.
    out_stockholm = path.join(out_dir, "query_align.stockholm")

    hmmalign_cmd = "".join(["hmmalign --trim --dna --mapali ", ref_msa,
                            " --informat FASTA -o ", out_stockholm,
                            " ", hmm,
                            " ", study_fasta])

    system_call_check(hmmalign_cmd,
                      print_command=verbose,
                      print_stdout=verbose,
                      print_stderr=verbose)

    hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

    # Reference ids come from the reference MSA; the raw study sequences are
    # needed by check_alignments.
    ref_ids = set(read_fasta(ref_msa).keys())
    study_seqs = read_fasta(study_fasta)

    ref_aligned = {seq_id: hmmalign_out[seq_id] for seq_id in ref_ids}

    study_aligned = check_alignments(raw_seqs=study_seqs,
                                     aligned_seqs=hmmalign_out,
                                     min_align=min_align,
                                     verbose=verbose)

    # Write the two halves of the alignment to separate FASTA files.
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")

    write_fasta(ref_aligned, ref_msa_fastafile)
    write_fasta(study_aligned, study_msa_fastafile)

    # Run EPA-ng to place input sequences and output JPLACE file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=verbose)

    # Convert the parsed JPLACE file into the final newick tree.
    gappa_jplace_to_newick(
        jplace_file=path.join(epa_out_dir, "epa_result_parsed.jplace"),
        outfile=out_tree,
        print_cmds=verbose)
def place_seqs_pipeline(study_fasta, ref_msa, tree, hmm, out_tree,
                        alignment_tool, threads, out_dir, chunk_size,
                        print_cmds):
    '''Full pipeline for running sequence placement.

    Study sequences are aligned against the reference MSA with either
    hmmalign or PaPaRa (alignment_tool must be "hmmalign" or "papara"), the
    combined alignment is split into reference and study FASTA files,
    EPA-NG is run on them and the resulting jplace file is converted to the
    newick file out_tree with gappa.

    Raises ValueError if alignment_tool is not one of the supported
    tools.'''

    # Fail early with a clear message instead of hitting an unbound
    # variable after the alignment step was silently skipped.
    if alignment_tool not in ("hmmalign", "papara"):
        raise ValueError("Unsupported alignment tool \"" +
                         str(alignment_tool) + "\" - must be \"hmmalign\" "
                         "or \"papara\".")

    if alignment_tool == "hmmalign":

        out_stockholm = path.join(out_dir, "query_align.stockholm")

        system_call_check("hmmalign --trim --dna --mapali " +
                          ref_msa + " --informat FASTA -o " +
                          out_stockholm + " " + hmm + " " + study_fasta,
                          print_out=print_cmds)

        hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

        ref_seqnames = set(read_fasta(ref_msa).keys())
        study_seqnames = set(read_fasta(study_fasta).keys())

        ref_hmmalign_subset = {seq: hmmalign_out[seq]
                               for seq in ref_seqnames}
        study_hmmalign_subset = {seq: hmmalign_out[seq]
                                 for seq in study_seqnames}

        write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
        write_fasta(study_hmmalign_subset, study_msa_fastafile)

    elif alignment_tool == "papara":

        # Read in ref seqs FASTA as a dict (ref_msa is a dict from here on).
        ref_msa = read_fasta(ref_msa)

        # Run PaPaRa to place study sequences and read in Phylip file.
        papara_out = run_papara(tree=tree, ref_msa=ref_msa,
                                study_fasta=study_fasta, out_dir=out_dir,
                                threads=threads, print_cmds=print_cmds)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_papara.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_papara.fasta")

        # Split PaPaRa output into two FASTA files containing study and
        # reference sequences respectively.
        split_ref_study_papara(papara_out=papara_out,
                               ref_seqnames=set(ref_msa.keys()),
                               study_fasta=study_msa_fastafile,
                               ref_fasta=ref_msa_fastafile)

    # Run EPA-NG to output .jplace file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree, ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               chunk_size=chunk_size, threads=threads, out_dir=epa_out_dir,
               print_cmds=print_cmds)

    jplace_outfile = path.join(epa_out_dir, "epa_result.jplace")

    gappa_jplace_to_newick(jplace_file=jplace_outfile, outfile=out_tree,
                           print_cmds=print_cmds)
def custom_tree_pipeline(table: biom.Table,
                         tree: skbio.TreeNode,
                         threads: int = 1,
                         hsp_method: str = "mp",
                         max_nsti: float = 2.0,
                         edge_exponent: float = 0.5,
                         skip_minpath: bool = False,
                         no_gap_fill: bool = False,
                         skip_norm: bool = False,
                         highly_verbose: bool = False) -> (biom.Table,
                                                           biom.Table,
                                                           biom.Table):
    '''Run hsp.py, metagenome_pipeline.py and pathway_pipeline.py on an
    input table and an already-built tree, inside a temporary directory.

    Returns the unstratified KO, EC and pathway abundance tables (in that
    order) loaded as BIOM tables.'''

    # Everything is written to a temporary directory so that no files are
    # saved locally.
    with TemporaryDirectory() as temp_dir:

        # The pipeline scripts read from disk, so write out the BIOM table
        # and the newick tree first.
        biom_infile = path.join(temp_dir, "intable.biom")
        with biom.util.biom_open(biom_infile, 'w') as out_biom:
            table.to_hdf5(h5grp=out_biom,
                          generated_by="PICRUSt2 QIIME 2 Plugin")

        newick_infile = path.join(temp_dir, "placed_seqs.tre")
        tree.write(newick_infile, format="newick")

        picrust2_out = path.join(temp_dir, "picrust2_out")

        print("Running the below commands:", file=sys.stderr)

        # Flag appended to every command that supports --verbose.
        verbose_flag = " --verbose" if highly_verbose else ""

        def _run(cmd):
            '''Run one command, always echoing it and its stderr.'''
            system_call_check(cmd, print_command=True,
                              print_stdout=highly_verbose, print_stderr=True)

        def _hsp_cmd(trait, proc_option, outfile):
            '''Build the hsp.py command string for one trait table.'''
            return "".join(["hsp.py -i ", trait, " ",
                            " -t ", newick_infile,
                            proc_option,
                            " -n ",
                            " -o ", outfile,
                            " -m ", hsp_method,
                            " -e ", str(edge_exponent),
                            verbose_flag])

        def _metagenome_cmd(hsp_outfile, out_dir):
            '''Build the metagenome_pipeline.py command for one function.'''
            if skip_norm:
                norm_option = " --skip_norm"
            else:
                norm_option = " -m " + hsp_out_16S
            return "".join(["metagenome_pipeline.py -i ", biom_infile,
                            " -f ", hsp_outfile,
                            " -o ", out_dir,
                            " --max_nsti ", str(max_nsti),
                            norm_option])

        # Hidden-state prediction step (on 16S, EC, and KO tables
        # separately). 16S is always given a single process.
        hsp_out_16S = path.join(picrust2_out, "16S_predicted.tsv.gz")
        hsp_out_EC = path.join(picrust2_out, "EC_predicted.tsv.gz")
        hsp_out_KO = path.join(picrust2_out, "KO_predicted.tsv.gz")

        thread_option = " -p " + str(threads)

        hsp_out_16S_cmd = _hsp_cmd("16S", " -p 1 ", hsp_out_16S)
        hsp_out_EC_cmd = _hsp_cmd("EC", thread_option, hsp_out_EC)
        hsp_out_KO_cmd = _hsp_cmd("KO", thread_option, hsp_out_KO)

        # 16S predictions are only needed for normalization.
        if not skip_norm:
            _run(hsp_out_16S_cmd)

        _run(hsp_out_EC_cmd)
        _run(hsp_out_KO_cmd)

        # Metagenome pipeline step.
        EC_metagenome_out = path.join(picrust2_out, "EC_metagenome_out")
        KO_metagenome_out = path.join(picrust2_out, "KO_metagenome_out")

        _run(_metagenome_cmd(hsp_out_EC, EC_metagenome_out))
        _run(_metagenome_cmd(hsp_out_KO, KO_metagenome_out))

        EC_out = path.join(EC_metagenome_out,
                           "pred_metagenome_unstrat.tsv.gz")
        KO_out = path.join(KO_metagenome_out,
                           "pred_metagenome_unstrat.tsv.gz")

        # Pathway inference step (run on the EC predictions).
        pathways_out = path.join(picrust2_out, "pathways_out")
        pathabun_out = path.join(pathways_out, "path_abun_unstrat.tsv.gz")

        pathway_options = [" -p " + str(threads)]

        if skip_minpath:
            pathway_options.append(" --skip_minpath")

        if no_gap_fill:
            pathway_options.append(" --no_gap_fill")

        pathway_options.append(verbose_flag)

        _run("pathway_pipeline.py -i " + EC_out +
             " -o " + pathways_out +
             "".join(pathway_options))

        # Load the unstratified output tables while the temporary directory
        # still exists and return them as BIOM objects.
        ko_biom = biom.load_table(KO_out)
        ec_biom = biom.load_table(EC_out)
        pathabun_biom = biom.load_table(pathabun_out)

        return ko_biom, ec_biom, pathabun_biom
def minpath_wrapper(sample_id, strat_input, minpath_map, out_dir,
                    print_opt=False):
    '''Run MinPath on the gene family abundances of a single sample and
    compute pathway abundances from the result.

    sample_id -- name of the sample; used as the column label that is read
                 from the input tables and as prefix of all MinPath files.
    strat_input -- stratified gene family table; it is collapsed with
                   strat_to_unstrat_counts and also passed on to
                   path_abun_by_seq.
    minpath_map -- path passed to MinPath's "-map" option.
    out_dir -- directory in which the MinPath input/output files are
               written (it must already exist).
    print_opt -- forwarded to system_call_check as print_out.

    Returns a list of two pandas Series: the unstratified pathway
    abundances (indexed by pathway) and the stratified pathway abundances
    (indexed by pathway and sequence).'''

    # Get gene family abundances summed over all sequences for this sample.
    unstrat_input = strat_to_unstrat_counts(strat_input)

    # Define MinPath input and output filenames.
    minpath_in = path.join(out_dir, sample_id + "_minpath_in.txt")
    minpath_report = path.join(out_dir, sample_id + "_minpath_report.txt")
    minpath_details = path.join(out_dir, sample_id + "_minpath_details.txt")
    minpath_mps = path.join(out_dir, sample_id + "_minpath.mps")
    minpath_stdout = path.join(out_dir, sample_id + "_minpath_out.txt")

    # Write out each function (the index labels of the unstratified table)
    # along with its count in this sample, skipping functions with a count
    # of 0. The context manager closes the file even if writing fails.
    with open(minpath_in, "w") as id_minpath_fh:
        for func_id in unstrat_input.index.values:
            func_count = unstrat_input.loc[func_id, sample_id]

            if func_count == 0:
                continue

            id_minpath_fh.write(func_id + "\t" + str(func_count) + "\n")

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'MinPath',
                             'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
        minpath_map + " -report " + minpath_report +\
        " -details " + minpath_details + " -mps " + minpath_mps

    # MinPath's stdout is captured in a file, which is closed again as soon
    # as the call returns.
    with open(minpath_stdout, "w") as minpath_output:
        system_call_check(minpath_cmd, print_out=print_opt,
                          stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Now read in details file to get the gene family abundances and ids
    # for each pathway that is present.
    gf_abundances, gf_ids = parse_minpath_details(minpath_details,
                                                  path_present)

    # Initialize series and dataframe that will contain pathway abundances.
    # The dtype is given explicitly since the default dtype of an empty
    # Series differs between pandas versions.
    unstrat_abun = pd.Series(dtype="float64")

    empty_strat = pd.DataFrame(columns=["pathway", "sequence", sample_id])
    empty_strat = empty_strat.set_index(["pathway", "sequence"])

    # Per-pathway tables are collected here and concatenated once at the
    # end. The empty table comes first so that the result is well-defined
    # even when no pathway is present.
    strat_tables = [empty_strat]

    # Loop through all pathways present and get mean of 1/2 most abundant.
    for pathway in gf_abundances.keys():

        # Like HUMAnN2, sort enzyme reactions, take second half, and get
        # their mean abundance.

        # First get indices of sorted list.
        sorted_index = list(np.argsort(gf_abundances[pathway]))
        sorted_gf_abundances = [gf_abundances[pathway][i]
                                for i in sorted_index]
        sorted_gf_ids = [gf_ids[pathway][i] for i in sorted_index]

        # Take second half of gene family abundances and ids lists.
        half_i = len(sorted_gf_abundances) // 2
        gf_abundances_subset = sorted_gf_abundances[half_i:]
        gf_ids_subset = sorted_gf_ids[half_i:]

        # Take mean for unstratified pathway abundance.
        subset_total = sum(gf_abundances_subset)
        unstrat_abun[pathway] = subset_total / len(gf_abundances_subset)

        # Get stratified pathway abundances by sequences.
        strat_path_abun = path_abun_by_seq(strat_input,
                                           gf_ids_subset,
                                           subset_total,
                                           unstrat_abun[pathway])

        # Remove rows with an abundance of 0. The filtered table has to be
        # assigned back (a bare indexing expression has no effect); the
        # copy avoids modifying a view of the helper's output below.
        strat_path_abun = strat_path_abun.loc[
            strat_path_abun[sample_id] > 0].copy()

        # Add pathway as an additional index level and put it first.
        strat_path_abun["pathway"] = [pathway] * strat_path_abun.shape[0]
        strat_path_abun.set_index("pathway", append=True, inplace=True)
        strat_path_abun = strat_path_abun.reorder_levels(["pathway",
                                                          "sequence"])

        strat_tables.append(strat_path_abun)

    # Note: the "levels" argument of pd.concat only applies together with
    # "keys", so it is not passed here; the index level names are already
    # set on each table.
    strat_abun = pd.concat(strat_tables)

    # Return unstratified and stratified abundances.
    # Note that the stratified abundances are converted to a series.
    return([unstrat_abun, strat_abun[sample_id]])
def castor_hsp_wrapper(tree_path, trait_tab, hsp_method, calc_ci=False,
                       check_input=False, ran_seed=None, verbose=False):
    '''Wrapper for making system calls to the castor_hsp.R Rscript.

    tree_path, trait_tab and hsp_method are passed to the R script as
    command-line arguments, followed by the calc_ci and check_input
    settings, the two output paths and the random seed (passed as
    str(ran_seed), i.e. "None" when no seed is given).

    Returns a list [asr_table, asr_ci_table] of pandas DataFrames indexed by
    the "sequence" column. asr_ci_table is None unless calc_ci is set.

    Raises ValueError if the predicted counts file cannot be read.'''

    castor_hsp_script = path.join(path.dirname(path.abspath(__file__)),
                                  'Rscripts', 'castor_hsp.R')

    # Need to format boolean settings as strings for R to read in as
    # arguments.
    calc_ci_setting = "TRUE" if calc_ci else "FALSE"
    check_input_setting = "TRUE" if check_input else "FALSE"

    # Create temporary directory for writing output files of castor_hsp.R
    with TemporaryDirectory() as temp_dir:

        output_count_path = path.join(temp_dir, "predicted_counts.txt")
        output_ci_path = path.join(temp_dir, "predicted_ci.txt")

        hsp_cmd = " ".join(["Rscript",
                            castor_hsp_script,
                            tree_path,
                            trait_tab,
                            hsp_method,
                            calc_ci_setting,
                            check_input_setting,
                            output_count_path,
                            output_ci_path,
                            str(ran_seed)])

        # Run castor_hsp.R
        system_call_check(hsp_cmd, print_command=verbose,
                          print_stdout=verbose, print_stderr=verbose)

        # Load the predicted counts. The files must be read before the
        # temporary directory is removed.
        try:
            asr_table = pd.read_csv(filepath_or_buffer=output_count_path,
                                    sep="\t", dtype={'sequence': str})
        except IOError as err:
            # Report the file that actually failed to load (the counts
            # table), and keep the underlying error as the cause.
            raise ValueError("Cannot read in expected output file " +
                             output_count_path) from err

        asr_table.set_index('sequence', drop=True, inplace=True)

        # The CI table is only read when it was requested.
        if calc_ci:
            asr_ci_table = pd.read_csv(filepath_or_buffer=output_ci_path,
                                       sep="\t", dtype={'sequence': str})
            asr_ci_table.set_index('sequence', drop=True, inplace=True)
        else:
            asr_ci_table = None

    # Return list with predicted counts and CIs.
    return [asr_table, asr_ci_table]
def custom_tree_pipeline(
        table: biom.Table,
        tree: skbio.TreeNode,
        threads: int = 1,
        hsp_method: str = "mp",
        max_nsti: float = 2.0) -> (biom.Table, biom.Table, biom.Table):
    '''Run hsp.py, metagenome_pipeline.py and pathway_pipeline.py on the
    given table and tree and return the unstratified KO, EC and pathway
    abundance tables (in that order) loaded as BIOM tables.

    All intermediate files live in a temporary directory that is removed
    when the function exits.'''

    func_types = ("EC", "KO")

    def run_step(*cmd_parts):
        '''Glue the command fragments together and run the command.'''
        system_call_check("".join(cmd_parts), print_out=True)

    # Work in a temporary directory so that nothing is saved locally.
    with TemporaryDirectory() as temp_dir:

        # The pipeline scripts read their input from disk, so write out the
        # BIOM table ...
        biom_infile = path.join(temp_dir, "intable.biom")
        with biom.util.biom_open(biom_infile, 'w') as out_biom:
            table.to_hdf5(h5grp=out_biom,
                          generated_by="PICRUSt2 QIIME2 Plugin")

        # ... and the newick tree.
        newick_infile = path.join(temp_dir, "placed_seqs.tre")
        tree.write(newick_infile, format="newick")

        picrust2_out = path.join(temp_dir, "picrust2_out")

        print("Running the below commands:", file=sys.stderr)

        # Hidden-state prediction (16S, EC and KO tables separately).
        predicted = {trait: path.join(picrust2_out,
                                      trait + "_predicted.tsv.gz")
                     for trait in ("16S",) + func_types}

        run_step("hsp.py -i 16S ",
                 " -t ", newick_infile,
                 " -p 1 ",
                 " -n ",
                 "-o ", predicted["16S"],
                 " -m ", hsp_method)

        for func in func_types:
            run_step("hsp.py -i ", func, " ",
                     " -t ", newick_infile,
                     " -p ", str(threads),
                     " -o ", predicted[func],
                     " -m ", hsp_method)

        # Metagenome pipeline step.
        metagenome_dir = {}
        for func in func_types:
            metagenome_dir[func] = path.join(picrust2_out,
                                             func + "_metagenome_out")
            run_step("metagenome_pipeline.py -i ", biom_infile,
                     " -m ", predicted["16S"],
                     " -f ", predicted[func],
                     " -o ", metagenome_dir[func],
                     " --max_nsti ", str(max_nsti))

        ec_unstrat = path.join(metagenome_dir["EC"],
                               "pred_metagenome_unstrat.tsv.gz")
        ko_unstrat = path.join(metagenome_dir["KO"],
                               "pred_metagenome_unstrat.tsv.gz")

        # Pathway inference step (based on the EC predictions).
        pathways_out = path.join(picrust2_out, "pathways_out")
        pathabun_out = path.join(pathways_out, "path_abun_unstrat.tsv.gz")

        run_step("pathway_pipeline.py -i ", ec_unstrat,
                 " -o ", pathways_out,
                 " -p ", str(threads))

        # Load the unstratified output tables while the temporary directory
        # still exists.
        ko_biom = biom.load_table(ko_unstrat)
        ec_biom = biom.load_table(ec_unstrat)
        pathabun_biom = biom.load_table(pathabun_out)

        return ko_biom, ec_biom, pathabun_biom
def minpath_wrapper(sample_id, unstrat_input, minpath_map, out_dir,
                    print_opt=False, extra_str=""):
    '''Run MinPath based on the reaction abundances in a single sample.

    sample_id -- name of the sample; used as the column label read from
                 unstrat_input and as prefix of all MinPath files.
    unstrat_input -- table whose index labels are reaction ids and which has
                     a column named sample_id.
    minpath_map -- path passed to MinPath's "-map" option.
    out_dir -- output directory; MinPath's intermediate files go into its
               "minpath_running" subdirectory, while the captured stdout of
               MinPath is written to out_dir itself.
    print_opt -- forwarded to system_call_check as print_out.
    extra_str -- string inserted after sample_id in the names of the
                 intermediate files (not in the name of the stdout file).

    Returns a tuple (path_present, reaction_abun): the result of
    identify_minpath_present for the MinPath report, and a dictionary
    mapping each reaction id with a non-zero count to that count.'''

    # Make output directory for MinPath intermediate files.
    running_dir = path.join(out_dir, "minpath_running")
    make_output_dir(running_dir)

    # Define MinPath input and output filenames.
    file_prefix = sample_id + extra_str
    minpath_in = path.join(running_dir, file_prefix + "_minpath_in.txt")
    minpath_report = path.join(running_dir,
                               file_prefix + "_minpath_report.txt")
    minpath_details = path.join(running_dir,
                                file_prefix + "_minpath_details.txt")
    minpath_mps = path.join(running_dir, file_prefix + "_minpath.mps")
    minpath_stdout = path.join(out_dir, sample_id + "_minpath_out.txt")

    # Initialize dictionary for keeping track of reaction abundances.
    reaction_abun = defaultdict(int)

    # Loop over all reactions (which are the index labels in unstrat table
    # unless regrouped) and write out each one with a non-zero count. The
    # context manager closes the file even if writing fails.
    with open(minpath_in, "w") as id_minpath_fh:
        for reaction_id in unstrat_input.index.values:
            reaction_count = unstrat_input.loc[reaction_id, sample_id]

            # If 0 then skip.
            if reaction_count == 0:
                continue

            id_minpath_fh.write(reaction_id + "\t" + str(reaction_count) +
                                "\n")

            reaction_abun[reaction_id] = reaction_count

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'picrust2',
                             'MinPath', 'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
        minpath_map + " -report " + minpath_report +\
        " -details " + minpath_details + " -mps " + minpath_mps

    # MinPath's stdout is captured in a file, which is closed again as soon
    # as the call returns.
    with open(minpath_stdout, "w") as minpath_output:
        system_call_check(minpath_cmd, print_out=print_opt,
                          stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Return which pathways are present and the abundances of all
    # reactions.
    return (path_present, reaction_abun)