def castor_nsti(tree_path, known_tips):
    '''Compute NSTI values by running the castor_nsti.R Rscript.

    The distance from each study sequence to the closest reference
    sequence is calculated by the R script; this function writes the
    script's input, runs it, and loads the table it produces.

    Parameters:
        tree_path: path to the treefile passed to the R script.
        known_tips: the reference genome ids (i.e. the rownames in the
            trait table). Must provide a ``tofile`` method; the callers
            are expected to pass a numpy.ndarray.

    Returns:
        pandas DataFrame read from the script's output file, indexed by
        its "sequence" column.
    '''

    r_script = path.join(get_picrust_project_dir(), 'picrust2', 'Rscripts',
                         'castor_nsti.R')

    # Everything the R script reads or writes lives in a scratch directory
    # that is removed once the results have been loaded.
    with TemporaryDirectory() as work_dir:

        tips_file = path.join(work_dir, "known_tips.txt")
        nsti_file = path.join(work_dir, "nsti_out.txt")

        # One known tip name per line.
        known_tips.tofile(tips_file, sep="\n")

        # Run Rscript.
        cmd_parts = ["Rscript", r_script, tree_path, tips_file, nsti_file]
        system_call_check(" ".join(cmd_parts))

        # Read in calculated NSTI values.
        nsti_out = pd.read_table(nsti_file, sep="\t", index_col="sequence")

    # NOTE(review): the exception below is constructed but never raised, so
    # this check currently has no effect. Before adding ``raise``, confirm
    # what the expected row count is: the comparison uses the number of
    # reference tips, while the table is described as holding one value per
    # study sequence.
    if nsti_out.shape[0] != len(known_tips):
        ValueError("Number of rows in returned NSTI table is incorrect.")

    return nsti_out
def castor_hsp_wrapper(tree_path, trait_tab, hsp_method, calc_ci=False,
                       check_input=False, ran_seed=None):
    '''Wrapper for making system calls to the castor_hsp.R Rscript.

    Parameters:
        tree_path: path to the tree file passed to the R script.
        trait_tab: path to the trait table passed to the R script.
        hsp_method: name of the HSP method, passed through unchanged.
        calc_ci: if True, ask the script for confidence intervals and
            read them back in.
        check_input: if True, ask the script to check its input.
        ran_seed: random seed; passed to the script as ``str(ran_seed)``
            (so ``None`` is sent as the string "None").

    Returns:
        List ``[asr_table, asr_ci_table]`` of pandas DataFrames indexed by
        the "sequence" column. ``asr_ci_table`` is None unless ``calc_ci``
        is set.

    Raises:
        ValueError: if the predicted counts file written by the script
            cannot be read.
    '''

    castor_hsp_script = path.join(get_picrust_project_dir(), 'picrust2',
                                  'Rscripts', 'castor_hsp.R')

    # Need to format boolean settings as strings for R to read in as
    # arguments.
    calc_ci_setting = "TRUE" if calc_ci else "FALSE"
    check_input_setting = "TRUE" if check_input else "FALSE"

    # Create temporary directory for writing output files of castor_hsp.R
    with TemporaryDirectory() as temp_dir:

        output_count_path = path.join(temp_dir, "predicted_counts.txt")
        output_ci_path = path.join(temp_dir, "predicted_ci.txt")

        hsp_cmd = " ".join(["Rscript",
                            castor_hsp_script,
                            tree_path,
                            trait_tab,
                            hsp_method,
                            calc_ci_setting,
                            check_input_setting,
                            output_count_path,
                            output_ci_path,
                            str(ran_seed)])

        # Run castor_hsp.R
        system_call_check(hsp_cmd)

        # Load the output into Table objects. The tables must be read before
        # the temporary directory is cleaned up.
        try:
            asr_table = pd.read_table(filepath_or_buffer=output_count_path,
                                      sep="\t", index_col="sequence")
        except IOError as err:
            # Report the file that actually failed to load (the counts
            # table, not the CI table) and keep the underlying cause.
            raise ValueError("Cannot read in expected output file " +
                             output_count_path) from err

        if calc_ci:
            asr_ci_table = pd.read_table(filepath_or_buffer=output_ci_path,
                                         sep="\t", index_col="sequence")
        else:
            asr_ci_table = None

    # Return list with predicted counts and CIs.
    return [asr_table, asr_ci_table]
def castor_hsp_loocv_wrapper(tree_path, trait_table_path, tips_path,
                             hsp_method, expected_out_path,
                             predicted_out_path, metrics_out_path,
                             num_cores=1):
    '''Run the castor_hsp_loocv.R Rscript.

    All arguments are forwarded to the script on the command line in the
    order they appear in the signature; ``num_cores`` is converted to a
    string first. The three ``*_out_path`` arguments tell the script where
    to write its result tables. Nothing is returned.
    '''

    loocv_script = path.join(get_picrust_project_dir(), 'picrust2',
                             'Rscripts', 'castor_hsp_loocv.R')

    script_args = (tree_path,
                   trait_table_path,
                   tips_path,
                   hsp_method,
                   expected_out_path,
                   predicted_out_path,
                   metrics_out_path,
                   str(num_cores))

    # Run castor_hsp_loocv.R here
    system_call_check(" ".join(("Rscript", loocv_script) + script_args))
#!/usr/bin/env python __copyright__ = "Copyright 2018, The PICRUSt Project" __license__ = "GPL" __version__ = "2.0.0-b.3" import unittest from os import path from tempfile import TemporaryDirectory from picrust2.util import get_picrust_project_dir, read_phylip, read_fasta from picrust2.place_seqs import (place_seqs_pipeline, run_papara, split_ref_study_papara, run_epa_ng, gappa_jplace_to_newick) # Set paths to test files. test_dir_path = path.join(get_picrust_project_dir(), "tests") test_study_seqs = path.join(test_dir_path, "test_data", "place_seqs", "study_seqs_test.fasta") test_tree = path.join(test_dir_path, "test_data", "place_seqs", "img_centroid_16S_aligned_head30.tre") test_msa = path.join(test_dir_path, "test_data", "place_seqs", "img_centroid_16S_aligned_head30.fna") exp_papara_phylip = path.join(test_dir_path, "test_data", "place_seqs", "place_seqs_output", "place_seqs_working", "papara_alignment.out") exp_study_fasta = path.join(test_dir_path, "test_data", "place_seqs",
#!/usr/bin/env python __copyright__ = "Copyright 2018, The PICRUSt Project" __license__ = "GPL" __version__ = "2.0.0-b.7" from picrust2.util import get_picrust_project_dir from os import path # Default support files packaged with PICRUSt2. project_dir = get_picrust_project_dir() default_fasta = path.join(project_dir, "default_files", "prokaryotic", "reference.fna") default_tree = path.join(project_dir, "default_files", "prokaryotic", "reference.tre") default_regroup_map = path.join(project_dir, "default_files", "pathway_mapfiles", "ec_level4_to_metacyc_rxn.tsv") default_pathway_map = path.join(project_dir, "default_files", "pathway_mapfiles", "metacyc_path2rxn_struc_filt_pro.txt") # Inititalize default trait table files for hsp.py. prokaryotic_dir = path.join(project_dir, "default_files", "prokaryotic") default_tables = { "16S": path.join(prokaryotic_dir, "16S.txt.gz"),
def minpath_wrapper(sample_id, strat_input, minpath_map, out_dir,
                    print_opt=False):
    '''Run MinPath on one sample and compute pathway abundances.

    The gene family abundances for ``sample_id`` are written to a MinPath
    input file in ``out_dir``, MinPath is run, and the abundance of each
    pathway reported as present is taken to be the mean of the top half
    most abundant gene families in that pathway.

    Parameters:
        sample_id: sample to process; used as the column label in the
            abundance tables and as the prefix of all files written.
        strat_input: stratified gene family table (passed to
            strat_to_unstrat_counts and path_abun_by_seq).
        minpath_map: path to the mapfile given to MinPath with ``-map``.
        out_dir: directory where MinPath input/output files are written.
        print_opt: forwarded to system_call_check as ``print_out``.

    Returns:
        List ``[unstrat_abun, strat_abun]``: a pandas Series of
        unstratified abundances labelled by pathway, and the ``sample_id``
        column of the stratified table (a Series indexed by pathway and
        sequence).
    '''

    # Get gene family abundances summed over all sequences for this sample.
    unstrat_input = strat_to_unstrat_counts(strat_input)

    # Define MinPath input and output filenames.
    minpath_in = path.join(out_dir, sample_id + "_minpath_in.txt")
    minpath_report = path.join(out_dir, sample_id + "_minpath_report.txt")
    minpath_details = path.join(out_dir, sample_id + "_minpath_details.txt")
    minpath_mps = path.join(out_dir, sample_id + "_minpath.mps")
    minpath_stdout = path.join(out_dir, sample_id + "_minpath_out.txt")

    # Write each function (the index labels in the unstratified table) with
    # its count, skipping functions with zero abundance. The context
    # manager guarantees the file is flushed and closed before MinPath
    # reads it.
    with open(minpath_in, "w") as id_minpath_fh:
        for func_id in unstrat_input.index.values:
            func_count = unstrat_input.loc[func_id, sample_id]

            # If 0 then skip.
            if func_count == 0:
                continue

            id_minpath_fh.write(func_id + "\t" + str(func_count) + "\n")

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'MinPath',
                             'MinPath12hmp.py')

    minpath_cmd = " ".join([path2minpath,
                            "-any", minpath_in,
                            "-map", minpath_map,
                            "-report", minpath_report,
                            "-details", minpath_details,
                            "-mps", minpath_mps])

    # MinPath's stdout is captured in a file that is closed as soon as the
    # call returns.
    with open(minpath_stdout, "w") as minpath_output:
        system_call_check(minpath_cmd, print_out=print_opt,
                          stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Now read in details file and take abundance of pathway to be
    # mean of top 1/2 most abundant gene families.
    # Abundances of 0 will be added in for gene families not found.
    gf_abundances, gf_ids = parse_minpath_details(minpath_details,
                                                  path_present)

    # Initialize series and dataframe that will contain pathway abundances.
    unstrat_abun = pd.Series(dtype="float64")

    # The empty frame fixes the index names and column of the result even
    # when no pathway is present. Per-pathway tables are collected and
    # concatenated once at the end.
    empty_strat = pd.DataFrame(columns=["pathway", "sequence", sample_id])
    strat_parts = [empty_strat.set_index(["pathway", "sequence"])]

    # Loop through all pathways present and get mean of 1/2 most abundant.
    for pathway in gf_abundances:

        # Like HUMAnN2, sort enzyme reactions, take second half, and get
        # their mean abundance.
        # First get indices of sorted list.
        sorted_index = list(np.argsort(gf_abundances[pathway]))
        sorted_gf_abundances = [gf_abundances[pathway][i]
                                for i in sorted_index]
        sorted_gf_ids = [gf_ids[pathway][i] for i in sorted_index]

        # Take second half of gene family abundances and ids lists.
        half_i = len(sorted_gf_abundances) // 2
        gf_abundances_subset = sorted_gf_abundances[half_i:]
        gf_ids_subset = sorted_gf_ids[half_i:]

        # Take mean for unstratified pathway abundance.
        unstrat_abun[pathway] = \
            sum(gf_abundances_subset) / len(gf_abundances_subset)

        # Get stratified pathway abundances by sequences.
        strat_path_abun = path_abun_by_seq(strat_input,
                                           gf_ids_subset,
                                           sum(gf_abundances_subset),
                                           unstrat_abun[pathway])

        # Remove rows with zero abundance. The filtered result has to be
        # assigned back (previously it was discarded); the copy avoids
        # modifying a view when the pathway column is added below.
        strat_path_abun = \
            strat_path_abun[strat_path_abun[sample_id] > 0].copy()

        # Add pathway as new column and move it into the index.
        strat_path_abun["pathway"] = [pathway] * strat_path_abun.shape[0]
        strat_path_abun.set_index("pathway", append=True, inplace=True)

        # Changes levels of index labels.
        strat_path_abun = strat_path_abun.reorder_levels(["pathway",
                                                          "sequence"])

        strat_parts.append(strat_path_abun)

    strat_abun = pd.concat(strat_parts)

    # Return unstratified and stratified abundances.
    # Note that the stratified abundances are converted to a series.
    return [unstrat_abun, strat_abun[sample_id]]
#!/usr/bin/env python __copyright__ = "Copyright 2018, The PICRUSt Project" __license__ = "GPL" __version__ = "2.0.0-b.3" import unittest from os import path from tempfile import TemporaryDirectory from picrust2.util import get_picrust_project_dir, system_call_check # Paths to input files. test_dir_path = path.join(get_picrust_project_dir(), "tests") test_study_seqs = path.join(test_dir_path, "test_data", "place_seqs", "study_seqs_test.fasta") test_tree = path.join(test_dir_path, "test_data", "place_seqs", "img_centroid_16S_aligned_head30.tre") test_msa = path.join(test_dir_path, "test_data", "place_seqs", "img_centroid_16S_aligned_head30.fna") test_known_marker = path.join(test_dir_path, "test_data", "workflow", "workflow_known_marker.tsv") test_known_traits = path.join(test_dir_path, "test_data", "workflow", "workflow_known_traits.tsv") test_seq_abun_tsv = path.join(test_dir_path, "test_data", "workflow", "workflow_seq_abun.tsv")
import unittest from os import path import pandas as pd import hashlib import gzip from tempfile import TemporaryDirectory from picrust2.util import (write_fasta, read_fasta, write_phylip, read_phylip, three_df_index_overlap_sort, add_descrip_col, get_picrust_project_dir, convert_humann2_to_picrust2, convert_picrust2_to_humann2, convert_picrust2_to_humann2_merged) from picrust2.default import default_map descrip_test_dir_path = path.join(get_picrust_project_dir(), "tests", "test_data", "add_descriptions") descrip_test_dir_out_path = path.join(descrip_test_dir_path, "output") # Set paths to test input and output files for add_descriptions.py tests. ec_unstrat_in = path.join(descrip_test_dir_path, "ec_unstrat_test.txt") ec_unstrat_exp = path.join(descrip_test_dir_out_path, "ec_unstrat_exp.txt") ec_strat_in = path.join(descrip_test_dir_path, "ec_strat_test.txt") ec_strat_exp = path.join(descrip_test_dir_out_path, "ec_strat_exp.txt") ec_nomatch_in = path.join(descrip_test_dir_path, "ec_nomatch_test.txt") metacyc_unstrat_in = path.join(descrip_test_dir_path, "metacyc_unstrat_test.txt")
#!/usr/bin/env python __copyright__ = "Copyright 2018, The PICRUSt Project" __license__ = "GPL" __version__ = "2.0.0-b.4" import unittest import pandas as pd from os import path from tempfile import TemporaryDirectory from picrust2.run_minpath import (minpath_wrapper, run_minpath_pipeline, read_strat_genes) from picrust2.util import get_picrust_project_dir # Path to test directory. test_dir_path = path.join(get_picrust_project_dir(), "tests") in_metagenome_abun = path.join(test_dir_path, "test_data", "run_minpath", "test_metagenome_out.tsv") exp_minpath_out_strat = path.join(test_dir_path, "test_data", "run_minpath", "expected_out_strat_path.tsv") exp_minpath_out_unstrat = path.join(test_dir_path, "test_data", "run_minpath", "expected_out_unstrat_path.tsv") map_ec2path_prokaryotic = path.join(get_picrust_project_dir(), "MinPath", "ec2metacyc_picrust_prokaryotic.txt") class minpath_wrapper_tests(unittest.TestCase):
def minpath_wrapper(sample_id, unstrat_input, minpath_map, out_dir,
                    print_opt=False, extra_str=""):
    '''Run MinPath based on gene abundances in a single sample.

    Parameters:
        sample_id: sample to process; the column label read from
            ``unstrat_input`` and the prefix of all files written.
        unstrat_input: table of reaction abundances, indexed by reaction
            id, with one column per sample.
        minpath_map: path to the mapfile given to MinPath with ``-map``.
        out_dir: output directory; intermediate files are written to its
            "minpath_running" subdirectory.
        print_opt: forwarded to system_call_check as ``print_out``.
        extra_str: extra text inserted after ``sample_id`` in the names of
            the intermediate files.

    Returns:
        Tuple ``(path_present, reaction_abun)``: the pathways identified as
        present in the MinPath report, and a defaultdict mapping each
        reaction id with non-zero abundance to its abundance.
    '''

    # Make output directory for MinPath intermediate files.
    running_dir = path.join(out_dir, "minpath_running")
    make_output_dir(running_dir)

    # Define MinPath input and output filenames.
    file_prefix = sample_id + extra_str
    minpath_in = path.join(running_dir, file_prefix + "_minpath_in.txt")
    minpath_report = path.join(running_dir,
                               file_prefix + "_minpath_report.txt")
    minpath_details = path.join(running_dir,
                                file_prefix + "_minpath_details.txt")
    minpath_mps = path.join(running_dir, file_prefix + "_minpath.mps")

    # NOTE(review): unlike the files above, the stdout log is written
    # directly to out_dir and its name does not include extra_str, so calls
    # that differ only in extra_str overwrite each other's log. The path is
    # left as-is in case callers rely on it -- confirm whether this is
    # intended.
    minpath_stdout = path.join(out_dir, sample_id + "_minpath_out.txt")

    # Initialize dictionary for keeping track of reaction abundances.
    reaction_abun = defaultdict(int)

    # Loop over all reactions (which are the index labels in unstrat table
    # unless regrouped) and write out each one with its count. The context
    # manager guarantees the file is flushed and closed before MinPath
    # reads it.
    with open(minpath_in, "w") as id_minpath_fh:
        for reaction_id in unstrat_input.index.values:
            reaction_count = unstrat_input.loc[reaction_id, sample_id]

            # If 0 then skip.
            if reaction_count == 0:
                continue

            id_minpath_fh.write(reaction_id + "\t" + str(reaction_count) +
                                "\n")
            reaction_abun[reaction_id] = reaction_count

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'picrust2',
                             'MinPath', 'MinPath12hmp.py')

    minpath_cmd = " ".join([path2minpath,
                            "-any", minpath_in,
                            "-map", minpath_map,
                            "-report", minpath_report,
                            "-details", minpath_details,
                            "-mps", minpath_mps])

    # MinPath's stdout is captured in a file that is closed as soon as the
    # call returns (it was previously left open).
    with open(minpath_stdout, "w") as minpath_output:
        system_call_check(minpath_cmd, print_out=print_opt,
                          stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Return list of which pathways are present and the abundances of all
    # gene families.
    return (path_present, reaction_abun)