def test_mp_simple(self):
    '''Test that MP predictions match the expected table.'''

    workflow_args = dict(tree_path=in_tree1,
                         trait_table_path=in_traits1,
                         hsp_method="mp",
                         ran_seed=10)

    # The second element returned by the workflow is not needed here.
    predicted, _ = castor_hsp_workflow(**workflow_args)

    pd.testing.assert_frame_equal(predicted, hsp_mp_pred_in,
                                  check_like=True)
def test_emp_prob_ci(self):
    '''Test that Emp Prob confidence intervals calculated correctly.'''

    workflow_args = dict(tree_path=in_tree1,
                         trait_table_path=in_traits1,
                         hsp_method="emp_prob",
                         ran_seed=10,
                         calc_ci=True)

    # Only the confidence interval table is checked in this test.
    _, observed_ci = castor_hsp_workflow(**workflow_args)

    pd.testing.assert_frame_equal(observed_ci, hsp_emp_prob_pred_in_ci,
                                  check_like=True)
def test_scp_simple(self):
    '''Test that SCP predictions have the expected dimensions and names.'''

    predict_out, ci_out = castor_hsp_workflow(tree_path=in_tree1,
                                              trait_table_path=in_traits1,
                                              hsp_method="scp",
                                              ran_seed=10)

    # Since values can differ depending on exact dependency versions, just
    # comparing dimension and names. The expected table is defined outside
    # this test, so zero out a copy rather than the shared object itself;
    # otherwise any other test reading it afterwards would see all zeros.
    expected = hsp_scp_pred_in.copy()

    predict_out[:] = 0
    expected[:] = 0

    pd.testing.assert_frame_equal(predict_out, expected, check_like=True)
def main():
    '''Run hidden-state prediction from the command-line options and write
    the predicted table (plus the CI table when enabled) to files derived
    from args.output_prefix.'''

    args = parser.parse_args()

    # One of a default trait table or a custom table must be specified;
    # throw an error if neither was.
    if not (args.in_trait or args.observed_trait_table):
        raise RuntimeError(
            "A default input trait table needs to be specified with the "
            "--in_trait option, or alternatively a custom table can be "
            "specified with the --observed_trait_table option")

    # The default table takes precedence when both options are given.
    if args.in_trait:
        trait_table = default_tables[args.in_trait]
    else:
        trait_table = args.observed_trait_table

    # Check that input filenames exist.
    check_files_exist([args.tree, trait_table])

    # CIs are only enabled for the discrete trait prediction methods.
    ci_methods = {'emp_prob', 'mp'}
    ci_setting = bool(args.confidence and args.hsp_method in ci_methods)

    count_outfile = args.output_prefix + ".tsv"
    ci_outfile = args.output_prefix + "_ci.tsv"

    workflow_args = dict(tree_path=args.tree,
                         trait_table_path=trait_table,
                         hsp_method=args.hsp_method,
                         chunk_size=args.chunk_size,
                         calc_nsti=args.calculate_NSTI,
                         calc_ci=ci_setting,
                         check_input=args.check,
                         num_proc=args.processes,
                         ran_seed=args.seed)

    hsp_table, ci_table = castor_hsp_workflow(**workflow_args)

    # Output the table to file.
    make_output_dir_for_file(count_outfile)
    hsp_table.to_csv(path_or_buf=count_outfile, index_label="sequence",
                     sep="\t")

    # Nothing more to write unless CIs were requested and supported.
    if not ci_setting:
        return

    make_output_dir_for_file(ci_outfile)
    ci_table.to_csv(path_or_buf=ci_outfile, index_label="sequence",
                    sep="\t")
def test_mp_ci(self):
    '''Test that MP confidence intervals calculated correctly.'''

    workflow_args = dict(tree_path=in_tree1,
                         trait_table_path=in_traits1,
                         hsp_method="mp",
                         ran_seed=10,
                         calc_ci=True)

    # Only the confidence interval table is checked in this test.
    _, observed_ci = castor_hsp_workflow(**workflow_args)

    # Values are compared as-is here (no zeroing of either table), so this
    # checks the CI values as well as the dimensions and names.
    pd.testing.assert_frame_equal(observed_ci, hsp_mp_pred_in_ci,
                                  check_like=True)
def hsp_pipeline_steps(func, calculate_NSTI, out_tree, func_table_in,
                       hsp_method, ci_setting, threads, seed, output_folder,
                       verbose):
    '''Run hidden-state prediction for one function category and write the
    resulting table(s) into output_folder. Kept as a separate function so
    that the large tables go out of scope (and can be garbage collected) as
    soon as they have been written. Returns the path of the predicted
    table that was written.'''

    # NSTI is only requested for the marker table.
    want_nsti = bool(func == "marker" and calculate_NSTI)

    if verbose:
        print("Running hidden-state prediction for " + func)

    workflow_args = dict(tree_path=out_tree,
                         trait_table_path=func_table_in,
                         hsp_method=hsp_method,
                         calc_nsti=want_nsti,
                         calc_ci=ci_setting,
                         check_input=False,
                         num_proc=threads,
                         ran_seed=seed)

    hsp_table, ci_table = castor_hsp_workflow(**workflow_args)

    # "_nsti" is added to the filename when NSTI values are included.
    suffix = "_nsti_predicted.tsv" if want_nsti else "_predicted.tsv"
    count_outfile = path.join(output_folder, func + suffix)

    if verbose:
        print("Writing out predicted gene family abundances to " +
              count_outfile)

    hsp_table.to_csv(path_or_buf=count_outfile, index_label="sequence",
                     sep="\t")

    # Output the CI file as well if option set.
    if ci_setting:
        ci_outfile = path.join(output_folder, func + "_predicted_ci.tsv")

        if verbose:
            print("Writing out predicted gene family CIs to " + ci_outfile)

        ci_table.to_csv(path_or_buf=ci_outfile, index_label="sequence",
                        sep="\t")

    return count_outfile
def main():
    '''Run hidden-state prediction from the command-line options and write
    the predicted table to args.output. Confidence intervals are never
    calculated by this entry point.'''

    args = parser.parse_args()

    # One of a default trait table or a custom table must be specified;
    # throw an error if neither was.
    if not (args.in_trait or args.observed_trait_table):
        raise RuntimeError(
            "A default input trait table needs to be specified with the "
            "--in_trait option, or alternatively a custom table can be "
            "specified with the --observed_trait_table option")

    # The default table takes precedence when both options are given.
    if args.in_trait:
        trait_table = default_tables[args.in_trait]
    else:
        trait_table = args.observed_trait_table

    # Check that input filenames exist.
    check_files_exist([args.tree, trait_table])

    workflow_args = dict(tree_path=args.tree,
                         trait_table_path=trait_table,
                         hsp_method=args.hsp_method,
                         chunk_size=args.chunk_size,
                         calc_nsti=args.calculate_NSTI,
                         # No longer support outputting CIs with this script.
                         calc_ci=False,
                         check_input=args.check,
                         num_proc=args.processes,
                         ran_seed=args.seed,
                         verbose=args.verbose)

    # The CI table returned by the workflow is discarded.
    hsp_table, _ = castor_hsp_workflow(**workflow_args)

    # Output the table to file.
    make_output_dir_for_file(args.output)
    hsp_table.to_csv(path_or_buf=args.output, index_label="sequence",
                     sep="\t", compression="infer")
def main():
    '''Run the full pipeline from the parsed command-line options.

    Steps, in order: place the study sequences onto the reference tree, run
    hidden-state prediction for every requested function table plus the
    marker table, run the metagenome pipeline for every function table and,
    unless --no_pathways is set, infer pathway abundances and coverages.
    All output is written beneath args.output. Exits through sys.exit if an
    unknown default function category is requested.'''

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    if args.custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of " "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the "marker" entry added below must not be
        # written into the shared default_tables mapping.
        func_tables = dict(default_tables)

    else:
        funcs = []
        func_tables = {}

        # Each custom table is keyed by its filename without the extension.
        for custom in args.custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table is the one used for pathway inference
        # (str.split always yields at least one element).
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # CIs are only enabled for the discrete trait prediction methods.
    discrete_set = {'emp_prob', 'mp'}
    ci_setting = bool(args.confidence and args.hsp_method in discrete_set)

    gap_fill_opt = not args.no_gap_fill

    # Intermediate placement files live in a directory that is removed as
    # soon as the output tree has been written.
    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # NSTI is only requested for the marker table.
        nsti_setting = bool(func == "marker" and args.calculate_NSTI)

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        # Add "_nsti" to filename if output.
        if nsti_setting:
            count_outfile = path.join(args.output,
                                      func + "_nsti_predicted.tsv")
        else:
            count_outfile = path.join(args.output, func + "_predicted.tsv")

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile, index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile, index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Infer metagenome abundances per-sample. Pass arguments to key
        # function and get predicted functions stratified and unstratified
        # by genomes. Output goes straight to func_output_dir, so no
        # temporary directory is needed for this step.
        strat_pred, unstrat_pred = run_metagenome_pipeline(
            input_biom=args.input,
            function=func_infile,
            marker=marker_infile,
            out_dir=func_output_dir,
            max_nsti=args.max_nsti,
            min_reads=args.min_reads,
            min_samples=args.min_samples,
            strat_out=args.stratified,
            proc=args.threads,
            output_normfile=True)

        print("Writing metagenome output files for " + func + " to: " +
              func_output_dir)

        # Generate output table filepaths and write out pandas dataframe.
        unstrat_outfile = path.join(func_output_dir,
                                    "pred_metagenome_unstrat.tsv")

        unstrat_pred.index.name = "function"
        unstrat_pred.reset_index(inplace=True)

        # Description columns are only added for the default tables.
        if args.custom_trait_tables is None:
            unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                           mapfile=default_map[func],
                                           in_df=True)

        unstrat_pred.to_csv(path_or_buf=unstrat_outfile, sep="\t",
                            index=False)

        # Write out stratified table only if that option was specified.
        if args.stratified:
            strat_outfile = path.join(func_output_dir,
                                      "pred_metagenome_strat.tsv")
            strat_pred.reset_index(inplace=True)

            if args.custom_trait_tables is None:
                strat_pred = add_descrip_col(inputfile=strat_pred,
                                             mapfile=default_map[func],
                                             in_df=True)

            strat_pred.to_csv(path_or_buf=strat_outfile, sep="\t",
                              index=False)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        # Pathways are inferred from the metagenome table written above for
        # the reaction function (stratified version if available).
        if args.stratified:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_strat.tsv")
        else:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_unstrat.tsv")

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = \
                run_minpath_pipeline(
                    inputfile=in_metagenome,
                    mapfile=default_pathway_map,
                    regroup_mapfile=default_regroup_map,
                    proc=args.threads,
                    out_dir=temp_dir,
                    gap_fill=gap_fill_opt,
                    per_sequence_contrib=args.per_sequence_contrib,
                    print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        # Write output files.
        unstrat_abun_outfile = path.join(pathways_out,
                                         "path_abun_unstrat.tsv")
        unstrat_abun.reset_index(inplace=True)

        if args.custom_trait_tables is None:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile, sep="\t",
                            index=False)

        unstrat_cov_outfile = path.join(pathways_out,
                                        "path_cov_unstrat.tsv")
        unstrat_cov.reset_index(inplace=True)

        if args.custom_trait_tables is None:
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile, sep="\t",
                           index=False)

        # Write stratified output only if something besides None was
        # returned.
        if strat_abun is not None:
            strat_abun_outfile = path.join(pathways_out,
                                           "path_abun_strat.tsv")

            if args.custom_trait_tables is None:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun.to_csv(path_or_buf=strat_abun_outfile, sep="\t",
                              index=False)

        if strat_cov is not None:
            strat_cov_outfile = path.join(pathways_out,
                                          "path_cov_strat.tsv")

            if args.custom_trait_tables is None:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov.to_csv(path_or_buf=strat_cov_outfile, sep="\t",
                             index=False)

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in " + "%.2f" % elapsed_time +
          " seconds.")