def main():
    """Run ancestral state reconstruction (ASR) and write the result tables.

    Dispatches on opts.asr_method ('wagner', 'ace_ml', 'ace_pic', 'ace_reml')
    or runs the reconstruction in parallel, then writes the ASR table and,
    for methods that produce one, the confidence-interval table.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    ci_table = None  # only set by methods that produce confidence intervals

    if opts.parallel:
        # Farm the ASR computation out over multiple jobs.
        tmp_dir = 'jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(
            tree=opts.input_tree_fp,
            table=opts.input_trait_table_fp,
            asr_method=opts.asr_method,
            parallel_method=opts.parallel_method,
            num_jobs=opts.num_jobs,
            tmp_dir=tmp_dir,
            verbose=opts.verbose)
    else:
        # Call the appropriate ASR app controller.
        if opts.asr_method == 'wagner':
            asr_table = wagner_for_picrust(opts.input_tree_fp,
                                           opts.input_trait_table_fp,
                                           HALT_EXEC=opts.debug)
        elif opts.asr_method == 'bayestraits':
            # BUG FIX: this branch was previously `pass`, which left
            # asr_table unbound and caused a NameError at the write step.
            raise NotImplementedError(
                "ASR method 'bayestraits' is not yet implemented.")
        elif opts.asr_method == 'ace_ml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'ML', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_pic':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'pic', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_reml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'REML', HALT_EXEC=opts.debug)
        else:
            # Unknown methods previously fell through with asr_table unbound.
            raise ValueError("Unsupported ASR method: %s" % opts.asr_method)

    # Output the ASR table to file.
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp, sep='\t')

    # Output the CI file for methods that produce one (not wagner).
    if ci_table is not None:
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp, sep='\t')
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading sequencing depth table: ",opts.input_seq_depth_file scaling_factors = {} for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')): scaling_factors[sample_id]=depth ext=path.splitext(opts.input_count_table)[1] if opts.verbose: print "Loading count table: ", opts.input_count_table if (ext == '.gz'): genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb')) else: genome_table = parse_biom_table(open(opts.input_count_table,'U')) if opts.verbose: print "Scaling the metagenome..." scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors) if opts.verbose: print "Writing results to output file: ",opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) open(opts.output_metagenome_table,'w').write(format_biom_table(scaled_metagenomes))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.limit_to_function: limit_to_functions = opts.limit_to_function.split(',') if opts.verbose: print "Limiting output to only functions:",limit_to_functions else: limit_to_functions = [] if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ext=path.splitext(opts.input_count_table)[1] if opts.verbose: print "Loading count table: ", opts.input_count_table if (ext == '.gz'): genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb')) else: genome_table = parse_biom_table(open(opts.input_count_table,'U')) if opts.verbose: print "Predicting the metagenome..." partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions) output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes]) if opts.verbose: print "Writing results to output file: ",opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) open(opts.output_metagenome_table,'w').write(output_text)
def main():
    """Run ancestral state reconstruction (ASR) and write the result tables.

    Dispatches on opts.asr_method ('wagner', 'ace_ml', 'ace_pic', 'ace_reml')
    or runs the reconstruction in parallel, then writes the ASR table and,
    for methods that produce one, the confidence-interval table.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    ci_table = None  # only set by methods that produce confidence intervals

    if opts.parallel:
        # Farm the ASR computation out over multiple jobs.
        tmp_dir = 'jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(
            tree=opts.input_tree_fp,
            table=opts.input_trait_table_fp,
            asr_method=opts.asr_method,
            parallel_method=opts.parallel_method,
            num_jobs=opts.num_jobs,
            tmp_dir=tmp_dir,
            verbose=opts.verbose)
    else:
        # Call the appropriate ASR app controller.
        if opts.asr_method == 'wagner':
            asr_table = wagner_for_picrust(opts.input_tree_fp,
                                           opts.input_trait_table_fp,
                                           HALT_EXEC=opts.debug)
        elif opts.asr_method == 'bayestraits':
            # BUG FIX: this branch was previously `pass`, which left
            # asr_table unbound and caused a NameError at the write step.
            raise NotImplementedError(
                "ASR method 'bayestraits' is not yet implemented.")
        elif opts.asr_method == 'ace_ml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'ML', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_pic':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'pic', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_reml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'REML', HALT_EXEC=opts.debug)
        else:
            # Unknown methods previously fell through with asr_table unbound.
            raise ValueError("Unsupported ASR method: %s" % opts.asr_method)

    # Output the ASR table to file.
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp, sep='\t')

    # Output the CI file for methods that produce one (not wagner).
    if ci_table is not None:
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp, sep='\t')
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) input_ext=path.splitext(opts.input_otu_fp)[1] if opts.input_format_classic: otu_table=parse_classic_table_to_rich_table(open(opts.input_otu_fp,'U'),None,None,None,DenseOTUTable) else: if input_ext != '.biom': sys.stderr.write("\nOTU table does not have '.biom' extension! If loading causes error consider using '-f' option to load tab-delimited OTU table!\n\n") otu_table = parse_biom_table(open(opts.input_otu_fp,'U')) ext=path.splitext(opts.input_count_fp)[1] if (ext == '.gz'): count_table = parse_biom_table(gzip.open(opts.input_count_fp,'rb')) else: count_table = parse_biom_table(open(opts.input_count_fp,'U')) #Need to only keep data relevant to our otu list ids=[] for x in otu_table.iterObservations(): ids.append(str(x[1])) ob_id=count_table.ObservationIds[0] filtered_otus=[] filtered_values=[] for x in ids: if count_table.sampleExists(x): filtered_otus.append(x) filtered_values.append(otu_table.observationData(x)) #filtered_values = map(list,zip(*filtered_values)) filtered_otu_table=table_factory(filtered_values,otu_table.SampleIds,filtered_otus, constructor=DenseOTUTable) copy_numbers_filtered={} for x in filtered_otus: value = count_table.getValueByIds(ob_id,x) try: #data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError,\ "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." copy_numbers_filtered[x]={opts.metadata_identifer:value} filtered_otu_table.addObservationMetadata(copy_numbers_filtered) normalized_table = filtered_otu_table.normObservationByMetadata(opts.metadata_identifer) make_output_dir_for_file(opts.output_otu_fp) open(opts.output_otu_fp,'w').write(\ normalized_table.getBiomFormatJsonString('PICRUST'))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ext=path.splitext(opts.input_count_table)[1] if opts.verbose: print "Loading count table: ", opts.input_count_table if (ext == '.gz'): genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb')) else: genome_table = parse_biom_table(open(opts.input_count_table,'U')) make_output_dir_for_file(opts.output_metagenome_table) if opts.accuracy_metrics: # Calculate accuracy metrics #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False) #print "Unweighted NSTI:", unweighted_nsti weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True) samples= weighted_nsti[0] nstis = list(weighted_nsti[1]) #print "Samples:",samples #print "NSTIs:",nstis samples_and_nstis = zip(samples,nstis) #print "Samples and NSTIs:",samples_and_nstis lines = ["#Sample\tMetric\tValue\n"] #print weighted_nsti for sample,nsti in samples_and_nstis: line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti)) lines.append(line) if opts.verbose: for l in sorted(lines): print l if opts.verbose: print "Writing accuracy information to file:", opts.accuracy_metrics open(opts.accuracy_metrics,'w').writelines(sorted(lines)) if opts.verbose: print "Predicting the metagenome..." predicted_metagenomes = predict_metagenomes(otu_table,genome_table) if opts.verbose: print "Writing results to output file: ",opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) if(opts.format_tab_delimited): open(opts.output_metagenome_table,'w').write(predicted_metagenomes.delimitedSelf()) else: open(opts.output_metagenome_table,'w').write(format_biom_table(predicted_metagenomes))
def write_metagenome_to_file( predicted_metagenome, output_fp, tab_delimited=False, verbose_filetype_message="metagenome prediction", verbose=False, ): """Write a BIOM Table object to a file, creating the directory if needed predicted_metagenome -- a BIOM table object output_fp -- the filepath to write the output tab_delimited -- if False, write in BIOm format, otherwise write as a tab-delimited file verbose -- if True output verbose info to StdOut """ if verbose: print "Writing %s results to output file: %s" % (verbose_filetype_message, output_fp) make_output_dir_for_file(output_fp) if tab_delimited: # peek at first observation to decide on what observeration metadata # to output in tab-delimited format (obs_val, obs_id, obs_metadata) = predicted_metagenome.iter(axis="observation").next() # see if there is a metadata field that contains the "Description" # (e.g. KEGG_Description or COG_Description) h = re.compile(".*Description") metadata_names = filter(h.search, obs_metadata.keys()) if metadata_names: # use the "Description" field we found metadata_name = metadata_names[0] elif obs_metadata.keys(): # if no "Description" metadata then just output the first # observation metadata metadata_name = (obs_metadata.keys())[0] else: # if no observation metadata then don't output any metadata_name = None open(output_fp, "w").write( predicted_metagenome.to_tsv( header_key=metadata_name, header_value=metadata_name, metadata_formatter=biom_meta_to_string ) ) else: # output in BIOM format format_fs = { "KEGG_Description": picrust_formatter, "COG_Description": picrust_formatter, "KEGG_Pathways": picrust_formatter, "COG_Category": picrust_formatter, } write_biom_table(predicted_metagenome, output_fp, format_fs=format_fs)
def write_metagenome_to_file(predicted_metagenome,output_fp,\ tab_delimited=False,verbose_filetype_message="metagenome prediction",\ verbose=False): """Write a BIOM Table object to a file, creating the directory if needed predicted_metagenome -- a BIOM table object output_fp -- the filepath to write the output tab_delimited -- if False, write in BIOm format, otherwise write as a tab-delimited file verbose -- if True output verbose info to StdOut """ if verbose: print "Writing %s results to output file: %s"\ %(verbose_filetype_message,output_fp) make_output_dir_for_file(output_fp) if tab_delimited: #peek at first observation to decide on what observeration metadata #to output in tab-delimited format (obs_val,obs_id,obs_metadata)=\ predicted_metagenome.iter(axis='observation').next() #see if there is a metadata field that contains the "Description" #(e.g. KEGG_Description or COG_Description) h = re.compile('.*Description') metadata_names = filter(h.search, obs_metadata.keys()) if metadata_names: #use the "Description" field we found metadata_name = metadata_names[0] elif (obs_metadata.keys()): #if no "Description" metadata then just output the first #observation metadata metadata_name = (obs_metadata.keys())[0] else: #if no observation metadata then don't output any metadata_name = None open(output_fp,'w').write(predicted_metagenome.to_tsv(\ header_key=metadata_name,header_value=metadata_name,metadata_formatter=biom_meta_to_string)) else: #output in BIOM format format_fs = { 'KEGG_Description': picrust_formatter, 'COG_Description': picrust_formatter, 'KEGG_Pathways': picrust_formatter, 'COG_Category': picrust_formatter } write_biom_table(predicted_metagenome, output_fp, format_fs=format_fs)
def write_metagenome_to_file(predicted_metagenome,output_fp,\ tab_delimited=False,verbose_filetype_message="metagenome prediction",\ verbose=False): """Write a BIOM Table object to a file, creating the directory if needed predicted_metagenome -- a BIOM table object output_fp -- the filepath to write the output tab_delimited -- if False, write in BIOm format, otherwise write as a tab-delimited file verbose -- if True output verbose info to StdOut """ if verbose: print "Writing %s results to output file: %s"\ %(verbose_filetype_message,output_fp) make_output_dir_for_file(output_fp) if tab_delimited: #peek at first observation to decide on what observeration metadata #to output in tab-delimited format (obs_val,obs_id,obs_metadata)=\ predicted_metagenome.iterObservations().next() #see if there is a metadata field that contains the "Description" #(e.g. KEGG_Description or COG_Description) h = re.compile('.*Description') metadata_names=filter(h.search,obs_metadata.keys()) if metadata_names: #use the "Description" field we found metadata_name=metadata_names[0] elif(obs_metadata.keys()): #if no "Description" metadata then just output the first #observation metadata metadata_name=(obs_metadata.keys())[0] else: #if no observation metadata then don't output any metadata_name=None open(output_fp,'w').write(predicted_metagenome.delimitedSelf(\ header_key=metadata_name,header_value=metadata_name)) else: #output in BIOM format open(output_fp,'w').write(format_biom_table(predicted_metagenome))
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) if opts.verbose: print "Loading sequencing depth table: ", opts.input_seq_depth_file scaling_factors = {} for sample_id, depth in parse_seq_count_file(open(opts.input_seq_depth_file, "U")): scaling_factors[sample_id] = depth if opts.verbose: print "Loading count table: ", opts.input_count_table genome_table = load_table(opts.input_count_table) if opts.verbose: print "Scaling the metagenome..." scaled_metagenomes = scale_metagenomes(genome_table, scaling_factors) if opts.verbose: print "Writing results to output file: ", opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading sequencing depth table: ",opts.input_seq_depth_file scaling_factors = {} for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')): scaling_factors[sample_id]=depth if opts.verbose: print "Loading count table: ", opts.input_count_table genome_table = load_table(opts.input_count_table) if opts.verbose: print "Scaling the metagenome..." scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors) if opts.verbose: print "Writing results to output file: ",opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
try: assert set(variance_table.ids()) == set(genome_table.ids()) except AssertionError, e: for var_id in variance_table.ids(): if var_id not in genome_table.ids(): print "Variance table SampleId %s not in genome_table SampleIds" % var_id raise AssertionError( "Variance table and genome table contain different OTU ids") #sort the ObservationIds and SampleIds to be in the same order variance_table = variance_table.sort_order( genome_table.ids(axis='observation'), axis='observation') variance_table = variance_table.sort_order(genome_table.ids(), axis='sample') make_output_dir_for_file(opts.output_metagenome_table) if opts.accuracy_metrics: # Calculate accuracy metrics weighted_nsti = calc_nsti(otu_table, genome_table, weighted=True) samples = weighted_nsti[0] nstis = list(weighted_nsti[1]) samples_and_nstis = zip(samples, nstis) if opts.verbose: print "Writing NSTI information to file:", opts.accuracy_metrics accuracy_output_fh = open(opts.accuracy_metrics, 'w') accuracy_output_fh.write("#Sample\tMetric\tValue\n") for sample, nsti in samples_and_nstis: line = "%s\tWeighted NSTI\t%s\n" % (sample, str(nsti)) accuracy_output_fh.write(line)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) input_ext = path.splitext(opts.input_otu_fp)[1] if opts.input_format_classic: otu_table = parse_classic_table_to_rich_table( open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable) else: try: otu_table = parse_biom_table(open(opts.input_otu_fp, 'U')) except ValueError: raise ValueError( "Error loading OTU table! If not in BIOM format use '-f' option.\n" ) ids_to_load = otu_table.ObservationIds if (opts.input_count_fp is None): #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz) precalc_file_name = '_'.join( ['16S', opts.gg_version, 'precalculated.tab.gz']) input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', precalc_file_name) else: input_count_table = opts.input_count_fp if opts.verbose: print "Loading trait table: ", input_count_table ext = path.splitext(input_count_table)[1] if (ext == '.gz'): count_table_fh = gzip.open(input_count_table, 'rb') else: count_table_fh = open(input_count_table, 'U') if opts.load_precalc_file_in_biom: count_table = parse_biom_table(count_table_fh.read()) else: count_table = convert_precalc_to_biom(count_table_fh, ids_to_load) #Need to only keep data relevant to our otu list ids = [] for x in otu_table.iterObservations(): ids.append(str(x[1])) ob_id = count_table.ObservationIds[0] filtered_otus = [] filtered_values = [] for x in ids: if count_table.sampleExists(x): filtered_otus.append(x) filtered_values.append(otu_table.observationData(x)) #filtered_values = map(list,zip(*filtered_values)) filtered_otu_table = table_factory(filtered_values, otu_table.SampleIds, filtered_otus, constructor=DenseOTUTable) copy_numbers_filtered = {} for x in filtered_otus: value = count_table.getValueByIds(ob_id, x) try: #data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError,\ "Invalid type passed as copy number for OTU ID %s. Must be int-able." 
% (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." copy_numbers_filtered[x] = {opts.metadata_identifer: value} filtered_otu_table.addObservationMetadata(copy_numbers_filtered) normalized_table = filtered_otu_table.normObservationByMetadata( opts.metadata_identifer) #move Observation Metadata from original to filtered OTU table normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'ObservationMetadata') normalized_otu_table = transfer_sample_metadata(otu_table, normalized_table, 'SampleMetadata') make_output_dir_for_file(opts.output_otu_fp) open(opts.output_otu_fp, 'w').write(format_biom_table(normalized_table))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) otu_table = load_table(opts.input_otu_fp) ids_to_load = otu_table.ids(axis='observation') if(opts.input_count_fp is None): #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz) precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz']) input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name) else: input_count_table=opts.input_count_fp if opts.verbose: print "Loading trait table: ", input_count_table ext=path.splitext(input_count_table)[1] if (ext == '.gz'): count_table_fh = gzip.open(input_count_table,'rb') else: count_table_fh = open(input_count_table,'U') if opts.load_precalc_file_in_biom: count_table = load_table(count_table_fh) else: count_table = convert_precalc_to_biom(count_table_fh, ids_to_load) #Need to only keep data relevant to our otu list ids=[] for x in otu_table.iter(axis='observation'): ids.append(str(x[1])) ob_id=count_table.ids(axis='observation')[0] filtered_otus=[] filtered_values=[] for x in ids: if count_table.exists(x, axis='sample'): filtered_otus.append(x) filtered_values.append(otu_table.data(x, axis='observation')) filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids()) copy_numbers_filtered={} for x in filtered_otus: value = count_table.get_value_by_ids(ob_id,x) try: #data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError,\ "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." 
copy_numbers_filtered[x]={opts.metadata_identifer:value} filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation') def metadata_norm(v, i, md): return v / float(md[opts.metadata_identifer]) normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation') #move Observation Metadata from original to filtered OTU table normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation') make_output_dir_for_file(opts.output_otu_fp) write_biom_table(normalized_table, opts.output_otu_fp)
# ---------------------------------------------------------------------------
# main() for trait prediction: loads a tree plus optional ASR trait tables
# and confidence intervals, decorates the tree with traits, selects tip
# nodes to predict (optionally limited by organism ids or an OTU table),
# optionally computes NSTI accuracy metrics, runs the chosen prediction
# method, and writes predictions (plus variance and confidence-interval
# tables when available) in BIOM or precalc tab-delimited format.
# NOTE(review): this block is stored with collapsed formatting whose
# original indentation cannot be reliably reconstructed; code is left
# byte-identical and comments are added only at statement-safe boundaries.
# ---------------------------------------------------------------------------
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) #if we specify we want NSTI only then we have to calculate it first if opts.output_accuracy_metrics_only: opts.calculate_accuracy_metrics=True if opts.verbose: print "Loading tree from file:", opts.tree if opts.no_round: round_opt = False else: round_opt = True # Load Tree tree = load_picrust_tree(opts.tree, opts.verbose) table_headers=[] traits={} #load the asr trait table using the previous list of functions to order the arrays if opts.reconstructed_trait_table: table_headers,traits =\ update_trait_dict_from_file(opts.reconstructed_trait_table) #Only load confidence intervals on the reconstruction #If we actually have ASR values in the analysis if opts.reconstruction_confidence: if opts.verbose: print "Loading ASR confidence data from file:",\ opts.reconstruction_confidence print "Assuming confidence data is of type:",opts.confidence_format asr_confidence_output = open(opts.reconstruction_confidence) asr_min_vals,asr_max_vals, params,column_mapping =\ parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format) if 'sigma' in params: brownian_motion_parameter = params['sigma'][0] else: brownian_motion_parameter = None if opts.verbose: print "Done. Loaded %i confidence interval values." %(len(asr_max_vals)) print "Brownian motion parameter:",brownian_motion_parameter else: brownian_motion_parameter = None #load the trait table into a dict with organism names as keys and arrays as functions table_headers,genome_traits =\ update_trait_dict_from_file(opts.observed_trait_table,table_headers) #Combine the trait tables overwriting the asr ones if they exist in the genome trait table. traits.update(genome_traits) # Specify the attribute where we'll store the reconstructions trait_label = "Reconstruction" if opts.verbose: print "Assigning traits to tree..." 
# Tree decoration: confidence bounds are attached as "lower_bound" /
# "upper_bound" traits, a Brownian-motion rate parameter is inferred from
# the 95% CIs when none was loaded, and the candidate node list is
# filtered by user-specified organism ids and/or an OTU table.
# (The next three lines are kept contiguous: string literals span them.)
# Decorate tree using the traits tree = assign_traits_to_tree(traits,tree, trait_label=trait_label) if opts.reconstruction_confidence: if opts.verbose: print "Assigning trait confidence intervals to tree..." tree = assign_traits_to_tree(asr_min_vals,tree,\ trait_label="lower_bound") tree = assign_traits_to_tree(asr_max_vals,tree,\ trait_label="upper_bound") if brownian_motion_parameter is None: if opts.verbose: print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..." brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\ upper_bound_trait_label="upper_bound",\ lower_bound_trait_label="lower_bound",\ trait_label=trait_label,\ confidence=0.95) if opts.verbose: print "Inferred the following rate parameters:",brownian_motion_parameter if opts.verbose: print "Collecting list of nodes to predict..." #Start by predict all tip nodes. nodes_to_predict = [tip.Name for tip in tree.tips()] if opts.verbose: print "Found %i nodes to predict." % len(nodes_to_predict) if opts.limit_predictions_to_organisms: organism_id_str = opts.limit_predictions_to_organisms ok_organism_ids = organism_id_str.split(',') ok_organism_ids = [n.strip() for n in ok_organism_ids] for f in set_label_conversion_fns(True,True): ok_organism_ids = [f(i) for i in ok_organism_ids] if opts.verbose: print "Limiting predictions to user-specified ids:",\ ",".join(ok_organism_ids) if not ok_organism_ids: raise RuntimeError(\ "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\ % opts.limit_predictions_to_organisms) nodes_to_predict =\ [n for n in nodes_to_predict if n in ok_organism_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the commmand-line and tree ids in the same format? 
Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0])) if opts.verbose: print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict)) if opts.limit_predictions_by_otu_table: if opts.verbose: print "Limiting predictions to ids in user-specified OTU table:",\ opts.limit_predictions_by_otu_table otu_table = open(opts.limit_predictions_by_otu_table,"U") #Parse OTU table for ids otu_ids =\ extract_ids_from_table(otu_table.readlines(),delimiter="\t") if not otu_ids: raise RuntimeError(\ "Found no valid ids in input OTU table: %s. Is the path correct?"\ % opts.limit_predictions_by_otu_table) nodes_to_predict =\ [n for n in nodes_to_predict if n in otu_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by OTU table resulted in an empty set of nodes to predict. Are the OTU ids and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0])) if opts.verbose: print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict)) # Calculate accuracy of PICRUST for the given tree, sequenced genomes # and set of ndoes to predict accuracy_metrics = ['NSTI'] accuracy_metric_results = None if opts.calculate_accuracy_metrics: if opts.verbose: print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)]) accuracy_metric_results = {} if 'NSTI' in accuracy_metrics: nsti_result,min_distances =\ calc_nearest_sequenced_taxon_index(tree,\ limit_to_tips = nodes_to_predict,\ trait_label = trait_label, verbose=opts.verbose) #accuracy_metric_results['NSTI'] = nsti_result for organism in min_distances.keys(): accuracy_metric_results[organism] = {'NSTI': min_distances[organism]} if opts.verbose: print "NSTI:", nsti_result if opts.output_accuracy_metrics_only: #Write accuracy metrics to file if opts.verbose: print "Writing accuracy metrics to 
file:",opts.output_accuracy_metrics f = open(opts.output_accuracy_metrics_only,'w+') f.write("metric\torganism\tvalue\n") lines =[] for organism in accuracy_metric_results.keys(): for metric in accuracy_metric_results[organism].keys(): lines.append('\t'.join([metric,organism,\ str(accuracy_metric_results[organism][metric])])+'\n') f.writelines(sorted(lines)) f.close() exit() if opts.verbose: print "Generating predictions using method:",opts.prediction_method if opts.weighting_method == 'exponential': #For now, use exponential weighting weight_fn = make_neg_exponential_weight_fn(e) variances=None #Overwritten by methods that calc variance confidence_intervals=None #Overwritten by methods that calc variance if opts.prediction_method == 'asr_and_weighting': # Perform predictions using reconstructed ancestral states if opts.reconstruction_confidence: predictions,variances,confidence_intervals =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ lower_bound_trait_label="lower_bound",\ upper_bound_trait_label="upper_bound",\ calc_confidence_intervals = True,\ brownian_motion_parameter=brownian_motion_parameter,\ weight_fn=weight_fn,verbose=opts.verbose, round_predictions=round_opt) else: predictions =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ weight_fn =weight_fn,verbose=opts.verbose, round_predictions=round_opt) elif opts.prediction_method == 'weighting_only': #Ignore ancestral information predictions =\ weighted_average_tip_prediction(tree,nodes_to_predict,\ trait_label=trait_label,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'nearest_neighbor': predictions = predict_nearest_neighbor(tree,nodes_to_predict,\ trait_label=trait_label,tips_only = True) elif opts.prediction_method == 'random_neighbor': predictions = predict_random_neighbor(tree,\ nodes_to_predict,trait_label=trait_label) if opts.verbose: print "Done making predictions." 
# Write the main prediction table (BIOM or precalc tab-delimited), with
# any per-sample accuracy metrics attached as sample metadata; variance
# output (when produced) is written to a sibling "_variances" file.
make_output_dir_for_file(opts.output_trait_table) out_fh=open(opts.output_trait_table,'w') #Generate the table of biom predictions if opts.verbose: print "Converting results to .biom format for output..." biom_predictions=biom_table_from_predictions(predictions,table_headers,\ observation_metadata=None,\ sample_metadata=accuracy_metric_results,convert_to_int=False) if opts.verbose: print "Writing prediction results to file: ",opts.output_trait_table if opts.output_precalc_file_in_biom: #write biom table to file write_biom_table(biom_predictions, opts.output_trait_table) else: #convert to precalc (tab-delimited) format out_fh = open(opts.output_trait_table, 'w') out_fh.write(convert_biom_to_precalc(biom_predictions)) out_fh.close() #Write out variance information to file if variances: if opts.verbose: print "Converting variances to BIOM format" if opts.output_precalc_file_in_biom: suffix='.biom' else: suffix='.tab' biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base,extension = splitext(opts.output_trait_table) variance_outfile = outfile_base+"_variances"+suffix make_output_dir_for_file(variance_outfile) if opts.verbose: print "Writing variance information to file:",variance_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_variances, variance_outfile) else: open(variance_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_variances)) if confidence_intervals: if opts.verbose: print "Converting upper confidence interval values to BIOM format" biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base,extension = splitext(opts.output_trait_table) upper_CI_outfile = outfile_base+"_upper_CI"+suffix 
# Write the upper and then lower confidence-interval tables to sibling
# "_upper_CI"/"_lower_CI" files, in the same format as the main output.
make_output_dir_for_file(upper_CI_outfile) if opts.verbose: print "Writing upper confidence limit information to file:",upper_CI_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_upper_CI, upper_CI_outfile) else: open(upper_CI_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_upper_CI)) biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base,extension = splitext(opts.output_trait_table) lower_CI_outfile = outfile_base+"_lower_CI"+suffix make_output_dir_for_file(lower_CI_outfile) if opts.verbose: print "Writing lower confidence limit information to file",lower_CI_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_lower_CI, lower_CI_outfile) else: open(lower_CI_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_lower_CI))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading tree from file:", opts.tree # Load Tree #tree = LoadTree(opts.tree) tree = load_picrust_tree(opts.tree, opts.verbose) table_headers =[] traits={} #load the asr trait table using the previous list of functions to order the arrays if opts.reconstructed_trait_table: table_headers,traits =\ update_trait_dict_from_file(opts.reconstructed_trait_table) #Only load confidence intervals on the reconstruction #If we actually have ASR values in the analysis if opts.reconstruction_confidence: if opts.verbose: print "Loading ASR confidence data from file:",\ opts.reconstruction_confidence asr_confidence_output = open(opts.reconstruction_confidence) asr_min_vals,asr_max_vals, params,column_mapping =\ parse_asr_confidence_output(asr_confidence_output) brownian_motion_parameter = params['sigma'][0] brownian_motion_error = params['sigma'][1] if opts.verbose: print "Done. Loaded %i confidence interval values." %(len(asr_max_vals)) print "Brownian motion parameter:",brownian_motion_parameter else: brownian_motion_parameter = None #load the trait table into a dict with organism names as keys and arrays as functions table_headers,genome_traits =\ update_trait_dict_from_file(opts.observed_trait_table,table_headers) #Combine the trait tables overwriting the asr ones if they exist in the genome trait table. traits.update(genome_traits) # Specify the attribute where we'll store the reconstructions trait_label = "Reconstruction" if opts.verbose: print "Assigning traits to tree..." # Decorate tree using the traits tree = assign_traits_to_tree(traits,tree, trait_label=trait_label) if opts.reconstruction_confidence: if opts.verbose: print "Assigning trait confidence intervals to tree..." 
tree = assign_traits_to_tree(asr_min_vals,tree,\ trait_label="lower_bound") tree = assign_traits_to_tree(asr_max_vals,tree,\ trait_label="upper_bound") if opts.verbose: print "Collecting list of nodes to predict..." #Start by predict all tip nodes. nodes_to_predict = [tip.Name for tip in tree.tips()] if opts.verbose: print "Found %i nodes to predict." % len(nodes_to_predict) if opts.limit_predictions_to_organisms: organism_id_str = opts.limit_predictions_to_organisms ok_organism_ids = organism_id_str.split(',') ok_organism_ids = [n.strip() for n in ok_organism_ids] for f in set_label_conversion_fns(True,True): ok_organism_ids = [f(i) for i in ok_organism_ids] if opts.verbose: print "Limiting predictions to user-specified ids:",\ ",".join(ok_organism_ids) if not ok_organism_ids: raise RuntimeError(\ "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\ % opts.limit_predictions_to_organisms) nodes_to_predict =\ [n for n in nodes_to_predict if n in ok_organism_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the commmand-line and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0])) if opts.verbose: print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict)) if opts.limit_predictions_by_otu_table: if opts.verbose: print "Limiting predictions to ids in user-specified OTU table:",\ opts.limit_predictions_by_otu_table otu_table = open(opts.limit_predictions_by_otu_table,"U") #Parse OTU table for ids otu_ids =\ extract_ids_from_table(otu_table.readlines(),delimiter="\t") if not otu_ids: raise RuntimeError(\ "Found no valid ids in input OTU table: %s. 
Is the path correct?"\ % opts.limit_predictions_by_otu_table) nodes_to_predict =\ [n for n in nodes_to_predict if n in otu_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by OTU table resulted in an empty set of nodes to predict. Are the OTU ids and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0])) if opts.verbose: print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict)) # Calculate accuracy of PICRUST for the given tree, sequenced genomes # and set of ndoes to predict accuracy_metrics = ['NSTI'] accuracy_metric_results = None if opts.output_accuracy_metrics: if opts.verbose: print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)]) accuracy_metric_results = {} if 'NSTI' in accuracy_metrics: nsti_result,min_distances =\ calc_nearest_sequenced_taxon_index(tree,\ limit_to_tips = nodes_to_predict,\ trait_label = trait_label, verbose=opts.verbose) #accuracy_metric_results['NSTI'] = nsti_result for organism in min_distances.keys(): accuracy_metric_results[organism] = {'NSTI': min_distances[organism]} if opts.verbose: print "NSTI:", nsti_result #Write accuracy metrics to file if opts.verbose: print "Writing accuracy metrics to file:",opts.output_accuracy_metrics f = open(opts.output_accuracy_metrics,'w+') lines = ["metric\torganism\tvalue\n"] for organism in accuracy_metric_results.keys(): for metric in accuracy_metric_results[organism].keys(): lines.append('\t'.join([metric,organism,\ str(accuracy_metric_results[organism][metric])])+'\n') f.writelines(sorted(lines)) f.close() if opts.verbose: print "Generating predictions using method:",opts.prediction_method if opts.weighting_method == 'exponential': #For now, use exponential weighting weight_fn = make_neg_exponential_weight_fn(e) elif opts.weighting_method == 'linear': #Linear weight function weight_fn = linear_weight elif opts.weighting_method == 'equal_weight': 
weight_fn = equal_weight variances=None #Overwritten by methods that calc variance if opts.prediction_method == 'asr_and_weighting': if opts.reconstruction_confidence: # Perform predictions using reconstructed ancestral states predictions,variances =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ lower_bound_trait_label="lower_bound",\ upper_bound_trait_label="upper_bound",\ calc_confidence_intervals = True,\ brownian_motion_parameter=brownian_motion_parameter,\ use_self_in_prediction = True,\ weight_fn =weight_fn,verbose=opts.verbose) else: predictions =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ use_self_in_prediction = True,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'weighting_only': #Ignore ancestral information predictions =\ weighted_average_tip_prediction(tree,nodes_to_predict,\ trait_label=trait_label,\ use_self_in_prediction = True,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'nearest_neighbor': predictions = predict_nearest_neighbor(tree,nodes_to_predict,\ trait_label=trait_label,\ use_self_in_prediction = True, tips_only = True) elif opts.prediction_method == 'random_neighbor': predictions = predict_random_neighbor(tree,\ nodes_to_predict,trait_label=trait_label,\ use_self_in_prediction = True) else: error_template =\ "Prediction method '%s' is not supported. Valid methods are: %s'" error_text = error_template %(opts.prediction_method,\ ", ".join(METHOD_CHOICES)) if opts.verbose: print "Converting results to .biom format for output..." 
#convert to biom format (and transpose) biom_predictions=biom_table_from_predictions(predictions,table_headers) #In the .biom table, organisms are 'samples' and traits are 'observations #(by analogy with a metagenomic sample) #Therefore, we associate the trait variances with the per-observation metadata #print "variances:",variances #print "BIOM observations:", [o for o in biom_predictions.iterObservations()] #print "BIOM samples:", [s for s in biom_predictions.iterSamples()] if variances is not None: if opts.verbose: print "Adding variance information to output .biom table, as per-observation metadata with key 'variance'..." biom_predictions.addSampleMetadata(variances) if accuracy_metric_results is not None: if opts.verbose: print "Adding accuracy metrics (%s) to biom table as per-observation metadata..." %(",".join(accuracy_metrics)) biom_predictions.addSampleMetadata(accuracy_metric_results) #Add variance information as per observation metadata if opts.verbose: print "Writing biom format prediction results to file: ",opts.output_trait_table #write biom table to file make_output_dir_for_file(opts.output_trait_table) open(opts.output_trait_table,'w').write(\ format_biom_table(biom_predictions))
if var_id not in genome_table.ObservationIds: print "Variance table ObsId %s not in genome_table ObsIds" %var_id raise AssertionError("Variance table and genome table contain different gene ids") try: assert set(variance_table.SampleIds) == set(genome_table.SampleIds) except AssertionError,e: for var_id in variance_table.SampleIds: if var_id not in genome_table.SampleIds: print "Variance table SampleId %s not in genome_table SampleIds" %var_id raise AssertionError("Variance table and genome table contain different OTU ids") #sort the ObservationIds and SampleIds to be in the same order variance_table=variance_table.sortObservationOrder(genome_table.ObservationIds) variance_table=variance_table.sortSampleOrder(genome_table.SampleIds) make_output_dir_for_file(opts.output_metagenome_table) if opts.accuracy_metrics: # Calculate accuracy metrics weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True) samples= weighted_nsti[0] nstis = list(weighted_nsti[1]) samples_and_nstis = zip(samples,nstis) if opts.verbose: print "Writing NSTI information to file:", opts.accuracy_metrics accuracy_output_fh = open(opts.accuracy_metrics,'w') accuracy_output_fh.write("#Sample\tMetric\tValue\n") for sample,nsti in samples_and_nstis: line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti)) accuracy_output_fh.write(line)
def main():
    """Partition per-OTU contributions to predicted metagenome functions.

    Loads an OTU table and a precalculated genome trait table (optionally
    subset to the OTUs present), optionally limits output to specific
    functions or functional categories, and writes the tab-delimited
    partitioned contributions to the output file.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # Optional comma-separated list of functions to report on.
    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')

        if opts.verbose:
            print "Limiting output to only functions:", limit_to_functions
    else:
        # Empty list means "no function filter".
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ", opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_table is None):
        # precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join([opts.type_of_prediction, opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', precalc_file_name)
    else:
        input_count_table = opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table

    # Precalc files may be gzip-compressed; pick the matching opener.
    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table, 'rb')
    else:
        genome_table_fh = open(input_count_table, 'U')

    # In the genome/trait table genomes are the samples and
    # genes are the observations.
    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            # Use the OTU table information to load only rows in the
            # count table corresponding to relevant OTUs.
            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" % len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(), ids_to_load, axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(genome_table_fh)
    else:
        # Tab-delimited precalc format: convert to a biom table on the fly.
        genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load)

    ok_functional_categories = None

    metadata_type = None
    if opts.limit_to_functional_categories:
        # Category filter is pipe-separated on the command line.
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        if opts.verbose:
            print "Limiting to functional categories: %s" % (str(ok_functional_categories))

        # Either KEGG_Pathways or COG_Category needs
        # to be assigned to metadata_key to limit to
        # functional categories (not needed for
        # individual functions)
        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            # rfam predictions carry no category metadata; abort cleanly.
            exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")

    partitioned_metagenomes = partition_metagenome_contributions(otu_table, genome_table, limit_to_functions=limit_to_functions,\
        limit_to_functional_categories=ok_functional_categories, metadata_key=metadata_type)

    # Flatten result rows to one tab-delimited line each.
    output_text = "\n".join(["\t".join(map(str, i)) for i in partitioned_metagenomes])

    if opts.verbose:
        print "Writing results to output file: ", opts.output_fp

    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp, 'w').write(output_text)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) verbose = opts.verbose min_args = 1 if len(args) < min_args: option_parser.error( 'One or more predicted biom files must be provided.') observed_files = args make_output_dir_for_file(opts.output_fp) out_fh = open(opts.output_fp, 'w') if verbose: print "Loading expected trait table file:", opts.exp_trait_table_fp exp_table = load_table(opts.exp_trait_table_fp) header_printed = False header_keys = [] delimiter = "\t" for observed_file in observed_files: observed_file_name = basename(observed_file) if verbose: print "Loading predicted trait table file:", observed_file_name obs_table = load_table(observed_file) if opts.compare_observations: if verbose: print "Transposing tables to allow evaluation of observations (instead of samples)..." obs_table = obs_table.transpose() exp_table = exp_table.transpose() if verbose: print "Matching predicted and expected tables..." obs, exp = match_biom_tables( obs_table, exp_table, verbose=verbose, limit_to_expected_observations=opts.limit_to_expected_observations, limit_to_observed_observations=opts.limit_to_observed_observations, normalize=opts.normalize, shuffle_samples=opts.shuffle_samples) if verbose: print "Calculating accuracy stats for all observations..." 
#import pdb; pdb.set_trace() for i in obs: if verbose: print "Calculating stats for: ", i if opts.not_relative_abundance_scores: results = calculate_accuracy_stats_from_observations( obs[i], exp[i], success_criterion='binary') else: results = calculate_accuracy_stats_from_observations( obs[i], exp[i], success_criterion='ra_exact') #If first pass then print out header if not header_printed: header_printed = True header_keys = sorted(results.keys()) out_fh.write( delimiter.join(['file', 'label'] + header_keys) + "\n") #print results using same order as header values = [observed_file_name, i ] + ['{0:.3g}'.format(results[x]) for x in header_keys] out_str = delimiter.join(map(str, values)) + "\n" out_fh.write(out_str)
def main():
    """Predict a metagenome from an OTU table and a precalculated trait table.

    Loads both tables (subsetting the trait table to the OTUs present unless
    suppressed), optionally computes and writes weighted NSTI accuracy
    metrics, multiplies OTU abundances through genome trait counts, and
    writes the predicted metagenome as biom or tab-delimited output.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ", opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table, 'U'))

    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." % (len(otu_table.SampleIds), len(otu_table.ObservationIds))

    # Default to the bundled precalculated table for the prediction type.
    # NOTE(review): if --input_count_table is omitted and type_of_prediction
    # is neither 'KO' nor 'COG', input_count_table is never assigned and the
    # next use raises NameError — confirm the option parser restricts the
    # choices.
    if(opts.input_count_table is None):
        if(opts.type_of_prediction == 'KO'):
            input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', 'ko_precalculated.biom.gz')
        elif(opts.type_of_prediction == 'COG'):
            input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', 'cog_precalculated.biom.gz')
    else:
        input_count_table = opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    # Precalc tables may be gzip-compressed.
    if (ext == '.gz'):
        genome_table_str = gzip.open(input_count_table, 'rb').read()
    else:
        genome_table_str = open(input_count_table, 'U').read()

    # In the genome/trait table genomes are the samples and
    # genes are the observations.
    if not opts.suppress_subset_loading:
        # Use the OTU table information to load only rows in the count
        # table corresponding to relevant OTUs.
        ids_to_load = otu_table.ObservationIds

        if opts.verbose:
            print "Loading traits for %i organisms from the trait table" % len(ids_to_load)

        genome_table = load_subset_from_biom_str(genome_table_str, ids_to_load, axis='samples')
    else:
        if opts.verbose:
            print "Loading *full* trait table because --suppress_subset_loading was passed. This may result in high memory usage."
        genome_table = parse_biom_table(genome_table_str)

    if opts.verbose:
        print "Done loading trait table containing %i functions for %i organisms." % (len(genome_table.ObservationIds), len(genome_table.SampleIds))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        # (unweighted NSTI computation left disabled below.)
        #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False)
        #print "Unweighted NSTI:", unweighted_nsti

        weighted_nsti = calc_nsti(otu_table, genome_table, weighted=True)
        # calc_nsti returns parallel sequences of sample ids and NSTI values.
        samples = weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        samples_and_nstis = zip(samples, nstis)

        # Build tab-delimited lines; sorted for deterministic output.
        lines = ["#Sample\tMetric\tValue\n"]
        for sample, nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" % (sample, str(nsti))
            lines.append(line)

        if opts.verbose:
            for l in sorted(lines):
                print l
        if opts.verbose:
            print "Writing accuracy information to file:", opts.accuracy_metrics
        open(opts.accuracy_metrics, 'w').writelines(sorted(lines))

    if opts.verbose:
        print "Predicting the metagenome..."

    predicted_metagenomes = predict_metagenomes(otu_table, genome_table)

    if opts.verbose:
        print "Writing results to output file: ", opts.output_metagenome_table

    make_output_dir_for_file(opts.output_metagenome_table)
    # Tab-delimited output joins nested KEGG pathway lists as
    # 'a; b|c; d'; otherwise write standard biom format.
    if(opts.format_tab_delimited):
        open(opts.output_metagenome_table, 'w').write(predicted_metagenomes.delimitedSelf(header_key="KEGG Pathways", header_value="KEGG Pathways", metadata_formatter=lambda s: '|'.join(['; '.join(l) for l in s])))
    else:
        open(opts.output_metagenome_table, 'w').write(format_biom_table(predicted_metagenomes))
def main():
    """Normalize an OTU table by predicted 16S copy number (biom-1 API).

    Loads the OTU table (classic or biom format), looks up each OTU's copy
    number in a precalculated count table, filters the OTU table to OTUs
    with known copy numbers, divides abundances by copy number, and writes
    the normalized biom table.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # NOTE(review): input_ext is computed but never used.
    input_ext = path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table = parse_classic_table_to_rich_table(open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable)
    else:
        try:
            otu_table = parse_biom_table(open(opts.input_otu_fp, 'U'))
        except ValueError:
            raise ValueError("Error loading OTU table! If not in BIOM format use '-f' option.\n")

    ids_to_load = otu_table.ObservationIds

    if(opts.input_count_fp is None):
        # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(['16S', opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    # Precalc tables may be gzip-compressed.
    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')

    if opts.load_precalc_file_in_biom:
        count_table = parse_biom_table(count_table_fh.read())
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    # Need to only keep data relevant to our otu list.
    # iterObservations yields (values, id, metadata); x[1] is the OTU id.
    ids = []
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    # In the count table OTUs are samples; the single observation row
    # holds the copy-number values.
    ob_id = count_table.ObservationIds[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table = table_factory(filtered_values, otu_table.SampleIds, filtered_otus, constructor=DenseOTUTable)

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))
        except ValueError:
            # NOTE(review): the message promises the OTU ID but interpolates
            # the offending value instead — confirm intended output.
            raise ValueError,\
                "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)

    # Divide each OTU's abundances by its copy-number metadata value.
    normalized_table = filtered_otu_table.normObservationByMetadata(opts.metadata_identifer)

    # move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'ObservationMetadata')
    # NOTE(review): the sample-metadata transfer result is bound to
    # normalized_otu_table, but normalized_table (without it) is what gets
    # written below — confirm whether sample metadata should be preserved.
    normalized_otu_table = transfer_sample_metadata(otu_table, normalized_table, 'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp, 'w').write(format_biom_table(normalized_table))
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) otu_table = load_table(opts.input_otu_fp) ids_to_load = otu_table.ids(axis="observation") if opts.input_count_fp is None: # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz) precalc_file_name = "_".join(["16S", opts.gg_version, "precalculated.tab.gz"]) input_count_table = join(get_picrust_project_dir(), "picrust", "data", precalc_file_name) else: input_count_table = opts.input_count_fp if opts.verbose: print "Loading trait table: ", input_count_table ext = path.splitext(input_count_table)[1] if ext == ".gz": count_table_fh = gzip.open(input_count_table, "rb") else: count_table_fh = open(input_count_table, "U") if opts.load_precalc_file_in_biom: count_table = load_table(count_table_fh) else: count_table = convert_precalc_to_biom(count_table_fh, ids_to_load) # Need to only keep data relevant to our otu list ids = [] for x in otu_table.iter(axis="observation"): ids.append(str(x[1])) ob_id = count_table.ids(axis="observation")[0] filtered_otus = [] filtered_values = [] for x in ids: if count_table.exists(x, axis="sample"): filtered_otus.append(x) filtered_values.append(otu_table.data(x, axis="observation")) filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids()) copy_numbers_filtered = {} for x in filtered_otus: value = count_table.get_value_by_ids(ob_id, x) try: # data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError, "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." 
copy_numbers_filtered[x] = {opts.metadata_identifer: value} filtered_otu_table.add_metadata(copy_numbers_filtered, axis="observation") def metadata_norm(v, i, md): return v / float(md[opts.metadata_identifer]) normalized_table = filtered_otu_table.transform(metadata_norm, axis="observation") # move Observation Metadata from original to filtered OTU table normalized_table = transfer_observation_metadata(otu_table, normalized_table, "observation") make_output_dir_for_file(opts.output_otu_fp) write_biom_table(normalized_table, opts.output_otu_fp)
def main():
    """Run predict_traits.py in parallel over batches of tree tips.

    Splits the tree's tips into jobs of 1000 tips, writes one predict_traits
    command per batch to a temporary jobs file, submits them via the chosen
    cluster/multithreaded runner, waits for all count outputs, then combines
    the per-batch outputs (counts, and optionally variances/CI bounds) into
    the final output file(s) and deletes the temporary files.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    # Run the jobs
    script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py')

    # Choose the job-submission wrapper for the requested parallel backend.
    if (opts.parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs_sge.py')
    elif (opts.parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs.py')
    elif (opts.parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs_torque.py')
    else:
        # NOTE(review): raised without a message — presumably unreachable if
        # the option parser restricts choices; confirm.
        raise RuntimeError

    if (opts.verbose):
        print "Loading tree..."

    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]

    if (opts.verbose):
        print "Total number of possible tips to predict: {0}".format(
            len(all_tips))

    created_tmp_files = []
    output_files = {}
    output_files['counts'] = []
    # Confidence output adds three parallel per-batch file lists.
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        # Skip tips already present in a previous precalc output.
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(
                len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    if (opts.verbose):
        print "Creating temporary input files in: ", tmp_dir

    # Batch tips into fixed-size chunks, one predict_traits job per chunk.
    num_tips_per_job = 1000
    for tips_to_predict in [
            all_tips[i:i + num_tips_per_job]
            for i in range(0, len(all_tips), num_tips_per_job)
    ]:

        #create tmp output files
        tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str = ','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            # Derive the sibling output paths predict_traits will create.
            outfile_base, extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base + "_variances.tab")
            output_files['upper_CI'].append(outfile_base + "_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base + "_lower_CI.tab")

            #create the job command
            cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, opts.reconstruction_confidence,
                tip_to_predict_str, tmp_output_fp)

        else:
            cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, tip_to_predict_str,
                tmp_output_fp)

        #NOTE: Calculating NSTI this way is convenient,
        #but would probably be faster if we ran the NSTI calculation separate (using the --output_accuracy_metrics_only) and added it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd = cmd + " -a"

        #add job command to the the jobs file
        jobs.write(cmd + "\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])

    if (opts.verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp,
                jobs_fp,
                job_prefix,
                num_jobs=opts.num_jobs,
                delay=opts.delay)

    if (opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if (opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base, extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for " + predict_type
        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print "Writing combined file for " + predict_type

        if predict_type == 'counts':
            #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table, 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table, 'w').write(combined_predictions)

        else:
            # variances/upper_CI/lower_CI go to sibling files of the main output.
            if opts.output_precalc_file_in_biom:
                open(outfile_base + "_" + predict_type + ".biom", 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base + "_" + predict_type + ".tab",
                     'w').write(combined_predictions)

    #clean up all tmp files
    # NOTE(review): loop variable shadows the builtin `file` (py2).
    for file in created_tmp_files:
        remove(file)
def main():
    """Run predict_traits.py in parallel over batches of tree tips.

    Splits the tree's tips into jobs of 1000 tips, writes one predict_traits
    command per batch to a temporary jobs file, submits them via the chosen
    cluster/multithreaded runner, waits for all count outputs, then combines
    the per-batch outputs (counts, and optionally variances/CI bounds) into
    the final output file(s) and deletes the temporary files.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    tmp_dir='jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    # Choose the job-submission wrapper for the requested parallel backend.
    if(opts.parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(opts.parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(opts.parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        # NOTE(review): raised without a message — presumably unreachable if
        # the option parser restricts choices; confirm.
        raise RuntimeError

    if(opts.verbose):
        print "Loading tree..."

    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]

    if(opts.verbose):
        print "Total number of possible tips to predict: {0}".format(len(all_tips))

    created_tmp_files=[]
    output_files={}
    output_files['counts']=[]
    # Confidence output adds three parallel per-batch file lists.
    if opts.reconstruction_confidence:
        output_files['variances']=[]
        output_files['upper_CI']=[]
        output_files['lower_CI']=[]

    if opts.already_calculated:
        # Skip tips already present in a previous precalc output.
        all_tips=get_tips_not_in_precalc(all_tips,opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(opts.verbose):
        print "Creating temporary input files in: ",tmp_dir

    # Batch tips into fixed-size chunks, one predict_traits job per chunk.
    num_tips_per_job=1000
    for tips_to_predict in [all_tips[i:i+num_tips_per_job] for i in range(0, len(all_tips), num_tips_per_job)]:

        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str=','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            # Derive the sibling output paths predict_traits will create.
            outfile_base,extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base+"_variances.tab")
            output_files['upper_CI'].append(outfile_base+"_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base+"_lower_CI.tab")

            #create the job command
            cmd= "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, opts.reconstruction_confidence, tip_to_predict_str, tmp_output_fp)

        else:
            cmd= "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, tip_to_predict_str, tmp_output_fp)

        #NOTE: Calculating NSTI this way is convenient,
        #but would probably be faster if we ran the NSTI calculation separate (using the --output_accuracy_metrics_only) and added it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd=cmd+" -a"

        #add job command to the the jobs file
        jobs.write(cmd+"\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])

    if(opts.verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix='picrust'
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs,delay=opts.delay)

    if(opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if(opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base,extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for "+ predict_type
        combined_predictions=combine_predict_trait_output(output_files[predict_type])

        if opts.verbose:
            print "Writing combined file for "+predict_type

        if predict_type == 'counts':
            #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table,'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table,'w').write(combined_predictions)

        else:
            # variances/upper_CI/lower_CI go to sibling files of the main output.
            if opts.output_precalc_file_in_biom:
                open(outfile_base+"_"+predict_type+".biom",'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base+"_"+predict_type+".tab",'w').write(combined_predictions)

    #clean up all tmp files
    # NOTE(review): loop variable shadows the builtin `file` (py2).
    for file in created_tmp_files:
        remove(file)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) #if we specify we want NSTI only then we have to calculate it first if opts.output_accuracy_metrics_only: opts.calculate_accuracy_metrics = True if opts.verbose: print "Loading tree from file:", opts.tree # Load Tree #tree = LoadTree(opts.tree) tree = load_picrust_tree(opts.tree, opts.verbose) table_headers = [] traits = {} #load the asr trait table using the previous list of functions to order the arrays if opts.reconstructed_trait_table: table_headers,traits =\ update_trait_dict_from_file(opts.reconstructed_trait_table) #Only load confidence intervals on the reconstruction #If we actually have ASR values in the analysis if opts.reconstruction_confidence: if opts.verbose: print "Loading ASR confidence data from file:",\ opts.reconstruction_confidence print "Assuming confidence data is of type:", opts.confidence_format asr_confidence_output = open(opts.reconstruction_confidence) asr_min_vals,asr_max_vals, params,column_mapping =\ parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format) if 'sigma' in params: brownian_motion_parameter = params['sigma'][0] else: brownian_motion_parameter = None if opts.verbose: print "Done. Loaded %i confidence interval values." % ( len(asr_max_vals)) print "Brownian motion parameter:", brownian_motion_parameter else: brownian_motion_parameter = None #load the trait table into a dict with organism names as keys and arrays as functions table_headers,genome_traits =\ update_trait_dict_from_file(opts.observed_trait_table,table_headers) #Combine the trait tables overwriting the asr ones if they exist in the genome trait table. traits.update(genome_traits) # Specify the attribute where we'll store the reconstructions trait_label = "Reconstruction" if opts.verbose: print "Assigning traits to tree..." 
# Decorate tree using the traits tree = assign_traits_to_tree(traits, tree, trait_label=trait_label) if opts.reconstruction_confidence: if opts.verbose: print "Assigning trait confidence intervals to tree..." tree = assign_traits_to_tree(asr_min_vals,tree,\ trait_label="lower_bound") tree = assign_traits_to_tree(asr_max_vals,tree,\ trait_label="upper_bound") if brownian_motion_parameter is None: if opts.verbose: print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..." brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\ upper_bound_trait_label="upper_bound",\ lower_bound_trait_label="lower_bound",\ trait_label=trait_label,\ confidence=0.95) if opts.verbose: print "Inferred the following rate parameters:", brownian_motion_parameter if opts.verbose: print "Collecting list of nodes to predict..." #Start by predict all tip nodes. nodes_to_predict = [tip.Name for tip in tree.tips()] if opts.verbose: print "Found %i nodes to predict." % len(nodes_to_predict) if opts.limit_predictions_to_organisms: organism_id_str = opts.limit_predictions_to_organisms ok_organism_ids = organism_id_str.split(',') ok_organism_ids = [n.strip() for n in ok_organism_ids] for f in set_label_conversion_fns(True, True): ok_organism_ids = [f(i) for i in ok_organism_ids] if opts.verbose: print "Limiting predictions to user-specified ids:",\ ",".join(ok_organism_ids) if not ok_organism_ids: raise RuntimeError(\ "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\ % opts.limit_predictions_to_organisms) nodes_to_predict =\ [n for n in nodes_to_predict if n in ok_organism_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the commmand-line and tree ids in the same format? 
Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0])) if opts.verbose: print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" % ( len(nodes_to_predict)) if opts.limit_predictions_by_otu_table: if opts.verbose: print "Limiting predictions to ids in user-specified OTU table:",\ opts.limit_predictions_by_otu_table otu_table = open(opts.limit_predictions_by_otu_table, "U") #Parse OTU table for ids otu_ids =\ extract_ids_from_table(otu_table.readlines(),delimiter="\t") if not otu_ids: raise RuntimeError(\ "Found no valid ids in input OTU table: %s. Is the path correct?"\ % opts.limit_predictions_by_otu_table) nodes_to_predict =\ [n for n in nodes_to_predict if n in otu_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by OTU table resulted in an empty set of nodes to predict. Are the OTU ids and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0])) if opts.verbose: print "After filtering by OTU table, %i nodes remain to be predicted" % ( len(nodes_to_predict)) # Calculate accuracy of PICRUST for the given tree, sequenced genomes # and set of ndoes to predict accuracy_metrics = ['NSTI'] accuracy_metric_results = None if opts.calculate_accuracy_metrics: if opts.verbose: print "Calculating accuracy metrics: %s" % ( [",".join(accuracy_metrics)]) accuracy_metric_results = {} if 'NSTI' in accuracy_metrics: nsti_result,min_distances =\ calc_nearest_sequenced_taxon_index(tree,\ limit_to_tips = nodes_to_predict,\ trait_label = trait_label, verbose=opts.verbose) #accuracy_metric_results['NSTI'] = nsti_result for organism in min_distances.keys(): accuracy_metric_results[organism] = { 'NSTI': min_distances[organism] } if opts.verbose: print "NSTI:", nsti_result if opts.output_accuracy_metrics_only: #Write accuracy metrics to file if opts.verbose: print "Writing accuracy 
metrics to file:", opts.output_accuracy_metrics f = open(opts.output_accuracy_metrics_only, 'w+') f.write("metric\torganism\tvalue\n") lines = [] for organism in accuracy_metric_results.keys(): for metric in accuracy_metric_results[organism].keys(): lines.append('\t'.join([metric,organism,\ str(accuracy_metric_results[organism][metric])])+'\n') f.writelines(sorted(lines)) f.close() exit() if opts.verbose: print "Generating predictions using method:", opts.prediction_method if opts.weighting_method == 'exponential': #For now, use exponential weighting weight_fn = make_neg_exponential_weight_fn(e) variances = None #Overwritten by methods that calc variance confidence_intervals = None #Overwritten by methods that calc variance if opts.prediction_method == 'asr_and_weighting': # Perform predictions using reconstructed ancestral states if opts.reconstruction_confidence: predictions,variances,confidence_intervals =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ lower_bound_trait_label="lower_bound",\ upper_bound_trait_label="upper_bound",\ calc_confidence_intervals = True,\ brownian_motion_parameter=brownian_motion_parameter,\ weight_fn =weight_fn,verbose=opts.verbose) else: predictions =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'weighting_only': #Ignore ancestral information predictions =\ weighted_average_tip_prediction(tree,nodes_to_predict,\ trait_label=trait_label,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'nearest_neighbor': predictions = predict_nearest_neighbor(tree,nodes_to_predict,\ trait_label=trait_label,tips_only = True) elif opts.prediction_method == 'random_neighbor': predictions = predict_random_neighbor(tree,\ nodes_to_predict,trait_label=trait_label) if opts.verbose: print "Done making predictions." 
make_output_dir_for_file(opts.output_trait_table) out_fh = open(opts.output_trait_table, 'w') #Generate the table of biom predictions if opts.verbose: print "Converting results to .biom format for output..." biom_predictions=biom_table_from_predictions(predictions,table_headers,\ observation_metadata=None,\ sample_metadata=accuracy_metric_results,convert_to_int=False) if opts.verbose: print "Writing prediction results to file: ", opts.output_trait_table if opts.output_precalc_file_in_biom: #write biom table to file write_biom_table(biom_predictions, opts.output_trait_table) else: #convert to precalc (tab-delimited) format out_fh = open(opts.output_trait_table, 'w') out_fh.write(convert_biom_to_precalc(biom_predictions)) out_fh.close() #Write out variance information to file if variances: if opts.verbose: print "Converting variances to BIOM format" if opts.output_precalc_file_in_biom: suffix = '.biom' else: suffix = '.tab' biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base, extension = splitext(opts.output_trait_table) variance_outfile = outfile_base + "_variances" + suffix make_output_dir_for_file(variance_outfile) if opts.verbose: print "Writing variance information to file:", variance_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_variances, variance_outfile) else: open(variance_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_variances)) if confidence_intervals: if opts.verbose: print "Converting upper confidence interval values to BIOM format" biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base, extension = splitext(opts.output_trait_table) upper_CI_outfile = outfile_base + "_upper_CI" + suffix 
make_output_dir_for_file(upper_CI_outfile) if opts.verbose: print "Writing upper confidence limit information to file:", upper_CI_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_upper_CI, upper_CI_outfile) else: open(upper_CI_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_upper_CI)) biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base, extension = splitext(opts.output_trait_table) lower_CI_outfile = outfile_base + "_lower_CI" + suffix make_output_dir_for_file(lower_CI_outfile) if opts.verbose: print "Writing lower confidence limit information to file", lower_CI_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_lower_CI, lower_CI_outfile) else: open(lower_CI_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_lower_CI))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.limit_to_function: limit_to_functions = opts.limit_to_function.split(',') if opts.verbose: print "Limiting output to only functions:",limit_to_functions else: limit_to_functions = [] if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ids_to_load = otu_table.ObservationIds if(opts.input_count_table is None): #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz) precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz']) input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name) else: input_count_table=opts.input_count_table if opts.verbose: print "Loading trait table: ", input_count_table ext=path.splitext(input_count_table)[1] if opts.verbose: print "Loading count table: ", input_count_table if (ext == '.gz'): genome_table_fh = gzip.open(input_count_table,'rb') else: genome_table_fh = open(input_count_table,'U') #In the genome/trait table genomes are the samples and #genes are the observations if opts.load_precalc_file_in_biom: if not opts.suppress_subset_loading: #Now we want to use the OTU table information #to load only rows in the count table corresponding #to relevant OTUs if opts.verbose: print "Loading traits for %i organisms from the trait table" %len(ids_to_load) genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples') else: if opts.verbose: print "Loading *full* count table because --suppress_subset_loading was passed. 
This may result in high memory usage" genome_table = parse_biom_table(genome_table_fh.read()) else: genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load) partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions) output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes]) if opts.verbose: print "Writing results to output file: ",opts.output_fp make_output_dir_for_file(opts.output_fp) open(opts.output_fp,'w').write(output_text)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) verbose=opts.verbose min_args = 1 if len(args) < min_args: option_parser.error('One or more predicted biom files must be provided.') observed_files=args make_output_dir_for_file(opts.output_fp) out_fh=open(opts.output_fp,'w') if verbose: print "Loading expected trait table file:",opts.exp_trait_table_fp exp_table =parse_biom_table(open(opts.exp_trait_table_fp,'U')) header_printed=False header_keys=[] delimiter="\t" for observed_file in observed_files: observed_file_name=basename(observed_file) if verbose: print "Loading predicted trait table file:",observed_file_name obs_table =parse_biom_table(open(observed_file,'U')) if opts.compare_observations: if verbose: print "Transposing tables to allow evaluation of observations (instead of samples)..." obs_table=transpose_biom(obs_table) exp_table=transpose_biom(exp_table) if verbose: print "Matching predicted and expected tables..." obs,exp=match_biom_tables(obs_table,exp_table,verbose=verbose,limit_to_expected_observations=opts.limit_to_expected_observations,limit_to_observed_observations=opts.limit_to_observed_observations,normalize=opts.normalize,shuffle_samples=opts.shuffle_samples) if verbose: print "Calculating accuracy stats for all observations..." #import pdb; pdb.set_trace() for i in obs: if verbose: print "Calculating stats for: ",i if opts.not_relative_abundance_scores: results=calculate_accuracy_stats_from_observations(obs[i],exp[i],success_criterion='binary') else: results=calculate_accuracy_stats_from_observations(obs[i],exp[i],success_criterion='ra_exact') #If first pass then print out header if not header_printed: header_printed=True header_keys=sorted(results.keys()) out_fh.write(delimiter.join(['file','label']+header_keys)+"\n") #print results using same order as header values=[observed_file_name,i]+['{0:.3g}'.format(results[x]) for x in header_keys] out_str=delimiter.join(map(str,values))+"\n" out_fh.write(out_str)
def main():
    """Partition per-function metagenome contributions across contributing OTUs.

    biom 2.x code path (``load_table``/``Table.ids``). Steps:
      1. Optionally restrict output to a comma-separated list of function ids
         (-l) and/or to pipe-separated functional categories (-f).
      2. Load the OTU table and the per-genome trait (count) table; when no
         count table is supplied, fall back to the packaged precalculated
         file named <type>_<gg_version>_precalculated.tab.gz.
      3. Call partition_metagenome_contributions() and write the rows as a
         tab-delimited text file to opts.output_fp.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:", limit_to_functions
    else:
        # Empty list means "keep every function".
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ", opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    # Observation ids of the OTU table are the OTU ids; used to subset the
    # (potentially huge) genome table below.
    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join([opts.type_of_prediction, opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', precalc_file_name)
    else:
        input_count_table = opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table

    # Precalculated tables ship gzip-compressed; open accordingly.
    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table, 'rb')
    else:
        genome_table_fh = open(input_count_table, 'U')

    #In the genome/trait table genomes are the samples and
    #genes are the observations
    # NOTE(review): genome_table_fh is never closed; the script relies on
    # process exit to release it.
    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs
            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" % len(ids_to_load)
            genome_table = load_subset_from_biom_str(genome_table_fh.read(), ids_to_load, axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(genome_table_fh)
    else:
        # Tab-delimited precalc format: convert to a BIOM table, keeping only
        # the requested genome ids.
        genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load)

    ok_functional_categories = None

    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        if opts.verbose:
            print "Limiting to functional categories: %s" % (str(ok_functional_categories))

        # Either KEGG_Pathways or COG_Category needs
        # to be assigned to metadata_key to limit to
        # functional categories (not needed for
        # individual functions)
        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")

    partitioned_metagenomes = partition_metagenome_contributions(otu_table, genome_table, limit_to_functions=limit_to_functions,\
        limit_to_functional_categories = ok_functional_categories , metadata_key = metadata_type )

    # Each row of the result is a sequence of fields; serialize as TSV.
    output_text = "\n".join(["\t".join(map(str, i)) for i in partitioned_metagenomes])

    if opts.verbose:
        print "Writing results to output file: ", opts.output_fp

    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp, 'w').write(output_text)