def main():
    """Collapse a table's observations by a metadata category and write it.

    Parses the command line, rejects a non-positive ``opts.level``, loads
    the table at ``opts.input_fp``, collapses the observation axis using the
    function returned by ``make_collapse_f`` and writes the result to
    ``opts.output_fp`` as TSV (when ``opts.format_tab_delimited`` is set) or
    through ``write_biom_table`` otherwise.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)

    if h5py.is_hdf5(opts.input_fp):
        # metadata are not deserializing correctly. Duct tape it.
        update_d = {}
        for i, md in zip(table.ids(axis='observation'),
                         table.metadata(axis='observation')):
            update_d[i] = {k: json.loads(v[0]) for k, v in md.items()}
        table.add_metadata(update_d, axis='observation')

    result = table.collapse(collapse_f, axis='observation', one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)

    if opts.format_tab_delimited:
        # ``with`` guarantees the handle is closed even if serialisation or
        # the write itself raises.
        with open(opts.output_fp, 'w') as f:
            f.write(result.to_tsv(header_key=opts.metadata_category,
                                  header_value=opts.metadata_category,
                                  metadata_formatter=lambda s: '; '.join(s)))
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading sequencing depth table: ",opts.input_seq_depth_file scaling_factors = {} for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')): scaling_factors[sample_id]=depth ext=path.splitext(opts.input_count_table)[1] if opts.verbose: print "Loading count table: ", opts.input_count_table if (ext == '.gz'): genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb')) else: genome_table = parse_biom_table(open(opts.input_count_table,'U')) if opts.verbose: print "Scaling the metagenome..." scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors) if opts.verbose: print "Writing results to output file: ",opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) open(opts.output_metagenome_table,'w').write(format_biom_table(scaled_metagenomes))
def main():
    """Build (and optionally submit) torque jobs from a commands file.

    Expects exactly two positional arguments: the commands file and a job
    prefix of 1-10 characters.  Creates ``opts.job_dir`` when it does not
    exist, builds the job files with ``make_torque_jobs`` and, when
    ``opts.submit_jobs`` is set, hands them to ``submit_cluster_jobs``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '+\
                            'it\'s for backwards-compatibility.)')

    min_args = 2
    if len(args) != min_args:
        option_parser.error('Program requires <commands file> and <job prefix>')

    if (len(args[1])>10 or len(args[1])==0):
        option_parser.error('job prefix must be 1-10 characters long')

    # Read the commands eagerly and close the handle straight away.
    with open(args[0]) as commands_f:
        commands = list(commands_f)
    job_prefix = args[1]

    if not exists(opts.job_dir):
        try:
            makedirs(opts.job_dir)
        except OSError:
            exit(" Jobs directory can not be created. "
                 +"Check for permissions or file with the same name: %s\n"
                 % opts.job_dir)

    if opts.make_jobs:
        filenames = make_torque_jobs(commands, job_prefix, opts.queue,
                                     opts.job_dir, opts.num_jobs)
    else:
        exit("Should we ever get here???")

    if opts.submit_jobs:
        submit_cluster_jobs(filenames, opts.verbose)
def main():
    """Build (and optionally submit) SGE jobs from a commands file.

    Expects exactly two positional arguments: the commands file and a job
    prefix of 1-10 characters.  Creates ``opts.job_dir`` when it does not
    exist, builds the job files with ``make_sge_jobs`` and, when
    ``opts.submit_jobs`` is set, submits them with ``submit_cluster_jobs``
    using ``opts.delay``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '+\
                            'it\'s for backwards-compatibility.)')

    min_args = 2
    if len(args) != min_args:
        option_parser.error(
            'Program requires <commands file> and <job prefix>')

    if (len(args[1]) > 10 or len(args[1]) == 0):
        option_parser.error('job prefix must be 1-10 characters long')

    # Read the commands eagerly so the handle can be closed immediately.
    with open(args[0]) as commands_f:
        commands = list(commands_f)
    job_prefix = args[1]

    if not exists(opts.job_dir):
        try:
            makedirs(opts.job_dir)
        except OSError:
            exit(" Jobs directory can not be created. " +
                 "Check for permissions or file with the same name: %s\n"
                 % opts.job_dir)

    if opts.make_jobs:
        filenames = make_sge_jobs(commands, job_prefix, opts.queue,
                                  opts.job_dir, opts.num_jobs)
    else:
        exit("Should we ever get here???")

    if opts.submit_jobs:
        submit_cluster_jobs(filenames, opts.verbose, delay=opts.delay)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) metadata_file_delimiter = ',' mapping_file_delimiter = '\t' #New strategy: #Open output file print "Opening output file:",opts.output_file outfile = open(opts.output_file,'w+') #Load old QIIME mapping file. Parse header line, then for each data line, #insert new fields just before description print "Loading input QIIME mapping file:",opts.input_mapping_file mapping_file = open(opts.input_mapping_file,'U') event_column,event_state = opts.event.split(':') time_column = opts.time_column result = relative_date_info_from_mapping(mapping_file,time_column,event_column, event_state,individual_column="Individual") #print "Result:", result for l in result: line_to_print = "\t".join(map(str,l))+"\n" outfile.write(line_to_print) print line_to_print.strip() print "Done. Output saved to:",opts.output_file outfile.close()
def main():
    """Run ancestral state reconstruction and write the result tables.

    In parallel mode the work is delegated to ``run_asr_in_parallel``.
    Otherwise ``opts.asr_method`` selects ``wagner_for_picrust`` or
    ``ace_for_picrust`` (modes ``ML``, ``pic`` or ``REML``).  The
    reconstruction table goes to ``opts.output_fp``; for every method except
    ``wagner`` the confidence-interval table goes to ``opts.output_ci_fp``.

    A method with no serial implementation (e.g. ``bayestraits``) now ends
    with a parser error instead of a NameError on the unbound ``asr_table``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # asr_method -> mode string understood by ace_for_picrust
    ace_modes = {'ace_ml': 'ML', 'ace_pic': 'pic', 'ace_reml': 'REML'}

    if opts.parallel:
        tmp_dir = 'jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(
            tree=opts.input_tree_fp, table=opts.input_trait_table_fp,
            asr_method=opts.asr_method,
            parallel_method=opts.parallel_method,
            num_jobs=opts.num_jobs, tmp_dir=tmp_dir, verbose=opts.verbose)
    elif opts.asr_method == 'wagner':
        asr_table = wagner_for_picrust(opts.input_tree_fp,
                                       opts.input_trait_table_fp,
                                       HALT_EXEC=opts.debug)
    elif opts.asr_method in ace_modes:
        asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                              opts.input_trait_table_fp,
                                              ace_modes[opts.asr_method],
                                              HALT_EXEC=opts.debug)
    else:
        # Previously this fell through (``pass``) and crashed later.
        option_parser.error(
            "ASR method '%s' is not implemented for non-parallel execution."
            % opts.asr_method)

    # output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp, sep='\t')

    # output the CI file (unless the method is wagner)
    if not (opts.asr_method == 'wagner'):
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp, sep='\t')
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.limit_to_function: limit_to_functions = opts.limit_to_function.split(',') if opts.verbose: print "Limiting output to only functions:",limit_to_functions else: limit_to_functions = [] if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ext=path.splitext(opts.input_count_table)[1] if opts.verbose: print "Loading count table: ", opts.input_count_table if (ext == '.gz'): genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb')) else: genome_table = parse_biom_table(open(opts.input_count_table,'U')) if opts.verbose: print "Predicting the metagenome..." partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions) output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes]) if opts.verbose: print "Writing results to output file: ",opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) open(opts.output_metagenome_table,'w').write(output_text)
def main():
    """Reconstruct ancestral states and write the ASR (and CI) tables.

    ``opts.parallel`` routes the job through ``run_asr_in_parallel``;
    otherwise ``opts.asr_method`` picks ``wagner_for_picrust`` or one of the
    ``ace_for_picrust`` modes.  The reconstruction is written to
    ``opts.output_fp`` and, unless the method is ``wagner``, the confidence
    intervals to ``opts.output_ci_fp``.

    Methods without a serial implementation (the ``bayestraits`` branch was
    a bare ``pass``) are rejected up front via ``option_parser.error``
    rather than failing later with a NameError.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    method = opts.asr_method
    tree_fp = opts.input_tree_fp
    trait_fp = opts.input_trait_table_fp

    if opts.parallel:
        tmp_dir = 'jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(
            tree=tree_fp, table=trait_fp, asr_method=method,
            parallel_method=opts.parallel_method, num_jobs=opts.num_jobs,
            tmp_dir=tmp_dir, verbose=opts.verbose)
    else:
        # call the appropriate ASR app controller
        if method == 'wagner':
            asr_table = wagner_for_picrust(tree_fp, trait_fp,
                                           HALT_EXEC=opts.debug)
        elif method == 'ace_ml':
            asr_table, ci_table = ace_for_picrust(tree_fp, trait_fp, 'ML',
                                                  HALT_EXEC=opts.debug)
        elif method == 'ace_pic':
            asr_table, ci_table = ace_for_picrust(tree_fp, trait_fp, 'pic',
                                                  HALT_EXEC=opts.debug)
        elif method == 'ace_reml':
            asr_table, ci_table = ace_for_picrust(tree_fp, trait_fp, 'REML',
                                                  HALT_EXEC=opts.debug)
        else:
            # Covers 'bayestraits' and any unrecognised value: nothing would
            # bind asr_table, so stop with a clear message.
            option_parser.error(
                "ASR method '%s' is not implemented for non-parallel "
                "execution." % method)

    # output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp, sep='\t')

    # output the CI file (unless the method is wagner)
    if method != 'wagner':
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp, sep='\t')
def main():
    """Collapse observations by metadata category and write the result.

    Validates ``opts.level``, loads ``opts.input_fp`` with ``load_table``,
    re-parses the JSON-encoded observation metadata for HDF5 inputs,
    collapses the observation axis with the ``make_collapse_f`` function and
    writes either a TSV file or a BIOM table to ``opts.output_fp``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        option_parser.error("level must be greater than zero!")

    category = opts.metadata_category
    collapse_f = make_collapse_f(category, opts.level, opts.ignore)
    table = load_table(opts.input_fp)

    if h5py.is_hdf5(opts.input_fp):
        # metadata are not deserializing correctly. Duct tape it.
        obs_ids = table.ids(axis='observation')
        obs_md = table.metadata(axis='observation')
        update_d = {}
        for i, md in zip(obs_ids, obs_md):
            update_d[i] = {k: json.loads(v[0]) for k, v in md.items()}
        table.add_metadata(update_d, axis='observation')

    result = table.collapse(collapse_f, axis='observation', one_to_many=True,
                            norm=False, one_to_many_md_key=category)

    if not opts.format_tab_delimited:
        format_fs = {category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
        return

    # Context manager closes the file even when to_tsv()/write() raises.
    with open(opts.output_fp, 'w') as f:
        f.write(
            result.to_tsv(header_key=category,
                          header_value=category,
                          metadata_formatter=lambda s: '; '.join(s)))
def main():
    """Collapse a BIOM table by observation metadata and write the result.

    Validates ``opts.level``, parses ``opts.input_fp``, collapses the
    observations with the ``make_collapse_f`` function and writes either the
    delimited text or the BIOM JSON string to ``opts.output_fp``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        # Was ``parser.error`` -- ``parser`` is not defined in this scope, so
        # an invalid level raised NameError instead of a usage error.
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    with open(opts.input_fp) as biom_f:
        table = parse_biom_table(biom_f)
    result = table.collapseObservationsByMetadata(
        collapse_f, one_to_many=True, norm=False,
        one_to_many_md_key=opts.metadata_category)

    # Serialise first so a failure does not leave a truncated output file.
    if opts.format_tab_delimited:
        output_text = result.delimitedSelf(
            header_key=opts.metadata_category,
            header_value=opts.metadata_category,
            metadata_formatter=lambda s: '; '.join(s))
    else:
        output_text = result.getBiomFormatJsonString(
            'picrust %s - categorize_by_function' % __version__)

    with open(opts.output_fp, 'w') as f:
        f.write(output_text)
def main():
    """Normalise an OTU table by per-OTU copy number and write it as BIOM.

    Loads the OTU table (classic tab-delimited when
    ``opts.input_format_classic`` is set, BIOM otherwise) and the count
    table (``.gz`` supported), keeps only the OTUs for which
    ``count_table.sampleExists`` is true, attaches each OTU's rounded copy
    number as observation metadata under ``opts.metadata_identifer`` and
    writes ``normObservationByMetadata``'s result to ``opts.output_otu_fp``.

    Raises:
        ValueError: if a copy number cannot be converted to a number or is
            less than 1.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_ext = path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        with open(opts.input_otu_fp, 'U') as otu_f:
            otu_table = parse_classic_table_to_rich_table(
                otu_f, None, None, None, DenseOTUTable)
    else:
        if input_ext != '.biom':
            sys.stderr.write("\nOTU table does not have '.biom' extension! If loading causes error consider using '-f' option to load tab-delimited OTU table!\n\n")
        with open(opts.input_otu_fp, 'U') as otu_f:
            otu_table = parse_biom_table(otu_f)

    ext = path.splitext(opts.input_count_fp)[1]
    if ext == '.gz':
        count_f = gzip.open(opts.input_count_fp, 'rb')
    else:
        count_f = open(opts.input_count_fp, 'U')
    try:
        count_table = parse_biom_table(count_f)
    finally:
        count_f.close()

    # Need to only keep data relevant to our otu list
    ids = [str(x[1]) for x in otu_table.iterObservations()]

    # Only the first observation row of the count table is consulted.
    ob_id = count_table.ObservationIds[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    filtered_otu_table = table_factory(filtered_values, otu_table.SampleIds,
                                       filtered_otus,
                                       constructor=DenseOTUTable)

    copy_numbers_filtered = {}
    for x in filtered_otus:
        raw_value = count_table.getValueByIds(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(raw_value)))
        except ValueError:
            # The message names the OTU ID, so report the ID (the original
            # substituted the offending value there) alongside the value.
            raise ValueError(
                "Invalid type passed as copy number for OTU ID %s "
                "(value: %r). Must be int-able." % (x, raw_value))
        if value < 1:
            raise ValueError(
                "Copy numbers must be greater than or equal to 1.")
        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)
    normalized_table = filtered_otu_table.normObservationByMetadata(
        opts.metadata_identifer)

    make_output_dir_for_file(opts.output_otu_fp)
    with open(opts.output_otu_fp, 'w') as out_f:
        out_f.write(normalized_table.getBiomFormatJsonString('PICRUST'))
def main():
    """Parse the command line and call ``download_picrust_files``.

    The download target is ``DATA_DIR``; the remaining keyword arguments are
    forwarded unchanged from the parsed options.
    """
    _parser, opts, _positional = parse_command_line_parameters(**script_info)

    download_kwargs = {
        'output_path': DATA_DIR,
        'with_confidence': opts.with_confidence,
        'gg_version': opts.gg_version,
        'type_of_prediction': opts.type_of_prediction,
        'force': opts.force,
    }
    download_picrust_files(**download_kwargs)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) pool_by = opts.pool_by.split(',') #Construct a dict from user specified field order file_name_field_order = {} for i, field in enumerate(opts.field_order.split(',')): file_name_field_order[field] = i if opts.verbose: print "Assuming file names are in this order:", file_name_field_order for k in pool_by: #Check that we're only pooling by values that exist if k not in file_name_field_order.keys(): err_text=\ "Bad value for option '--pool_by'. Can't pool by '%s'. Valid categories are: %s" %(k,\ ",".join(file_name_field_order.keys())) raise ValueError(err_text) if opts.verbose: print "Pooling results by:", pool_by file_name_delimiter = '--' pooled_observations,pooled_expectations = pool_test_dataset_dir(opts.trait_table_dir,\ opts.exp_trait_table_dir,file_name_delimiter=file_name_delimiter,\ file_name_field_order=file_name_field_order,pool_by=pool_by,\ verbose=opts.verbose) #prediction_prefix = 'predict_traits' #expectation_prefix = 'exp_biom_traits' for tag in pooled_observations.keys(): obs_table = pooled_observations[tag] exp_table = pooled_expectations[tag] #obs_table_filename = file_name_delimiter.join([prediction_prefix]+[t for t in tag.split()]) #exp_table_filename = file_name_delimiter.join([expectation_prefix]+[t for t in tag.split()]) obs_table_filename = file_name_delimiter.join(['predict_traits'] + [t for t in tag.split()]) exp_table_filename = file_name_delimiter.join(['exp_biom_table'] + [t for t in tag.split()]) obs_outpath = join(opts.output_dir, obs_table_filename) exp_outpath = join(opts.output_dir, exp_table_filename) print obs_outpath print exp_outpath f = open(obs_outpath, 'w') f.write(obs_table.delimitedSelf()) f.close() f = open(exp_outpath, 'w') f.write(exp_table.delimitedSelf()) f.close()
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ext=path.splitext(opts.input_count_table)[1] if opts.verbose: print "Loading count table: ", opts.input_count_table if (ext == '.gz'): genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb')) else: genome_table = parse_biom_table(open(opts.input_count_table,'U')) make_output_dir_for_file(opts.output_metagenome_table) if opts.accuracy_metrics: # Calculate accuracy metrics #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False) #print "Unweighted NSTI:", unweighted_nsti weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True) samples= weighted_nsti[0] nstis = list(weighted_nsti[1]) #print "Samples:",samples #print "NSTIs:",nstis samples_and_nstis = zip(samples,nstis) #print "Samples and NSTIs:",samples_and_nstis lines = ["#Sample\tMetric\tValue\n"] #print weighted_nsti for sample,nsti in samples_and_nstis: line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti)) lines.append(line) if opts.verbose: for l in sorted(lines): print l if opts.verbose: print "Writing accuracy information to file:", opts.accuracy_metrics open(opts.accuracy_metrics,'w').writelines(sorted(lines)) if opts.verbose: print "Predicting the metagenome..." predicted_metagenomes = predict_metagenomes(otu_table,genome_table) if opts.verbose: print "Writing results to output file: ",opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) if(opts.format_tab_delimited): open(opts.output_metagenome_table,'w').write(predicted_metagenomes.delimitedSelf()) else: open(opts.output_metagenome_table,'w').write(format_biom_table(predicted_metagenomes))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) min_args = 1 if len(args) < min_args: option_parser.error('A BIOM file must be provided.') file_name = args[0] #allow file to be optionally gzipped (must use extension '.gz') ext=splitext(file_name)[1] if (ext == '.gz'): table = parse_biom_table(gzip.open(file_name,'rb')) else: table = parse_biom_table(open(file_name,'U')) metadata_name=opts.metadata if metadata_name is None: max_len_metadata=0 elif table.ObservationMetadata and metadata_name in table.ObservationMetadata[0]: #figure out the longest list within the given metadata max_len_metadata = max(len(p[metadata_name]) for p in table.ObservationMetadata) else: raise ValueError("'"+metadata_name+"' was not found in the BIOM table. Please try changing --metadata to a valid metadata field.") #make the header line header=[] #make simple labels for each level in the metadata (e.g. 'Level_1', 'Level_2', etc.) "+1" for the observation id as well. for i in range(max_len_metadata+1): header.append('Level_'+ str(i+1)) #add the sample ids to the header line header.extend(table.SampleIds) print "\t".join(header) #now process each observation (row in the table) for obs_vals,obs_id,obs_metadata in table.iterObservations(): row=[] if max_len_metadata >0: row=obs_metadata[metadata_name] #Add blanks if the metadata doesn't fill each level if len(row) < max_len_metadata: for i in range(max_len_metadata - len(row)): row.append('') #Add the observation id as the last "Level" row.append(obs_id) #Add count data to the row row.extend(map(str,obs_vals)) print "\t".join(row)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) pool_by = opts.pool_by.split(',') #Construct a dict from user specified field order file_name_field_order = {} for i,field in enumerate(opts.field_order.split(',')): file_name_field_order[field]=i if opts.verbose: print "Assuming file names are in this order:",file_name_field_order for k in pool_by: #Check that we're only pooling by values that exist if k not in file_name_field_order.keys(): err_text=\ "Bad value for option '--pool_by'. Can't pool by '%s'. Valid categories are: %s" %(k,\ ",".join(file_name_field_order.keys())) raise ValueError(err_text) if opts.verbose: print "Pooling results by:",pool_by file_name_delimiter='--' pooled_observations,pooled_expectations = pool_test_dataset_dir(opts.trait_table_dir,\ opts.exp_trait_table_dir,file_name_delimiter=file_name_delimiter,\ file_name_field_order=file_name_field_order,pool_by=pool_by,\ verbose=opts.verbose) #prediction_prefix = 'predict_traits' #expectation_prefix = 'exp_biom_traits' for tag in pooled_observations.keys(): obs_table = pooled_observations[tag] exp_table = pooled_expectations[tag] #obs_table_filename = file_name_delimiter.join([prediction_prefix]+[t for t in tag.split()]) #exp_table_filename = file_name_delimiter.join([expectation_prefix]+[t for t in tag.split()]) obs_table_filename = file_name_delimiter.join(['predict_traits']+[t for t in tag.split()]) exp_table_filename = file_name_delimiter.join(['exp_biom_table']+[t for t in tag.split()]) obs_outpath = join(opts.output_dir,obs_table_filename) exp_outpath = join(opts.output_dir,exp_table_filename) print obs_outpath print exp_outpath f=open(obs_outpath,'w') f.write(obs_table.delimitedSelf()) f.close() f=open(exp_outpath,'w') f.write(exp_table.delimitedSelf()) f.close()
def main():
    """Write ``str(getVariable(opts.input_dir))`` to ``opts.output_dir``.

    Despite the option names, ``opts.output_dir`` is opened as a file.  The
    output handle is managed by ``with`` so it is closed even when
    ``getVariable`` or the write raises.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    fastaInput = opts.input_dir
    fastaOut = opts.output_dir

    with open(fastaOut, 'w') as out:
        onlyVars = getVariable(fastaInput)
        # NOTE(review): this bare subscript has no effect other than raising
        # when the result is empty or not indexable -- looks like a debugging
        # leftover; kept so that behaviour does not change. Confirm intent.
        onlyVars[0]
        # ``writelines`` on a string wrote it character by character; a
        # single ``write`` produces the same bytes.
        out.write(str(onlyVars))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) start_time = datetime.now() t = LoadTree(opts.input_tree) translation_dict = {} for i, tip in enumerate(t.iterTips()): translation_dict[tip.Name] = i single_rate = False #Generate commands telling BayesTraits which nodes to reconstruct bayestraits_commands = make_bayestraits_script(t, translation_dict, comments=False, single_rate=single_rate) #TODO: make this dynamic #Temporarily assuming there is a nexus file available nexus_fp = opts.input_tree.rsplit(".", 1)[0] + ".nexus" command_fp = "./bayestraits_commands.txt" path_to_bayestraits = "../" outfile = "./bayestrait_reconstruction.trait_table" command_file = open(command_fp, "w+") command_file.writelines(bayestraits_commands) command_file.close() command_file = open(command_fp, "U") bayestraits = BayesTraits() bayestraits_result = bayestraits(data=(nexus_fp, opts.input_trait_data, command_fp)) #print "StdOut:",result["StdOut"].read() print "StdErr:", bayestraits_result["StdErr"].read() print "Return code:", bayestraits_result["ExitStatus"] results = parse_reconstruction_output( bayestraits_result['StdOut'].readlines()) #print "Reconstructions:",results #Reconstruction results f = open(outfile, "w+") f.writelines(results) f.close() end_time = datetime.now() print "Start time:", start_time print "End time:", end_time print "Time to reconstruct:", end_time - start_time bayestraits_result.cleanUp()
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) otu_table = parse_biom_table(open(opts.input_path,'U')) tree = DndParser(open(opts.tree_path),UniFracTreeNode) dic = otu_table._data #A = dict_to_csmat(dic) A = dic otus_id = otu_table.ObservationIds if opts.metrics=='unweighted': print unifrac_mix(A,otus_id,tree) if opts.metrics=='weighted': s = sum_dict(dic) print unifrac_mix_weighted(A,otus_id,tree,s)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) otu_table = parse_biom_table(open(opts.input_path, 'U')) tree = DndParser(open(opts.tree_path), UniFracTreeNode) dic = otu_table._data #A = dict_to_csmat(dic) A = dic otus_id = otu_table.ObservationIds if opts.metrics == 'unweighted': print unifrac_mix(A, otus_id, tree) if opts.metrics == 'weighted': s = sum_dict(dic) print unifrac_mix_weighted(A, otus_id, tree, s)
def main():
    """Read a commands file and hand its lines to ``run_commands``.

    Positional arguments: the commands file and a second value passed
    straight through to ``run_commands``.  Output goes to the current
    directory, temporary files are kept.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '+\
                            'it\'s for backwards-compatibility.)')

    min_args = 2
    # NOTE(review): the message says "exactly two" but extra arguments are
    # accepted and ignored; left as-is to stay backward compatible.
    if len(args) < min_args:
        option_parser.error('Exactly two arguments are required.')

    output_dir = './'
    # Read the commands up front so the handle is closed before jobs run.
    with open(args[0]) as commands_f:
        commands = commands_f.readlines()

    run_commands(output_dir, commands, args[1],
                 submit_jobs=opts.submit_jobs,
                 keep_temp=True, num_jobs=opts.num_jobs)
def main():
    """Load the command lines from ``args[0]`` and run them.

    ``args[1]`` is forwarded unchanged to ``run_commands`` together with the
    ``submit_jobs`` and ``num_jobs`` options; results are placed in ``./``
    and temporary files are kept.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '+\
                            'it\'s for backwards-compatibility.)')

    min_args = 2
    # NOTE(review): more than two arguments pass this check even though the
    # message says "exactly"; behaviour kept for compatibility.
    if len(args) < min_args:
        option_parser.error('Exactly two arguments are required.')

    commands_fp = args[0]
    # ``with`` closes the commands file; it used to be left open.
    with open(commands_fp) as commands_f:
        command_lines = commands_f.readlines()

    output_dir = './'
    run_commands(output_dir, command_lines, args[1],
                 submit_jobs=opts.submit_jobs,
                 keep_temp=True, num_jobs=opts.num_jobs)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) start_time = datetime.now() t = LoadTree(opts.input_tree) translation_dict = {} for i,tip in enumerate(t.iterTips()): translation_dict[tip.Name] = i single_rate = False #Generate commands telling BayesTraits which nodes to reconstruct bayestraits_commands = make_bayestraits_script(t,translation_dict,comments=False,single_rate=single_rate) #TODO: make this dynamic #Temporarily assuming there is a nexus file available nexus_fp = opts.input_tree.rsplit(".",1)[0] +".nexus" command_fp = "./bayestraits_commands.txt" path_to_bayestraits = "../" outfile = "./bayestrait_reconstruction.trait_table" command_file = open(command_fp,"w+") command_file.writelines(bayestraits_commands) command_file.close() command_file = open(command_fp,"U") bayestraits=BayesTraits() bayestraits_result = bayestraits(data=(nexus_fp,opts.input_trait_data,command_fp)) #print "StdOut:",result["StdOut"].read() print "StdErr:",bayestraits_result["StdErr"].read() print "Return code:",bayestraits_result["ExitStatus"] results = parse_reconstruction_output(bayestraits_result['StdOut'].readlines()) #print "Reconstructions:",results #Reconstruction results f = open(outfile,"w+") f.writelines(results) f.close() end_time = datetime.now() print "Start time:", start_time print "End time:",end_time print "Time to reconstruct:", end_time - start_time bayestraits_result.cleanUp()
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) tr = parse_newick(open(opts.tree_fp),PhyloNode) tip_dists, all_nodes = tr.tipToTipDistances() # tipTo returns a list of actual node objects all_ids = [node.Name for node in all_nodes] o = open(opts.taxa_fp) group_ids = [i.strip() for i in o.readline().split(',')] o.close() # check that there are at least 2 ids in the group, otherwise the math fails if len(group_ids) < 2: option_parser.error('you must have at least 2 taxa specified' +\ ' in the taxa file or the math will fail.') # make sure specified taxa are in the tree, break at first failure for i in group_ids: try: all_ids.index(i) except ValueError: option_parser.error('Taxa '+i+' not found in the tree. You may'+\ ' have specified an internal node.') if len(all_ids)==len(group_ids): #m ust be the same set of ids if above check passes option_parser.error('The taxa_ids you specified contain every tip'+\ ' in the tree. The NRI and NTI formulas will fail with these values'+\ ' because there is no standard deviation of mpd or mntd, and thus'+\ ' division by zero will occur. In addition, the concept of over/under'+\ ' dispersion of a group of taxa (what NRI/NTI measure) is done in'+\ ' reference to the tree they are a part of. If the group being tested'+\ ' is the entire tree, the idea of over/under dispersion does not make'+\ ' much sense.') # mapping from string of method name to function handle method_lookup = {'nri':nri, 'nti':nti} methods = opts.methods.split(',') for method in methods: if method not in method_lookup: option_parser.error("unknown method: %s; valid methods are: %s" % (method, ', '.join(method_lookup.keys()))) for method in methods: print method+':', method_lookup[method](tip_dists, all_ids, group_ids, iters=opts.iters)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) tr = parse_newick(open(opts.tree_fp),PhyloNode) tip_dists, all_nodes = tr.tipToTipDistances() #all_nodes is list node objs all_ids = [node.Name for node in all_nodes] o = open(opts.taxa_fp) group_ids = [i.strip() for i in o.readline().split(',')] o.close() # check that there are at least 2 ids in the group, otherwise the math fails if len(group_ids) < 2: option_parser.error('Not enough taxa in the taxa file.You must have '+\ ' at least 2 taxa specified' +\ ' in the taxa file or the standard deviation of the distance will '+\ ' be zero, causing both NRI and NTI to fail.') # check that all_ids contains every group_id if not set(group_ids).issubset(all_ids): raise option_parser.error('There are taxa in the taxa file which are '+\ 'not found in the tree. You may have specified an internal node.') # check that all_ids != group_ids if len(all_ids)==len(group_ids): #must be same set if above passes option_parser.error('The taxa_ids you specified contain every tip'+\ ' in the tree. The NRI and NTI formulas will fail '+\ ' because there is no standard deviation of mpd or mntd, and thus'+\ ' division by zero. In addition, the concept of over/under'+\ ' dispersion of a group of taxa (what NRI/NTI measure) is done in'+\ ' reference to the tree they are a part of. If the group being'+\ ' tested is the entire tree, the idea of over/under dispersion '+\ ' makes little sense.') # mapping from string of method name to function handle method_lookup = {'nri':nri, 'nti':nti} methods = opts.methods.split(',') for method in methods: if method not in method_lookup: option_parser.error("Unknown method: %s; valid methods are: %s" % \ (method, ', '.join(method_lookup.keys()))) for method in methods: print method+':', method_lookup[method](tip_dists, all_ids, group_ids, iters=opts.iters)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) suppress_errors = opts.suppress_errors input_fps = [] for input_fp in opts.input_fps.split(','): input_fps.extend(glob(input_fp)) for input_fp in input_fps: i = 0 try: input_f = open(input_fp, 'U') except IOError, e: if suppress_errors: continue else: print input_fp, e for s in MinimalFastaParser(input_f): i += 1 print input_fp, i
def main():
    """Collapse a BIOM table by metadata category and write the result.

    Validates ``opts.level``, parses ``opts.input_fp``, collapses the
    observations using the ``make_collapse_f`` function and writes either
    delimited text (with a "KEGG Pathways" metadata column) or the BIOM JSON
    string to ``opts.output_fp``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        # ``parser`` was never defined here; use the parser that was
        # returned so the user gets a usage error, not a NameError.
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    with open(opts.input_fp) as biom_f:
        table = parse_biom_table(biom_f)
    result = table.collapseObservationsByMetadata(
        collapse_f, one_to_many=True, norm=False,
        one_to_many_md_key=opts.metadata_category)

    with open(opts.output_fp,'w') as f:
        if opts.format_tab_delimited:
            f.write(result.delimitedSelf(
                header_key="KEGG Pathways",
                header_value="KEGG Pathways",
                metadata_formatter=lambda s: '; '.join(s)))
        else:
            f.write(result.getBiomFormatJsonString(
                'picrust %s - categorize_by_function' % __version__))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) suppress_errors = opts.suppress_errors input_fps = [] for input_fp in opts.input_fps.split(','): input_fps.extend(glob(input_fp)) for input_fp in input_fps: i = 0 try: input_f = open(input_fp,'U') except IOError,e: if suppress_errors: continue else: print input_fp, e for s in MinimalFastaParser(input_f): i += 1 print input_fp, i
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) if opts.verbose: print "Loading sequencing depth table: ", opts.input_seq_depth_file scaling_factors = {} for sample_id, depth in parse_seq_count_file(open(opts.input_seq_depth_file, "U")): scaling_factors[sample_id] = depth if opts.verbose: print "Loading count table: ", opts.input_count_table genome_table = load_table(opts.input_count_table) if opts.verbose: print "Scaling the metagenome..." scaled_metagenomes = scale_metagenomes(genome_table, scaling_factors) if opts.verbose: print "Writing results to output file: ", opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
def main():
    """Collapse a table's observation axis by a metadata category.

    Validates ``opts.level``, loads ``opts.input_fp`` with ``load_table``,
    collapses it with the ``make_collapse_f`` function and writes TSV or
    BIOM output to ``opts.output_fp``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        # Fixed: ``parser`` is undefined in this scope (NameError); the
        # parser object is ``option_parser``.
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)
    result = table.collapse(collapse_f, axis='observation', one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)

    if opts.format_tab_delimited:
        # ``with`` closes the file even if serialisation fails midway.
        with open(opts.output_fp,'w') as f:
            f.write(result.to_tsv(header_key=opts.metadata_category,
                                  header_value=opts.metadata_category,
                                  metadata_formatter=lambda s: '; '.join(s)))
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading sequencing depth table: ",opts.input_seq_depth_file scaling_factors = {} for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')): scaling_factors[sample_id]=depth if opts.verbose: print "Loading count table: ", opts.input_count_table genome_table = load_table(opts.input_count_table) if opts.verbose: print "Scaling the metagenome..." scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors) if opts.verbose: print "Writing results to output file: ",opts.output_metagenome_table make_output_dir_for_file(opts.output_metagenome_table) write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
def main():
    """Parse the command line and hand the parsed options to ``count``."""
    parsed = parse_command_line_parameters(**script_info)
    # The parser returns (option_parser, opts, args); only opts is used.
    _parser, opts, _args = parsed
    count(opts)
def main(): # Parse input to get parameters option_parser, opts, args =\ parse_command_line_parameters(**script_info) tree_file = opts.input_tree trait_table_fp = opts.input_trait_table verbose = opts.verbose #Set output base file names trait_table_base = 'trait_table.tab' pruned_tree_base = 'pruned_tree.newick' reference_tree_base = 'reference_tree.newick' output_dir = make_output_dir(opts.output_dir,strict=False) output_table_fp = join(output_dir,trait_table_base) output_tree_fp = join(output_dir,pruned_tree_base) output_reference_tree_fp = join(output_dir,reference_tree_base) #Handle parameters with more complex defaults delimiter_map = {"space":" ","tab":"\t","comma":","} input_delimiter = delimiter_map[opts.input_table_delimiter] output_delimiter = delimiter_map[opts.output_table_delimiter] if verbose: print "Running with options:" print "\t%s:%s" %("Tree file",tree_file) print "\t%s:%s" %("Trait table",trait_table_fp) print "\t%s:%s" %("Output tree",output_tree_fp) print "\t%s:%s" %("Output reference tree",output_reference_tree_fp) print "\t%s:%s" %("Output trait table",output_table_fp) print "\t%s:%s" %("Add branch length to root",opts.add_branch_length_to_root) print "\t%s:%s" %("Convert to NEXUS?",opts.convert_to_nexus) print "\t%s:%s" %("Input trait table delimiter",opts.input_table_delimiter) print "\t%s:%s" %("Output trait table delimiter",opts.output_table_delimiter) # Begin reformatting root_name = "root" if opts.no_minimum_branch_length: min_branch_length = None else: min_branch_length = 0.0001 #Load inputs if verbose: print "Loading tree...." input_tree = DndParser(open(tree_file)) if verbose: print "Loading trait table..." trait_table = open(trait_table_fp,"U") trait_table_lines = trait_table.readlines() if not trait_table_lines: raise IOError("No lines could be loaded from file %s. Please check the input file." 
%trait_table_fp) #Get id mappings from mapping file if opts.tree_to_trait_mapping: if verbose: print "Loading tree to trait table mapping file..." mapping_file = open(opts.tree_to_trait_mapping,"U") trait_to_tree_mapping =\ make_id_mapping_dict(parse_id_mapping_file(mapping_file)) else: if verbose: print "No tree to trait mapping file specified. Assuming tree tip names and trait table names will match exactly." trait_to_tree_mapping = None # Call reformatting function using specified parameters # to get reference tree if opts.verbose: print """**BUILDING REFERENCE TREE (without respect to trait table)**""" new_reference_tree, not_useful_trait_table_lines =\ reformat_tree_and_trait_table(\ tree=input_tree,\ trait_table_lines = [],\ trait_to_tree_mapping = None,\ input_trait_table_delimiter= None,\ output_trait_table_delimiter= None,\ filter_table_by_tree_tips=False,\ convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=False,\ convert_to_bifurcating=True,\ add_branch_length_to_root=False,\ name_unnamed_nodes=True,\ min_branch_length=min_branch_length,\ verbose=opts.verbose) #Make a copy new_reference_tree_copy=new_reference_tree.deepcopy() if opts.verbose: print """**BUILDING PRUNED TREE AND TRAIT TABLE**""" # Call reformatting function using specified parameters new_tree, new_trait_table_lines = \ reformat_tree_and_trait_table(tree=new_reference_tree_copy,\ trait_table_lines = trait_table_lines,\ trait_to_tree_mapping = trait_to_tree_mapping,\ input_trait_table_delimiter= input_delimiter,\ output_trait_table_delimiter=output_delimiter,\ filter_table_by_tree_tips=True,\ convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=True,\ convert_to_bifurcating=False,\ add_branch_length_to_root=False,\ name_unnamed_nodes=False,\ min_branch_length=min_branch_length,\ verbose=opts.verbose) #Alter reference tree to only contain tips in OTU table (and of course trait table) if opts.limit_tree_to_otus_fp: if opts.verbose: print "Pruning reference tree to 
contain only tips in OTU table (and trait table)...." otu_table = open(opts.limit_tree_to_otus_fp,"U") otu_table_lines = otu_table.readlines() header_line,otu_table_fields =parse_trait_table(otu_table_lines,delimiter = input_delimiter,has_header=False) header_line,trait_table_fields =\ parse_trait_table(new_trait_table_lines,delimiter = input_delimiter) tips_to_keep = list(otu_table_fields) + list(trait_table_fields) tips_to_keep_in_tree = filter_table_by_presence_in_tree(new_reference_tree_copy,tips_to_keep) new_reference_tree = filter_tree_tips_by_presence_in_table(new_reference_tree_copy,\ tips_to_keep_in_tree,verbose=opts.verbose) if opts.verbose: print "Almost finished. Writing trees and trait table to files..." #Write results to files # Open output files output_trait_table_file = open(output_table_fp,"w+") output_tree_file = open(output_tree_fp,"w+") output_reference_tree_file = open(output_reference_tree_fp,"w+") #Output trait table file if opts.verbose: print "Writing trait table to:", output_table_fp output_trait_table_file.write("\n".join(new_trait_table_lines)) trait_table.close() output_trait_table_file.close() #Output tree file if opts.verbose: print "Writing pruned tree to:", output_tree_fp if opts.convert_to_nexus is True: lines = nexus_lines_from_tree(new_tree) output_tree_file.write("\n".join(map(str,lines))) else: output_tree_file.write(new_tree.getNewick(with_distances=True)) output_tree_file.close() if opts.verbose: print "Writing reference tree to:", output_reference_tree_fp #Output reference tree file output_reference_tree_file.write(new_reference_tree.getNewick(with_distances=True)) output_reference_tree_file.close()
script_info['script_description'] = """If input_tgz has one file: extract it an rename it as output_path. If input_tgz has multiple files: extract them in a directory named output_path. If input_tgz is not a tgz file (must be a file, not a directory): rename the input file as output_path"""
script_info['script_usage'] = [
    ("Example:",
     "Extract the content of the tgz file named 'in.tgz' into the directory 'out_dir'",
     "%prog -i in.tgz -o out_dir")]
script_info['output_description'] = ""
script_info['required_options'] = [
    make_option('-i', '--input_tgz', type="existing_filepath",
                help='File path for the tgz file to uncompress'),
    make_option('-o', '--output_path', type="new_path",
                help='Path where to extract the contents of the tgz file')
]
script_info['optional_options'] = []
script_info['version'] = __version__


def extract_if_is_tgz(tgz_fp, output_path):
    """Extract ``tgz_fp`` into ``output_path``, or copy it on ERROR_MSG.

    Calls ``extract_from_tgz``. If that raises a ValueError whose message
    equals ``ERROR_MSG``, ``tgz_fp`` is copied to ``output_path`` instead
    (presumably the "input is not a tgz file" case from the script
    description - confirm against ``extract_from_tgz``).

    Raises:
        ValueError: any other ValueError from ``extract_from_tgz`` is
            re-raised unchanged.
    """
    try:
        extract_from_tgz(tgz_fp, output_path)
    except ValueError as e:
        if str(e) == ERROR_MSG:
            copyfile(tgz_fp, output_path)
        else:
            # A bare ``raise`` re-raises the same exception and keeps its
            # original traceback; ``raise ValueError, e`` restarted the
            # traceback at this line.
            raise


if __name__ == '__main__':
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    tgz_fp = opts.input_tgz
    output_path = opts.output_path
    extract_if_is_tgz(tgz_fp, output_path)
def main(): # Parse input to get parameters option_parser, opts, args =\ parse_command_line_parameters(**script_info) tree_file = opts.input_tree trait_table_fp = opts.input_trait_table verbose = opts.verbose #Set output base file names trait_table_base = 'trait_table.tab' pruned_tree_base = 'pruned_tree.newick' reference_tree_base = 'reference_tree.newick' output_dir = make_output_dir(opts.output_dir, strict=False) output_table_fp = join(output_dir, trait_table_base) output_tree_fp = join(output_dir, pruned_tree_base) output_reference_tree_fp = join(output_dir, reference_tree_base) #Handle parameters with more complex defaults delimiter_map = {"space": " ", "tab": "\t", "comma": ","} input_delimiter = delimiter_map[opts.input_table_delimiter] output_delimiter = delimiter_map[opts.output_table_delimiter] if verbose: print "Running with options:" print "\t%s:%s" % ("Tree file", tree_file) print "\t%s:%s" % ("Trait table", trait_table_fp) print "\t%s:%s" % ("Output tree", output_tree_fp) print "\t%s:%s" % ("Output reference tree", output_reference_tree_fp) print "\t%s:%s" % ("Output trait table", output_table_fp) print "\t%s:%s" % ("Add branch length to root", opts.add_branch_length_to_root) print "\t%s:%s" % ("Convert to NEXUS?", opts.convert_to_nexus) print "\t%s:%s" % ("Input trait table delimiter", opts.input_table_delimiter) print "\t%s:%s" % ("Output trait table delimiter", opts.output_table_delimiter) # Begin reformatting root_name = "root" #format_for_bayestraits = True #TODO: this will become a new function in the bayestraits app controller #if format_for_bayestraits: # convert_to_nexus = True # convert_to_bifurcating = True # filter_table_by_tree_tips = True # filter_tree_by_table_entries = True # enforce_min_branch_length = True # convert_trait_floats_to_ints = True if opts.no_minimum_branch_length: min_branch_length = None else: min_branch_length = 0.0001 #Load inputs if verbose: print "Loading tree...." 
input_tree = DndParser(open(tree_file)) #input_tree =DndParser(open(tree_file), constructor=PicrustNode) #input_tree = load_picrust_tree(opts.input_tree,opts.verbose) if verbose: print "Loading trait table..." trait_table = open(trait_table_fp, "U") trait_table_lines = trait_table.readlines() if not trait_table_lines: raise IOError( "No lines could be loaded from file %s. Please check the input file." % trait_table_fp) #Get id mappings from mapping file if opts.tree_to_trait_mapping: if verbose: print "Loading tree to trait table mapping file..." mapping_file = open(opts.tree_to_trait_mapping, "U") trait_to_tree_mapping =\ make_id_mapping_dict(parse_id_mapping_file(mapping_file)) else: if verbose: print "No tree to trait mapping file specified. Assuming tree tip names and trait table names will match exactly." trait_to_tree_mapping = None # Call reformatting function using specified parameters # to get reference tree if opts.verbose: print """**BUILDING REFERENCE TREE (without respect to trait table)**""" new_reference_tree, not_useful_trait_table_lines =\ reformat_tree_and_trait_table(\ tree=input_tree,\ trait_table_lines = [],\ trait_to_tree_mapping = None,\ input_trait_table_delimiter= None,\ output_trait_table_delimiter= None,\ filter_table_by_tree_tips=False,\ convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=False,\ convert_to_bifurcating=True,\ add_branch_length_to_root=False,\ name_unnamed_nodes=True,\ min_branch_length=min_branch_length,\ verbose=opts.verbose) #Make a copy new_reference_tree_copy = new_reference_tree.deepcopy() if opts.verbose: print """**BUILDING PRUNED TREE AND TRAIT TABLE**""" # Call reformatting function using specified parameters new_tree, new_trait_table_lines = \ reformat_tree_and_trait_table(tree=new_reference_tree_copy,\ trait_table_lines = trait_table_lines,\ trait_to_tree_mapping = trait_to_tree_mapping,\ input_trait_table_delimiter= input_delimiter,\ output_trait_table_delimiter=output_delimiter,\ 
filter_table_by_tree_tips=True,\ convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=True,\ convert_to_bifurcating=False,\ add_branch_length_to_root=False,\ name_unnamed_nodes=False,\ min_branch_length=min_branch_length,\ verbose=opts.verbose) #Alter reference tree to only contain tips in OTU table (and of course trait table) if opts.limit_tree_to_otus_fp: if opts.verbose: print "Pruning reference tree to contain only tips in OTU table (and trait table)...." otu_table = open(opts.limit_tree_to_otus_fp, "U") otu_table_lines = otu_table.readlines() header_line, otu_table_fields = parse_trait_table( otu_table_lines, delimiter=input_delimiter, has_header=False) header_line,trait_table_fields =\ parse_trait_table(new_trait_table_lines,delimiter = input_delimiter) tips_to_keep = list(otu_table_fields) + list(trait_table_fields) tips_to_keep_in_tree = filter_table_by_presence_in_tree( new_reference_tree_copy, tips_to_keep) new_reference_tree = filter_tree_tips_by_presence_in_table(new_reference_tree_copy,\ tips_to_keep_in_tree,verbose=opts.verbose) if opts.verbose: print "Almost finished. Writing trees and trait table to files..." 
#Write results to files # Open output files output_trait_table_file = open(output_table_fp, "w+") output_tree_file = open(output_tree_fp, "w+") output_reference_tree_file = open(output_reference_tree_fp, "w+") #Output trait table file if opts.verbose: print "Writing trait table to:", output_table_fp output_trait_table_file.write("\n".join(new_trait_table_lines)) trait_table.close() output_trait_table_file.close() #Output tree file if opts.verbose: print "Writing pruned tree to:", output_tree_fp if opts.convert_to_nexus is True: lines = nexus_lines_from_tree(new_tree) output_tree_file.write("\n".join(map(str, lines))) else: output_tree_file.write(new_tree.getNewick(with_distances=True)) output_tree_file.close() if opts.verbose: print "Writing reference tree to:", output_reference_tree_fp #Output reference tree file output_reference_tree_file.write( new_reference_tree.getNewick(with_distances=True)) output_reference_tree_file.close()
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) tmp_dir = 'jobs/' make_output_dir(tmp_dir) #Run the jobs script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py') if (opts.parallel_method == 'sge'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs_sge.py') elif (opts.parallel_method == 'multithreaded'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs.py') elif (opts.parallel_method == 'torque'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs_torque.py') else: raise RuntimeError if (opts.verbose): print "Loading tree..." tree = load_picrust_tree(opts.tree, opts.verbose) all_tips = [tip.Name for tip in tree.tips()] if (opts.verbose): print "Total number of possible tips to predict: {0}".format( len(all_tips)) created_tmp_files = [] output_files = {} output_files['counts'] = [] if opts.reconstruction_confidence: output_files['variances'] = [] output_files['upper_CI'] = [] output_files['lower_CI'] = [] if opts.already_calculated: all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated) if opts.verbose: print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format( len(all_tips)) #create a tmp file to store the job commands (which we will pass to our parallel script to run) jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_') jobs = open(jobs_fp, 'w') created_tmp_files.append(jobs_fp) if (opts.verbose): print "Creating temporary input files in: ", tmp_dir num_tips_per_job = 1000 for tips_to_predict in [ all_tips[i:i + num_tips_per_job] for i in range(0, len(all_tips), num_tips_per_job) ]: #create tmp output files tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='out_predict_traits_') output_files['counts'].append(tmp_output_fp) tip_to_predict_str = ','.join(list(tips_to_predict)) if opts.reconstruction_confidence: outfile_base, extension = 
splitext(tmp_output_fp) output_files['variances'].append(outfile_base + "_variances.tab") output_files['upper_CI'].append(outfile_base + "_upper_CI.tab") output_files['lower_CI'].append(outfile_base + "_lower_CI.tab") #create the job command cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format( script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, opts.reconstruction_confidence, tip_to_predict_str, tmp_output_fp) else: cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format( script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, tip_to_predict_str, tmp_output_fp) #NOTE: Calculating NSTI this way is convenient, #but would probably be faster if we ran the NSTI calculation separate (using the --output_accuracy_metrics_only) and added it to the output file later on. if opts.calculate_accuracy_metrics: cmd = cmd + " -a" #add job command to the the jobs file jobs.write(cmd + "\n") jobs.close() #add all output files to tmp list (used later for deletion) for predict_type in output_files: created_tmp_files.extend(output_files[predict_type]) if (opts.verbose): print "Launching parallel jobs." #run the job command job_prefix = 'picrust' submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs, delay=opts.delay) if (opts.verbose): print "Jobs are now running. Will wait until finished." #wait until all jobs finished (e.g. simple poller) wait_for_output_files(output_files['counts']) if (opts.verbose): print "Jobs are done running." 
make_output_dir_for_file(opts.output_trait_table) outfile_base, extension = splitext(opts.output_trait_table) for predict_type in sorted(output_files): #Combine output files if opts.verbose: print "Combining all output files for " + predict_type combined_predictions = combine_predict_trait_output( output_files[predict_type]) if opts.verbose: print "Writing combined file for " + predict_type if predict_type == 'counts': #Output in whatever format the user wants if opts.output_precalc_file_in_biom: open(opts.output_trait_table, 'w').write( format_biom_table( convert_precalc_to_biom(combined_predictions))) else: open(opts.output_trait_table, 'w').write(combined_predictions) else: if opts.output_precalc_file_in_biom: open(outfile_base + "_" + predict_type + ".biom", 'w').write( format_biom_table( convert_precalc_to_biom(combined_predictions))) else: open(outfile_base + "_" + predict_type + ".tab", 'w').write(combined_predictions) #clean up all tmp files for file in created_tmp_files: remove(file)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.limit_to_function: limit_to_functions = opts.limit_to_function.split(',') if opts.verbose: print "Limiting output to only functions:",limit_to_functions else: limit_to_functions = [] if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ids_to_load = otu_table.ObservationIds if(opts.input_count_table is None): #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz) precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz']) input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name) else: input_count_table=opts.input_count_table if opts.verbose: print "Loading trait table: ", input_count_table ext=path.splitext(input_count_table)[1] if opts.verbose: print "Loading count table: ", input_count_table if (ext == '.gz'): genome_table_fh = gzip.open(input_count_table,'rb') else: genome_table_fh = open(input_count_table,'U') #In the genome/trait table genomes are the samples and #genes are the observations if opts.load_precalc_file_in_biom: if not opts.suppress_subset_loading: #Now we want to use the OTU table information #to load only rows in the count table corresponding #to relevant OTUs if opts.verbose: print "Loading traits for %i organisms from the trait table" %len(ids_to_load) genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples') else: if opts.verbose: print "Loading *full* count table because --suppress_subset_loading was passed. 
This may result in high memory usage" genome_table = parse_biom_table(genome_table_fh.read()) else: genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load) partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions) output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes]) if opts.verbose: print "Writing results to output file: ",opts.output_fp make_output_dir_for_file(opts.output_fp) open(opts.output_fp,'w').write(output_text)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading tree from file:", opts.tree # Load Tree #tree = LoadTree(opts.tree) tree = load_picrust_tree(opts.tree, opts.verbose) table_headers =[] traits={} #load the asr trait table using the previous list of functions to order the arrays if opts.reconstructed_trait_table: table_headers,traits =\ update_trait_dict_from_file(opts.reconstructed_trait_table) #Only load confidence intervals on the reconstruction #If we actually have ASR values in the analysis if opts.reconstruction_confidence: if opts.verbose: print "Loading ASR confidence data from file:",\ opts.reconstruction_confidence asr_confidence_output = open(opts.reconstruction_confidence) asr_min_vals,asr_max_vals, params,column_mapping =\ parse_asr_confidence_output(asr_confidence_output) brownian_motion_parameter = params['sigma'][0] brownian_motion_error = params['sigma'][1] if opts.verbose: print "Done. Loaded %i confidence interval values." %(len(asr_max_vals)) print "Brownian motion parameter:",brownian_motion_parameter else: brownian_motion_parameter = None #load the trait table into a dict with organism names as keys and arrays as functions table_headers,genome_traits =\ update_trait_dict_from_file(opts.observed_trait_table,table_headers) #Combine the trait tables overwriting the asr ones if they exist in the genome trait table. traits.update(genome_traits) # Specify the attribute where we'll store the reconstructions trait_label = "Reconstruction" if opts.verbose: print "Assigning traits to tree..." # Decorate tree using the traits tree = assign_traits_to_tree(traits,tree, trait_label=trait_label) if opts.reconstruction_confidence: if opts.verbose: print "Assigning trait confidence intervals to tree..." 
tree = assign_traits_to_tree(asr_min_vals,tree,\ trait_label="lower_bound") tree = assign_traits_to_tree(asr_max_vals,tree,\ trait_label="upper_bound") if opts.verbose: print "Collecting list of nodes to predict..." #Start by predict all tip nodes. nodes_to_predict = [tip.Name for tip in tree.tips()] if opts.verbose: print "Found %i nodes to predict." % len(nodes_to_predict) if opts.limit_predictions_to_organisms: organism_id_str = opts.limit_predictions_to_organisms ok_organism_ids = organism_id_str.split(',') ok_organism_ids = [n.strip() for n in ok_organism_ids] for f in set_label_conversion_fns(True,True): ok_organism_ids = [f(i) for i in ok_organism_ids] if opts.verbose: print "Limiting predictions to user-specified ids:",\ ",".join(ok_organism_ids) if not ok_organism_ids: raise RuntimeError(\ "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\ % opts.limit_predictions_to_organisms) nodes_to_predict =\ [n for n in nodes_to_predict if n in ok_organism_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the commmand-line and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0])) if opts.verbose: print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict)) if opts.limit_predictions_by_otu_table: if opts.verbose: print "Limiting predictions to ids in user-specified OTU table:",\ opts.limit_predictions_by_otu_table otu_table = open(opts.limit_predictions_by_otu_table,"U") #Parse OTU table for ids otu_ids =\ extract_ids_from_table(otu_table.readlines(),delimiter="\t") if not otu_ids: raise RuntimeError(\ "Found no valid ids in input OTU table: %s. 
Is the path correct?"\ % opts.limit_predictions_by_otu_table) nodes_to_predict =\ [n for n in nodes_to_predict if n in otu_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by OTU table resulted in an empty set of nodes to predict. Are the OTU ids and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0])) if opts.verbose: print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict)) # Calculate accuracy of PICRUST for the given tree, sequenced genomes # and set of ndoes to predict accuracy_metrics = ['NSTI'] accuracy_metric_results = None if opts.output_accuracy_metrics: if opts.verbose: print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)]) accuracy_metric_results = {} if 'NSTI' in accuracy_metrics: nsti_result,min_distances =\ calc_nearest_sequenced_taxon_index(tree,\ limit_to_tips = nodes_to_predict,\ trait_label = trait_label, verbose=opts.verbose) #accuracy_metric_results['NSTI'] = nsti_result for organism in min_distances.keys(): accuracy_metric_results[organism] = {'NSTI': min_distances[organism]} if opts.verbose: print "NSTI:", nsti_result #Write accuracy metrics to file if opts.verbose: print "Writing accuracy metrics to file:",opts.output_accuracy_metrics f = open(opts.output_accuracy_metrics,'w+') lines = ["metric\torganism\tvalue\n"] for organism in accuracy_metric_results.keys(): for metric in accuracy_metric_results[organism].keys(): lines.append('\t'.join([metric,organism,\ str(accuracy_metric_results[organism][metric])])+'\n') f.writelines(sorted(lines)) f.close() if opts.verbose: print "Generating predictions using method:",opts.prediction_method if opts.weighting_method == 'exponential': #For now, use exponential weighting weight_fn = make_neg_exponential_weight_fn(e) elif opts.weighting_method == 'linear': #Linear weight function weight_fn = linear_weight elif opts.weighting_method == 'equal_weight': 
weight_fn = equal_weight variances=None #Overwritten by methods that calc variance if opts.prediction_method == 'asr_and_weighting': if opts.reconstruction_confidence: # Perform predictions using reconstructed ancestral states predictions,variances =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ lower_bound_trait_label="lower_bound",\ upper_bound_trait_label="upper_bound",\ calc_confidence_intervals = True,\ brownian_motion_parameter=brownian_motion_parameter,\ use_self_in_prediction = True,\ weight_fn =weight_fn,verbose=opts.verbose) else: predictions =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ use_self_in_prediction = True,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'weighting_only': #Ignore ancestral information predictions =\ weighted_average_tip_prediction(tree,nodes_to_predict,\ trait_label=trait_label,\ use_self_in_prediction = True,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'nearest_neighbor': predictions = predict_nearest_neighbor(tree,nodes_to_predict,\ trait_label=trait_label,\ use_self_in_prediction = True, tips_only = True) elif opts.prediction_method == 'random_neighbor': predictions = predict_random_neighbor(tree,\ nodes_to_predict,trait_label=trait_label,\ use_self_in_prediction = True) else: error_template =\ "Prediction method '%s' is not supported. Valid methods are: %s'" error_text = error_template %(opts.prediction_method,\ ", ".join(METHOD_CHOICES)) if opts.verbose: print "Converting results to .biom format for output..." 
#convert to biom format (and transpose) biom_predictions=biom_table_from_predictions(predictions,table_headers) #In the .biom table, organisms are 'samples' and traits are 'observations #(by analogy with a metagenomic sample) #Therefore, we associate the trait variances with the per-observation metadata #print "variances:",variances #print "BIOM observations:", [o for o in biom_predictions.iterObservations()] #print "BIOM samples:", [s for s in biom_predictions.iterSamples()] if variances is not None: if opts.verbose: print "Adding variance information to output .biom table, as per-observation metadata with key 'variance'..." biom_predictions.addSampleMetadata(variances) if accuracy_metric_results is not None: if opts.verbose: print "Adding accuracy metrics (%s) to biom table as per-observation metadata..." %(",".join(accuracy_metrics)) biom_predictions.addSampleMetadata(accuracy_metric_results) #Add variance information as per observation metadata if opts.verbose: print "Writing biom format prediction results to file: ",opts.output_trait_table #write biom table to file make_output_dir_for_file(opts.output_trait_table) open(opts.output_trait_table,'w').write(\ format_biom_table(biom_predictions))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) pool_by = opts.pool_by.split(',') #create output directory make_output_dir(opts.output_dir) #Construct a dict from user specified field order file_name_field_order = {} for i,field in enumerate(opts.field_order.split(',')): file_name_field_order[field]=i if opts.verbose: print "Assuming file names are in this order:",file_name_field_order for k in pool_by: #Check that we're only pooling by values that exist if k not in file_name_field_order.keys(): err_text=\ "Bad value for option '--pool_by'. Can't pool by '%s'. Valid categories are: %s" %(k,\ ",".join(file_name_field_order.keys())) raise ValueError(err_text) if opts.verbose: print "Pooling results by:",pool_by roc_success_criteria = ['binary','exact','int_exact'] scatter_lines,correlation_lines,roc_result_lines,roc_auc_lines =\ evaluate_test_dataset_dir(opts.trait_table_dir,\ opts.exp_trait_table_dir,file_name_delimiter="--",\ file_name_field_order=file_name_field_order,pool_by=pool_by,\ roc_success_criteria=roc_success_criteria,verbose=opts.verbose) #Output scatter data output_fp = join(opts.output_dir,'evaluation_scatter_data.tab') if opts.verbose: print "Writing scatter plot data to:",output_fp file_lines = scatter_lines f = open(output_fp,"w+") f.writelines(file_lines) f.close() #Output correlation data output_fp = join(opts.output_dir,'evaluation_correlation_data.tab') if opts.verbose: print "Writing correlation data to:",output_fp file_lines = correlation_lines f = open(output_fp,"w+") f.writelines(file_lines) f.close() #Output raw ROC plot data if opts.verbose: print "Writing ROC data..." 
for c in roc_result_lines.keys(): output_fp = join(opts.output_dir,'evaluation_roc_data_%s.tab' %c) if opts.verbose: print "Outputting ROC data for success criterion %s to: %s" %(c,output_fp) file_lines = roc_result_lines[c] f = open(output_fp,"w+") f.writelines(file_lines) f.close() #Output summary ROC AUC data if opts.verbose: print "Writing ROC AUC data..." for c in roc_auc_lines.keys(): output_fp = join(opts.output_dir,'evaluation_roc_auc_data_%s.tab' %c) file_lines = roc_auc_lines[c] if opts.verbose: print "Outputting ROC AUC data for success criterion %s to: %s" %(c,output_fp) f = open(output_fp,"w+") f.writelines(file_lines) f.close()
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) #set some defaults for the options input_dir = opts.input_dir output_dir = opts.output_dir or input_dir tmp_dir = opts.tmp_dir or output_dir parallel_method = opts.parallel_method asr_method = opts.asr_method predict_traits_method = opts.prediction_method if opts.num_jobs > 20 and parallel_method == 'multithreaded': raise ValueError( 'You probably dont want to run multithreaded evaluations with a large num_jobs. Please adjust options num_jobs and or parallel_method' ) if opts.with_confidence and asr_method not in ['ace_ml', 'ace_reml']: raise ValueError( "PICRUST currently only supports confidence intervals with the ace_ml and ace_reml ASR methods" ) if opts.verbose: print "Reconstruction method:", asr_method print "Prediction method:", predict_traits_method print "Parallel method:", parallel_method print "num_jobs:", opts.num_jobs print "\nOutput will be saved here:'%s'" % output_dir #create the output directory unless it already exists make_output_dir(output_dir) if (parallel_method == 'sge'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs_sge.py') elif (parallel_method == 'multithreaded'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs.py') elif (parallel_method == 'torque'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs_torque.py') else: raise RuntimeError #get the test datasets to run in the input directory (based on exp_traits files) expect_test_files = glob(join(input_dir, 'exp_traits--*')) test_datasets = {} for file_name in expect_test_files: test_id = file_name.replace(join(input_dir, 'exp_traits--'), '', 1) #create a dict with the test files as values in the ref list test_datasets[test_id] = [ join(input_dir, 'test_trait_table--' + test_id), join(input_dir, 'test_tree--' + test_id), join(input_dir, 'exp_traits--' + test_id) ] created_tmp_files = [] output_files = [] 
#create a tmp file to store the job commands (which we will pass to our parallel script to run) jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_') jobs = open(jobs_fp, 'w') created_tmp_files.append(jobs_fp) #get location of scripts we need to run asr_script_fp = join(get_picrust_project_dir(), 'scripts', 'ancestral_state_reconstruction.py') predict_traits_script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py') #run each test dataset through the pipeline for test_id in test_datasets: asr_out_fp = join(output_dir, 'asr--' + asr_method + '--' + test_id) asr_params_out_fp = join( output_dir, '--'.join(['asr', asr_method, 'asr_params', test_id])) created_tmp_files.append(asr_out_fp) if opts.check_for_null_files and exists( asr_out_fp) and file_contains_nulls(asr_out_fp): #remove file if opts.verbose: print "Existing ASR file contains null characters. Will run ASR again after removing: " + asr_out_fp remove(asr_out_fp) if exists(asr_out_fp) and not opts.force: if opts.verbose: print "Output file: {0} already exists, so we will skip it.".format( asr_out_fp) asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" % ( test_id, asr_out_fp) else: #create the asr command asr_cmd = """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format( asr_script_fp, test_datasets[test_id][0], test_datasets[test_id][1], asr_method, asr_out_fp, asr_params_out_fp) predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\ opts.weighting_method,test_id])) if opts.with_accuracy: predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\ opts.weighting_method,'accuracy_metrics',test_id])) if opts.check_for_null_files and exists( predict_traits_out_fp) and file_contains_nulls( predict_traits_out_fp): if opts.verbose: print "Existing trait predictions file contains null characters. 
Will run it again after removing: " + predict_traits_out_fp remove(predict_traits_out_fp) if exists(predict_traits_out_fp) and not opts.force: if opts.verbose: print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format( predict_traits_out_fp) continue output_files.append(predict_traits_out_fp) genome_id = split('--', test_id)[2] if predict_traits_method == 'nearest_neighbor': #don't do asr step predict_traits_cmd = """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format( predict_traits_script_fp, test_datasets[test_id][0], opts.ref_tree, genome_id, predict_traits_out_fp, predict_traits_method) jobs.write(predict_traits_cmd + "\n") else: #create the predict traits command predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\ test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method) #Instruct predict_traits to use confidence intervals output by ASR if opts.with_confidence: confidence_param = ' -c "%s"' % (asr_params_out_fp) predict_traits_cmd = predict_traits_cmd + confidence_param #Instruct predict traits to output the NTSI measure of distance to #nearby sequences. if opts.with_accuracy: accuracy_param = ' -a "%s"' % (predict_traits_accuracy_out_fp) predict_traits_cmd = predict_traits_cmd + accuracy_param #add job command to the the jobs file jobs.write(asr_cmd + ';' + predict_traits_cmd + "\n") jobs.close() #created_tmp_files.extend(output_files) #submit the jobs job_prefix = 'eval_' if opts.verbose: print "Submitting jobs:", cluster_jobs_fp, jobs_fp, job_prefix, opts.num_jobs submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs)
def main():
    """Generate test trees and trait tables for each dataset yielded by the
    distance-based test-data generator.

    For every yielded (distance, tree, tip, expected traits, remaining trait
    rows) tuple, four artifacts are written under ``opts.output_dir``: the
    test tree (via ``write_tree``), a tab-delimited expected-trait table, a
    BIOM-format transposed copy of that table, and the test trait table with
    the expected row removed. File base names are '--'-joined fields.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading trait table..."
    # NOTE(review): this handle is never explicitly closed; it must stay open
    # at least until the parsed fields are turned into a list further down.
    input_trait_table = open(opts.input_trait_table, "U")

    if opts.verbose:
        print "Loading tree..."
    #PicrustNode seems to run into very slow/memory intensive performance...
    #tree = DndParser(open(opts.input_tree),constructor=PicrustNode)
    tree = DndParser(open(opts.input_tree))

    if opts.verbose:
        print "Parsing trait table..."
    #Find which taxa are to be used in tests
    #(by default trait table taxa)
    trait_table_header,trait_table_fields = \
        parse_trait_table(input_trait_table)

    if opts.verbose:
        print "Ensuring tree and trait table labels are formatted consistently..."

    # The same label conversion functions are applied to the tree and to the
    # trait table entries.
    label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)

    fix_tree_labels(tree, label_conversion_fns)

    trait_table_fields = convert_trait_table_entries(trait_table_fields,\
        value_conversion_fns = [],\
        label_conversion_fns = label_conversion_fns)

    # Build a concrete list (presumably the value above is a lazy iterator —
    # confirm) so it can be scanned here and passed on afterwards.
    trait_table_fields = [t for t in trait_table_fields]
    # Printed unconditionally, regardless of opts.verbose.
    print "Number of trait table fields with single quotes:",\
        len([t for t in trait_table_fields if "'" in t[0]])

    if opts.verbose:
        print "Making output directory..."
    make_output_dir(opts.output_dir)

    # included_tips is either a list of tip names or the sentinel False
    # (meaning "no restriction"); the loop below tests `is not False`.
    if opts.limit_to_tips:
        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print "Limiting test datasets to %i tips: %s" % (
                len(included_tips), included_tips)
    else:
        included_tips = False

    # Dispatch table: --method value -> factory for the tree-modifying function.
    method_fns =\
        {"exclude_tips_by_distance":\
            make_distance_based_exclusion_fn,\
         "randomize_tip_labels_by_distance":\
            make_distance_based_tip_label_randomizer
        }

    # An unknown opts.method raises KeyError here.
    test_fn_factory = method_fns[opts.method]

    if opts.verbose:
        print "Setting tree modification method to:", opts.method
        print "(%s)" % test_fn_factory.__doc__

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print "Suppressing modification of tree when making test datasets"
        modify_tree = False

    if opts.verbose:
        print "Starting generation of test datsets"

    test_datasets = \
        yield_genome_test_data_by_distance(tree,trait_table_fields,\
        test_fn_factory,min_dist = opts.min_dist,\
        max_dist=opts.max_dist,increment=opts.dist_increment,\
        modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose)

    if opts.verbose:
        print "Writing files for test datasets"

    for curr_dist,test_tree,tip_to_predict,\
        expected_traits,test_trait_table_fields in test_datasets:

        # Skip tips outside the user-specified subset, if one was given.
        if included_tips is not False:
            if tip_to_predict not in included_tips:
                if opts.verbose:
                    print "Skipping tip %s: limiting to tip(s): %s" % (
                        tip_to_predict, included_tips)
                continue

        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS
        safe_tip_to_predict = "'%s'" % tip_to_predict

        #Write tree
        base_name = "--".join(map(str, ["test_tree", opts.method, curr_dist]))
        curr_filepath = write_tree(opts.output_dir, base_name, test_tree,
                                   safe_tip_to_predict)
        if opts.verbose:
            print "Wrote test tree to: %s" % curr_filepath

        #Write expected trait table
        base_name = "--".join(
            map(str,
                ["exp_traits", opts.method, curr_dist, safe_tip_to_predict]))

        # Header line followed by the single expected-traits row.
        exp_trait_table_lines = [trait_table_header]
        exp_trait_table_lines.append("\t".join(expected_traits) + "\n")
        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print "Writing expected trait table to:", filename

        f = open(filename, "w")
        f.write("".join(exp_trait_table_lines))
        f.close()

        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table
        # but more elegant, direct methods (directly feeding data to biom's table_factory)
        # weren't working for me readily.   In the future, we should streamline this process
        # Leaving as is for now since this code is mostly for developers so speed/elegance
        # are probably not essential here.

        #Let the hackishness begin

        #Reload the tab-delimited trait table
        # (re-reads the file that was written and closed just above)
        header, fields = parse_trait_table(open(filename, "U"))
        fields = [f for f in fields]  #converts generator to list

        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines =\
            transpose_trait_table_fields(fields,header,\
            id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t")

        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(
            ["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        #Write BIOM format expected trait table
        base_name = "--".join(
            map(str, [
                "exp_biom_traits", opts.method, curr_dist, safe_tip_to_predict
            ]))

        expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\
            table_format = "tab-delimited")

        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print "Writing BIOM-format expected trait table to:", filename

        f = open(filename, "w")
        f.write(format_biom_table(expected_biom_table))
        f.close()

        #Write test trait table
        # NOTE(review): the next assignment is a no-op (self-assignment).
        test_trait_table_fields = test_trait_table_fields
        # Drop the expected row, if present, so the test table does not
        # contain the answer; this mutates the yielded list in place.
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(
            ["\t".join(r) + "\n" for r in test_trait_table_fields])

        #print "Test_trait_table_lines:",test_trait_table_lines
        base_name = "--".join(
            map(str, [
                "test_trait_table", opts.method, curr_dist, safe_tip_to_predict
            ]))
        filename = os.path.join(opts.output_dir, base_name)

        if opts.verbose:
            print "Writing test trait table to:", filename

        f = open(filename, "w")
        f.write("".join(test_trait_table_lines))
        f.close()

    if opts.verbose:
        print "Done generating test datasets"
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) input_ext = path.splitext(opts.input_otu_fp)[1] if opts.input_format_classic: otu_table = parse_classic_table_to_rich_table( open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable) else: try: otu_table = parse_biom_table(open(opts.input_otu_fp, 'U')) except ValueError: raise ValueError( "Error loading OTU table! If not in BIOM format use '-f' option.\n" ) ids_to_load = otu_table.ObservationIds if (opts.input_count_fp is None): #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz) precalc_file_name = '_'.join( ['16S', opts.gg_version, 'precalculated.tab.gz']) input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', precalc_file_name) else: input_count_table = opts.input_count_fp if opts.verbose: print "Loading trait table: ", input_count_table ext = path.splitext(input_count_table)[1] if (ext == '.gz'): count_table_fh = gzip.open(input_count_table, 'rb') else: count_table_fh = open(input_count_table, 'U') if opts.load_precalc_file_in_biom: count_table = parse_biom_table(count_table_fh.read()) else: count_table = convert_precalc_to_biom(count_table_fh, ids_to_load) #Need to only keep data relevant to our otu list ids = [] for x in otu_table.iterObservations(): ids.append(str(x[1])) ob_id = count_table.ObservationIds[0] filtered_otus = [] filtered_values = [] for x in ids: if count_table.sampleExists(x): filtered_otus.append(x) filtered_values.append(otu_table.observationData(x)) #filtered_values = map(list,zip(*filtered_values)) filtered_otu_table = table_factory(filtered_values, otu_table.SampleIds, filtered_otus, constructor=DenseOTUTable) copy_numbers_filtered = {} for x in filtered_otus: value = count_table.getValueByIds(ob_id, x) try: #data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError,\ "Invalid type passed as copy number for OTU ID %s. Must be int-able." 
% (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." copy_numbers_filtered[x] = {opts.metadata_identifer: value} filtered_otu_table.addObservationMetadata(copy_numbers_filtered) normalized_table = filtered_otu_table.normObservationByMetadata( opts.metadata_identifer) #move Observation Metadata from original to filtered OTU table normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'ObservationMetadata') normalized_otu_table = transfer_sample_metadata(otu_table, normalized_table, 'SampleMetadata') make_output_dir_for_file(opts.output_otu_fp) open(opts.output_otu_fp, 'w').write(format_biom_table(normalized_table))
def main():
    """Parse the command line, then print the PICRUSt configuration."""
    # The parsed values are not used afterwards; parsing is still performed
    # so that command-line handling runs before the report is printed.
    parsed = parse_command_line_parameters(**script_info)
    _parser, _opts, _args = parsed

    print_picrust_config()
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if (opts.suppress_unit_tests and opts.suppress_script_usage_tests): option_parser.error("You're suppressing both test types. Nothing to run.") test_dir = abspath(dirname(__file__)) unittest_good_pattern = re.compile('OK\s*$') application_not_found_pattern = re.compile('ApplicationNotFoundError') python_name = 'python' bad_tests = [] missing_application_tests = [] # Run through all of PICRUSt's unit tests, and keep track of any files which # fail unit tests. if not opts.suppress_unit_tests: unittest_names = [] if not opts.unittest_glob: for root, dirs, files in walk(test_dir): for name in files: if name.startswith('test_') and name.endswith('.py'): unittest_names.append(join(root,name)) else: for fp in glob(opts.unittest_glob): fn = split(fp)[1] if fn.startswith('test_') and fn.endswith('.py'): unittest_names.append(abspath(fp)) unittest_names.sort() for unittest_name in unittest_names: print "Testing %s:\n" % unittest_name command = '%s %s -v' % (python_name, unittest_name) stdout, stderr, return_value = system_call(command) print stderr if not unittest_good_pattern.search(stderr): if application_not_found_pattern.search(stderr): missing_application_tests.append(unittest_name) else: bad_tests.append(unittest_name) if not opts.suppress_script_usage_tests: try: from qiime.test import run_script_usage_tests except ImportError: print "QIIME not installed so not running script tests." 
opts.suppress_script_usage_tests=True else: test_data_dir = join(get_picrust_project_dir(),'picrust_test_data') scripts_dir = join(get_picrust_project_dir(),'scripts') if opts.script_usage_tests != None: script_usage_tests = opts.script_usage_tests.split(',') else: script_usage_tests = None # Run the script usage testing functionality script_usage_result_summary, num_script_usage_example_failures = \ run_script_usage_tests( test_data_dir=test_data_dir, scripts_dir=scripts_dir, working_dir='/tmp/', verbose=True, tests=script_usage_tests, force_overwrite=True, timeout=300) print "==============\nResult summary\n==============" if not opts.suppress_unit_tests: print "\nUnit test result summary\n------------------------\n" if bad_tests: print "\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests) if missing_application_tests: print "\nFailed the following unit tests, in part or whole due "+\ "to missing external applications.\nDepending on the PICRUSt features "+\ "you plan to use, this may not be critical.\n%s"\ % '\n'.join(missing_application_tests) if not (missing_application_tests or bad_tests): print "\nAll unit tests passed.\n\n" if not opts.suppress_script_usage_tests: print "\nScript usage test result summary\n------------------------------------\n" print script_usage_result_summary print "" # If script usage tests weren't suppressed,we can't have any failures. script_usage_tests_success = (opts.suppress_script_usage_tests or num_script_usage_example_failures == 0) # If any of the unit tests or script usage tests fail, or if we have any # missing application errors, use return code 1 (as python's unittest # module does to indicate one or more failures). return_code = 1 if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and script_usage_tests_success): return_code = 0 return return_code
def main(): # INPUT COMMAND LINE OPTIONS voption_parser, opts, args =\ parse_command_line_parameters(**script_info) ########### VOCAL ########### if opts.verbose: print script_info['script_description'] print "####################################### INITIALIZATION #####################################" print date_print() print "Verbose: " + str(opts.verbose) print "PWD: " + os.getcwd() ########## OUTPUT ########### # If output folder specified: make output folder if opts.output_fp: out_bool, out_path = dir_pipe(opts.output_fp, overwrite=opts.overwrite) if (out_bool == True) and (opts.verbose == True): print "Output: " + dir_fullpath(out_path) # print output elif opts.verbose == True: print "Output: Could not be created: " + out_path # if not found then ### PRINT MODULES ### #ref_modules(justModuleIn=True, justFunctionIn=True) ### PRINT FUNCTIONS ### #ref_modules(justModuleIn=False, justFunctionIn=True) ### PRINT FUNCTIONS & DESCRIPTIONS ### #ref_modules(justModuleIn=False, justFunctionIn=False) ########## INPUT ############ if opts.verbose: print "########################################## INPUT [-i] ######################################" input_input_fps = input_pipe(opts.input_fps, optTitle="[-i --input_fps]", verboseIn=opts.verbose) ######### CUSTOM ############ if opts.verbose: print "########################################### CUSTOM #########################################" if input_input_fps: print_enumlist(input_input_fps) # GET PANDAS FILES IMPORTED # pdInputs = [] for inF in input_input_fps: pdInputs.append(pd_ui(inF)) # STOCHASTIC FIRST, THEN HOST-MICROBE TREE COMPARISON # pd_summarize(pdInputs[0], locIn="Stochatic Comparison") pd_summarize(pdInputs[1], locIn="Host-Microbe Comparison") metricIn = "MatchingCluster" hostMicrobeScore = pdInputs[1][metricIn][0] print print "Host-Microbe Score: " + str(hostMicrobeScore) print print "Better Score: " + str( len(pdInputs[0][pdInputs[0][metricIn] < hostMicrobeScore])) print "Worse Score: " + str( 
len(pdInputs[0][pdInputs[0][metricIn] > hostMicrobeScore])) print "Equiv Score: " + str( len(pdInputs[0][pdInputs[0][metricIn] == hostMicrobeScore])) print "P-value better: " + str( float(len(pdInputs[0][pdInputs[0][metricIn] < hostMicrobeScore])) / 100000.0) print print "Better\Equal Score: " + str( len(pdInputs[0][pdInputs[0][metricIn] <= hostMicrobeScore])) print "Worse Score: " + str( len(pdInputs[0][pdInputs[0][metricIn] > hostMicrobeScore])) print "P-value Better/Equal: " + str( float(len(pdInputs[0][pdInputs[0][metricIn] <= hostMicrobeScore])) / 100000.0) print print "Max Stochastic Metric: " + str(max(pdInputs[0][metricIn])) ############################# if opts.verbose: print "############################################ END ###########################################" print "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\//////////////////////////////////////////////"
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) #if we specify we want NSTI only then we have to calculate it first if opts.output_accuracy_metrics_only: opts.calculate_accuracy_metrics=True if opts.verbose: print "Loading tree from file:", opts.tree if opts.no_round: round_opt = False else: round_opt = True # Load Tree tree = load_picrust_tree(opts.tree, opts.verbose) table_headers=[] traits={} #load the asr trait table using the previous list of functions to order the arrays if opts.reconstructed_trait_table: table_headers,traits =\ update_trait_dict_from_file(opts.reconstructed_trait_table) #Only load confidence intervals on the reconstruction #If we actually have ASR values in the analysis if opts.reconstruction_confidence: if opts.verbose: print "Loading ASR confidence data from file:",\ opts.reconstruction_confidence print "Assuming confidence data is of type:",opts.confidence_format asr_confidence_output = open(opts.reconstruction_confidence) asr_min_vals,asr_max_vals, params,column_mapping =\ parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format) if 'sigma' in params: brownian_motion_parameter = params['sigma'][0] else: brownian_motion_parameter = None if opts.verbose: print "Done. Loaded %i confidence interval values." %(len(asr_max_vals)) print "Brownian motion parameter:",brownian_motion_parameter else: brownian_motion_parameter = None #load the trait table into a dict with organism names as keys and arrays as functions table_headers,genome_traits =\ update_trait_dict_from_file(opts.observed_trait_table,table_headers) #Combine the trait tables overwriting the asr ones if they exist in the genome trait table. traits.update(genome_traits) # Specify the attribute where we'll store the reconstructions trait_label = "Reconstruction" if opts.verbose: print "Assigning traits to tree..." 
# Decorate tree using the traits tree = assign_traits_to_tree(traits,tree, trait_label=trait_label) if opts.reconstruction_confidence: if opts.verbose: print "Assigning trait confidence intervals to tree..." tree = assign_traits_to_tree(asr_min_vals,tree,\ trait_label="lower_bound") tree = assign_traits_to_tree(asr_max_vals,tree,\ trait_label="upper_bound") if brownian_motion_parameter is None: if opts.verbose: print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..." brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\ upper_bound_trait_label="upper_bound",\ lower_bound_trait_label="lower_bound",\ trait_label=trait_label,\ confidence=0.95) if opts.verbose: print "Inferred the following rate parameters:",brownian_motion_parameter if opts.verbose: print "Collecting list of nodes to predict..." #Start by predict all tip nodes. nodes_to_predict = [tip.Name for tip in tree.tips()] if opts.verbose: print "Found %i nodes to predict." % len(nodes_to_predict) if opts.limit_predictions_to_organisms: organism_id_str = opts.limit_predictions_to_organisms ok_organism_ids = organism_id_str.split(',') ok_organism_ids = [n.strip() for n in ok_organism_ids] for f in set_label_conversion_fns(True,True): ok_organism_ids = [f(i) for i in ok_organism_ids] if opts.verbose: print "Limiting predictions to user-specified ids:",\ ",".join(ok_organism_ids) if not ok_organism_ids: raise RuntimeError(\ "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\ % opts.limit_predictions_to_organisms) nodes_to_predict =\ [n for n in nodes_to_predict if n in ok_organism_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the commmand-line and tree ids in the same format? 
Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0])) if opts.verbose: print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict)) if opts.limit_predictions_by_otu_table: if opts.verbose: print "Limiting predictions to ids in user-specified OTU table:",\ opts.limit_predictions_by_otu_table otu_table = open(opts.limit_predictions_by_otu_table,"U") #Parse OTU table for ids otu_ids =\ extract_ids_from_table(otu_table.readlines(),delimiter="\t") if not otu_ids: raise RuntimeError(\ "Found no valid ids in input OTU table: %s. Is the path correct?"\ % opts.limit_predictions_by_otu_table) nodes_to_predict =\ [n for n in nodes_to_predict if n in otu_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by OTU table resulted in an empty set of nodes to predict. Are the OTU ids and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0])) if opts.verbose: print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict)) # Calculate accuracy of PICRUST for the given tree, sequenced genomes # and set of ndoes to predict accuracy_metrics = ['NSTI'] accuracy_metric_results = None if opts.calculate_accuracy_metrics: if opts.verbose: print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)]) accuracy_metric_results = {} if 'NSTI' in accuracy_metrics: nsti_result,min_distances =\ calc_nearest_sequenced_taxon_index(tree,\ limit_to_tips = nodes_to_predict,\ trait_label = trait_label, verbose=opts.verbose) #accuracy_metric_results['NSTI'] = nsti_result for organism in min_distances.keys(): accuracy_metric_results[organism] = {'NSTI': min_distances[organism]} if opts.verbose: print "NSTI:", nsti_result if opts.output_accuracy_metrics_only: #Write accuracy metrics to file if opts.verbose: print "Writing accuracy metrics to 
file:",opts.output_accuracy_metrics f = open(opts.output_accuracy_metrics_only,'w+') f.write("metric\torganism\tvalue\n") lines =[] for organism in accuracy_metric_results.keys(): for metric in accuracy_metric_results[organism].keys(): lines.append('\t'.join([metric,organism,\ str(accuracy_metric_results[organism][metric])])+'\n') f.writelines(sorted(lines)) f.close() exit() if opts.verbose: print "Generating predictions using method:",opts.prediction_method if opts.weighting_method == 'exponential': #For now, use exponential weighting weight_fn = make_neg_exponential_weight_fn(e) variances=None #Overwritten by methods that calc variance confidence_intervals=None #Overwritten by methods that calc variance if opts.prediction_method == 'asr_and_weighting': # Perform predictions using reconstructed ancestral states if opts.reconstruction_confidence: predictions,variances,confidence_intervals =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ lower_bound_trait_label="lower_bound",\ upper_bound_trait_label="upper_bound",\ calc_confidence_intervals = True,\ brownian_motion_parameter=brownian_motion_parameter,\ weight_fn=weight_fn,verbose=opts.verbose, round_predictions=round_opt) else: predictions =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ weight_fn =weight_fn,verbose=opts.verbose, round_predictions=round_opt) elif opts.prediction_method == 'weighting_only': #Ignore ancestral information predictions =\ weighted_average_tip_prediction(tree,nodes_to_predict,\ trait_label=trait_label,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'nearest_neighbor': predictions = predict_nearest_neighbor(tree,nodes_to_predict,\ trait_label=trait_label,tips_only = True) elif opts.prediction_method == 'random_neighbor': predictions = predict_random_neighbor(tree,\ nodes_to_predict,trait_label=trait_label) if opts.verbose: print "Done making predictions." 
make_output_dir_for_file(opts.output_trait_table) out_fh=open(opts.output_trait_table,'w') #Generate the table of biom predictions if opts.verbose: print "Converting results to .biom format for output..." biom_predictions=biom_table_from_predictions(predictions,table_headers,\ observation_metadata=None,\ sample_metadata=accuracy_metric_results,convert_to_int=False) if opts.verbose: print "Writing prediction results to file: ",opts.output_trait_table if opts.output_precalc_file_in_biom: #write biom table to file write_biom_table(biom_predictions, opts.output_trait_table) else: #convert to precalc (tab-delimited) format out_fh = open(opts.output_trait_table, 'w') out_fh.write(convert_biom_to_precalc(biom_predictions)) out_fh.close() #Write out variance information to file if variances: if opts.verbose: print "Converting variances to BIOM format" if opts.output_precalc_file_in_biom: suffix='.biom' else: suffix='.tab' biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base,extension = splitext(opts.output_trait_table) variance_outfile = outfile_base+"_variances"+suffix make_output_dir_for_file(variance_outfile) if opts.verbose: print "Writing variance information to file:",variance_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_variances, variance_outfile) else: open(variance_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_variances)) if confidence_intervals: if opts.verbose: print "Converting upper confidence interval values to BIOM format" biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base,extension = splitext(opts.output_trait_table) upper_CI_outfile = outfile_base+"_upper_CI"+suffix 
make_output_dir_for_file(upper_CI_outfile) if opts.verbose: print "Writing upper confidence limit information to file:",upper_CI_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_upper_CI, upper_CI_outfile) else: open(upper_CI_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_upper_CI)) biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base,extension = splitext(opts.output_trait_table) lower_CI_outfile = outfile_base+"_lower_CI"+suffix make_output_dir_for_file(lower_CI_outfile) if opts.verbose: print "Writing lower confidence limit information to file",lower_CI_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_lower_CI, lower_CI_outfile) else: open(lower_CI_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_lower_CI))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading OTU table: ", opts.input_otu_table otu_table = load_table(opts.input_otu_table) ids_to_load = otu_table.ids(axis='observation').tolist() # Determine whether user wants predictions round to nearest whole # number or not. if opts.no_round: round_flag = False else: round_flag = True if opts.verbose: print "Done loading OTU table containing %i samples and %i OTUs." \ %(len(otu_table.ids()),len(otu_table.ids(axis='observation'))) #Hardcoded loaction of the precalculated datasets for PICRUSt, #relative to the project directory precalc_data_dir = join(get_picrust_project_dir(), 'picrust', 'data') # Load a table of gene counts by OTUs. #This can be either user-specified or precalculated genome_table_fp = determine_data_table_fp(precalc_data_dir,\ opts.type_of_prediction,opts.gg_version,\ user_specified_table=opts.input_count_table,verbose=opts.verbose) if opts.verbose: print "Loading gene count data from file: %s" % genome_table_fp genome_table= load_data_table(genome_table_fp,\ load_data_table_in_biom=opts.load_precalc_file_in_biom,\ suppress_subset_loading=opts.suppress_subset_loading,\ ids_to_load=ids_to_load,verbose=opts.verbose,transpose=True) if opts.verbose: print "Loaded %i genes across %i OTUs from gene count table" \ %(len(genome_table.ids(axis='observation')),len(genome_table.ids())) if opts.with_confidence: if opts.input_variance_table: variance_table_fp = opts.input_variance_table else: variance_table_fp = determine_data_table_fp(precalc_data_dir,\ opts.type_of_prediction,opts.gg_version,\ precalc_file_suffix='precalculated_variances.tab.gz',\ user_specified_table=opts.input_count_table) if opts.verbose: print "Loading variance information from table: %s" \ %variance_table_fp variance_table= load_data_table(variance_table_fp,\ load_data_table_in_biom=opts.load_precalc_file_in_biom,\ suppress_subset_loading=opts.suppress_subset_loading,\ 
ids_to_load=ids_to_load,transpose=True) if opts.verbose: print "Loaded %i genes across %i OTUs from variance table" \ %(len(variance_table.ids(axis='observation')),len(variance_table.ids())) #Raise an error if the genome table and variance table differ #in the genomes they contain. #better to find out now than have something obscure happen latter on if opts.verbose: print "Checking that genome table and variance table are consistent" try: assert set(variance_table.ids(axis='observation')) == set( genome_table.ids(axis='observation')) except AssertionError, e: for var_id in variance_table.ids(axis='observation'): if var_id not in genome_table.ids(axis='observation'): print "Variance table ObsId %s not in genome_table ObsIds" % var_id raise AssertionError( "Variance table and genome table contain different gene ids") try: assert set(variance_table.ids()) == set(genome_table.ids()) except AssertionError, e: for var_id in variance_table.ids(): if var_id not in genome_table.ids(): print "Variance table SampleId %s not in genome_table SampleIds" % var_id raise AssertionError( "Variance table and genome table contain different OTU ids")
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if (opts.suppress_unit_tests and opts.suppress_script_usage_tests): option_parser.error("You're suppressing both test types. Nothing to run.") test_dir = abspath(dirname(__file__)) unittest_good_pattern = re.compile('OK\s*$') application_not_found_pattern = re.compile('ApplicationNotFoundError') python_name = 'python' bad_tests = [] missing_application_tests = [] # Run through all of PICRUSt's unit tests, and keep track of any files which # fail unit tests. if not opts.suppress_unit_tests: unittest_names = [] if not opts.unittest_glob: for root, dirs, files in walk(test_dir): for name in files: if name.startswith('test_') and name.endswith('.py'): unittest_names.append(join(root,name)) else: for fp in glob(opts.unittest_glob): fn = split(fp)[1] if fn.startswith('test_') and fn.endswith('.py'): unittest_names.append(abspath(fp)) unittest_names.sort() for unittest_name in unittest_names: print "Testing %s:\n" % unittest_name command = '%s %s -v' % (python_name, unittest_name) stdout, stderr, return_value = system_call(command) print stderr if not unittest_good_pattern.search(stderr): if application_not_found_pattern.search(stderr): missing_application_tests.append(unittest_name) else: bad_tests.append(unittest_name) if not opts.suppress_script_usage_tests: try: from qiime.test import run_script_usage_tests except ImportError: print "QIIME not installed so not running script tests." 
opts.suppress_script_usage_tests=True else: test_data_dir = join(get_picrust_project_dir(),'picrust_test_data') scripts_dir = join(get_picrust_project_dir(),'scripts') if opts.script_usage_tests != None: script_usage_tests = opts.script_usage_tests.split(',') else: script_usage_tests = None # Run the script usage testing functionality script_usage_result_summary, num_script_usage_example_failures = \ run_script_usage_tests( qiime_test_data_dir=test_data_dir, qiime_scripts_dir=scripts_dir, working_dir='/tmp/', verbose=True, tests=script_usage_tests, failure_log_fp=None, force_overwrite=True) print "==============\nResult summary\n==============" if not opts.suppress_unit_tests: print "\nUnit test result summary\n------------------------\n" if bad_tests: print "\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests) if missing_application_tests: print "\nFailed the following unit tests, in part or whole due "+\ "to missing external applications.\nDepending on the PICRUSt features "+\ "you plan to use, this may not be critical.\n%s"\ % '\n'.join(missing_application_tests) if not (missing_application_tests or bad_tests): print "\nAll unit tests passed.\n\n" if not opts.suppress_script_usage_tests: print "\nScript usage test result summary\n------------------------------------\n" print script_usage_result_summary print "" # If script usage tests weren't suppressed,we can't have any failures. script_usage_tests_success = (opts.suppress_script_usage_tests or num_script_usage_example_failures == 0) # If any of the unit tests or script usage tests fail, or if we have any # missing application errors, use return code 1 (as python's unittest # module does to indicate one or more failures). return_code = 1 if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and script_usage_tests_success): return_code = 0 return return_code
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) verbose = opts.verbose min_args = 1 if len(args) < min_args: option_parser.error( 'One or more predicted biom files must be provided.') observed_files = args make_output_dir_for_file(opts.output_fp) out_fh = open(opts.output_fp, 'w') if verbose: print "Loading expected trait table file:", opts.exp_trait_table_fp exp_table = load_table(opts.exp_trait_table_fp) header_printed = False header_keys = [] delimiter = "\t" for observed_file in observed_files: observed_file_name = basename(observed_file) if verbose: print "Loading predicted trait table file:", observed_file_name obs_table = load_table(observed_file) if opts.compare_observations: if verbose: print "Transposing tables to allow evaluation of observations (instead of samples)..." obs_table = obs_table.transpose() exp_table = exp_table.transpose() if verbose: print "Matching predicted and expected tables..." obs, exp = match_biom_tables( obs_table, exp_table, verbose=verbose, limit_to_expected_observations=opts.limit_to_expected_observations, limit_to_observed_observations=opts.limit_to_observed_observations, normalize=opts.normalize, shuffle_samples=opts.shuffle_samples) if verbose: print "Calculating accuracy stats for all observations..." 
#import pdb; pdb.set_trace() for i in obs: if verbose: print "Calculating stats for: ", i if opts.not_relative_abundance_scores: results = calculate_accuracy_stats_from_observations( obs[i], exp[i], success_criterion='binary') else: results = calculate_accuracy_stats_from_observations( obs[i], exp[i], success_criterion='ra_exact') #If first pass then print out header if not header_printed: header_printed = True header_keys = sorted(results.keys()) out_fh.write( delimiter.join(['file', 'label'] + header_keys) + "\n") #print results using same order as header values = [observed_file_name, i ] + ['{0:.3g}'.format(results[x]) for x in header_keys] out_str = delimiter.join(map(str, values)) + "\n" out_fh.write(out_str)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading OTU table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ids_to_load = otu_table.ObservationIds if opts.verbose: print "Done loading OTU table containing %i samples and %i OTUs." \ %(len(otu_table.SampleIds),len(otu_table.ObservationIds)) #Hardcoded loaction of the precalculated datasets for PICRUSt, #relative to the project directory precalc_data_dir=join(get_picrust_project_dir(),'picrust','data') # Load a table of gene counts by OTUs. #This can be either user-specified or precalculated genome_table_fp = determine_data_table_fp(precalc_data_dir,\ opts.type_of_prediction,opts.gg_version,\ user_specified_table=opts.input_count_table,verbose=opts.verbose) if opts.verbose: print "Loading gene count data from file: %s" %genome_table_fp genome_table= load_data_table(genome_table_fp,\ load_data_table_in_biom=opts.load_precalc_file_in_biom,\ suppress_subset_loading=opts.suppress_subset_loading,\ ids_to_load=ids_to_load,verbose=opts.verbose,transpose=True) if opts.verbose: print "Loaded %i genes across %i OTUs from gene count table" \ %(len(genome_table.ObservationIds),len(genome_table.SampleIds)) if opts.with_confidence: if opts.input_variance_table: variance_table_fp = opts.input_variance_table else: variance_table_fp = determine_data_table_fp(precalc_data_dir,\ opts.type_of_prediction,opts.gg_version,\ precalc_file_suffix='precalculated_variances.tab.gz',\ user_specified_table=opts.input_count_table) if opts.verbose: print "Loading variance information from table: %s" \ %variance_table_fp variance_table= load_data_table(variance_table_fp,\ load_data_table_in_biom=opts.load_precalc_file_in_biom,\ suppress_subset_loading=opts.suppress_subset_loading,\ ids_to_load=ids_to_load,transpose=True) if opts.verbose: print "Loaded %i genes across %i OTUs from variance table" \ 
%(len(variance_table.ObservationIds),len(variance_table.SampleIds)) #Raise an error if the genome table and variance table differ #in the genomes they contain. #better to find out now than have something obscure happen latter on if opts.verbose: print "Checking that genome table and variance table are consistent" try: assert set(variance_table.ObservationIds) == set(genome_table.ObservationIds) except AssertionError,e: for var_id in variance_table.ObservationIds: if var_id not in genome_table.ObservationIds: print "Variance table ObsId %s not in genome_table ObsIds" %var_id raise AssertionError("Variance table and genome table contain different gene ids") try: assert set(variance_table.SampleIds) == set(genome_table.SampleIds) except AssertionError,e: for var_id in variance_table.SampleIds: if var_id not in genome_table.SampleIds: print "Variance table SampleId %s not in genome_table SampleIds" %var_id raise AssertionError("Variance table and genome table contain different OTU ids")
from optparse import OptionParser, make_option

# Command line options used when the plain optparse parser is selected
# (see the `cogent_cl_parsing` switch below).
options = [
    make_option('-i','--biom_fp',type="string",
                help='the BIological Observation Matrix filepath'),
    make_option('-a','--axis', type='string',
                help="The axis to subset over, either 'samples' or 'observations'"),
    make_option('-s','--ids_fp',type="string",
                help="A file containing a single column of IDs to retain"),
    make_option('-o','--output_fp',type="string",
                help="A file to write the result to")
]

if __name__ == '__main__':
    # `cogent_cl_parsing` and `script_info` are not defined in this fragment;
    # presumably they are set where the option-parsing backend is imported --
    # TODO confirm.
    if cogent_cl_parsing:
        option_parser, opts, args =\
            parse_command_line_parameters(**script_info)
    else:
        parser = OptionParser(option_list=options)
        opts, args = parser.parse_args()

    # One id per line of the ids file, surrounding whitespace stripped.
    ids = [l.strip() for l in open(opts.ids_fp)]
    # The whole BIOM file is held in memory as a single string.
    biom_str = open(opts.biom_fp).read()

    idxs, new_axis_md = get_axis_indices(biom_str, ids, opts.axis)
    new_data = direct_slice_data(biom_str, idxs, opts.axis)
    # NOTE(review): none of the three file handles opened here is closed in
    # the visible code.
    output = open(opts.output_fp,'w')

    # multiple walks over the file. bad form, but easy right now
    # ...should add a yield_and_ignore parser or something.
    # The output is written piece by piece, starting with the opening brace
    # and the "id" key taken from the input string.
    output.write('{')
    output.write(direct_parse_key(biom_str, "id"))
def main(): """Generate test trees given parameters""" option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading trait table..." input_trait_table = open(opts.input_trait_table,"U") if opts.verbose: print "Loading tree..." #PicrustNode seems to run into very slow/memory intentsive perfromance... #tree = DndParser(open(opts.input_tree),constructor=PicrustNode) tree = DndParser(open(opts.input_tree)) if opts.verbose: print "Parsing trait table..." #Find which taxa are to be used in tests #(by default trait table taxa) trait_table_header,trait_table_fields = \ parse_trait_table(input_trait_table) if opts.verbose: print "Ensuring tree and trait table labels are formatted consistently..." label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose) fix_tree_labels(tree,label_conversion_fns) trait_table_fields = convert_trait_table_entries(trait_table_fields,\ value_conversion_fns = [],\ label_conversion_fns = label_conversion_fns) trait_table_fields = [t for t in trait_table_fields] print "Number of trait table fields with single quotes:",\ len([t for t in trait_table_fields if "'" in t[0]]) if opts.verbose: print "Making output directory..." 
make_output_dir(opts.output_dir) if opts.limit_to_tips: included_tips = opts.limit_to_tips.split(",") if opts.verbose: print "Limiting test datasets to %i tips: %s" %(len(included_tips),included_tips) else: included_tips = False method_fns =\ {"exclude_tips_by_distance":\ make_distance_based_exclusion_fn,\ "randomize_tip_labels_by_distance":\ make_distance_based_tip_label_randomizer } test_fn_factory = method_fns[opts.method] if opts.verbose: print "Setting tree modification method to:", opts.method print "(%s)" % test_fn_factory.__doc__ modify_tree = True if opts.suppress_tree_modification: if opts.verbose: print "Suppressing modification of tree when making test datasets" modify_tree = False if opts.verbose: print "Starting generation of test datsets" test_datasets = \ yield_genome_test_data_by_distance(tree,trait_table_fields,\ test_fn_factory,min_dist = opts.min_dist,\ max_dist=opts.max_dist,increment=opts.dist_increment,\ modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose) if opts.verbose: print "Writing files for test datasets" for curr_dist,test_tree,tip_to_predict,\ expected_traits,test_trait_table_fields in test_datasets: if included_tips is not False: if tip_to_predict not in included_tips: if opts.verbose: print "Skipping tip %s: limiting to tip(s): %s" %(tip_to_predict,included_tips) continue #Make a safe version of tip to predict # So odd characters like | don't mess up OS safe_tip_to_predict = "'%s'"%tip_to_predict #Write tree base_name = "--".join(map(str,["test_tree",opts.method,curr_dist])) curr_filepath = write_tree(opts.output_dir,base_name,test_tree,safe_tip_to_predict) if opts.verbose: print "Wrote test tree to: %s" % curr_filepath #Write expected trait table base_name = "--".join(map(str,["exp_traits",opts.method,curr_dist,safe_tip_to_predict])) exp_trait_table_lines = [trait_table_header] exp_trait_table_lines.append("\t".join(expected_traits)+"\n") #print "Expected_trait_table_lines:",exp_trait_table_lines 
filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing expected trait table to:", filename f=open(filename,"w") f.write("".join(exp_trait_table_lines)) f.close() #Output a transposed, BIOM format expectation table for comparison with predict_traits output #NOTE: this is a clumsy way of getting the translated trait table # but more elegant, direct methods (directly feeding data to biom's table_factory) # weren't working for me readily. In the future, we should streamline this process # Leaving as is for now since this code is mostly for developers so speed/elegence # are probably not essential here. #Let the hackishness begin #Reload the tab-delimited trait table header, fields = parse_trait_table(open(filename,"U")) fields = [f for f in fields] #converts generator to list #Transpose table for .BIOM format so that Observation ids are KOs transposed_header, transposed_trait_table_lines =\ transpose_trait_table_fields(fields,header,\ id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t") #Eliminate newline in header trans_trait_table_lines = [transposed_header.strip()] trans_trait_table_lines.extend(["\t".join(r) for r in transposed_trait_table_lines]) trans_trait_table = '\n'.join(trans_trait_table_lines) #Write BIOM format expected trait table base_name = "--".join(map(str,["exp_biom_traits",opts.method,curr_dist,safe_tip_to_predict])) expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\ table_format = "tab-delimited") #print "Expected_trait_table_lines:",exp_trait_table_lines filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing BIOM-format expected trait table to:", filename f=open(filename,"w") f.write(format_biom_table(expected_biom_table)) f.close() #Write test trait table test_trait_table_fields = test_trait_table_fields if expected_traits in test_trait_table_fields: test_trait_table_fields.remove(expected_traits) test_trait_table_lines = [trait_table_header] 
test_trait_table_lines.extend(["\t".join(r)+"\n" for r in test_trait_table_fields]) #print "Test_trait_table_lines:",test_trait_table_lines base_name = "--".join(map(str,["test_trait_table",opts.method,curr_dist,safe_tip_to_predict])) filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing test trait table to:", filename f=open(filename,"w") f.write("".join(test_trait_table_lines)) f.close() if opts.verbose: print "Done generating test datasets"
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) #if we specify we want NSTI only then we have to calculate it first if opts.output_accuracy_metrics_only: opts.calculate_accuracy_metrics = True if opts.verbose: print "Loading tree from file:", opts.tree # Load Tree #tree = LoadTree(opts.tree) tree = load_picrust_tree(opts.tree, opts.verbose) table_headers = [] traits = {} #load the asr trait table using the previous list of functions to order the arrays if opts.reconstructed_trait_table: table_headers,traits =\ update_trait_dict_from_file(opts.reconstructed_trait_table) #Only load confidence intervals on the reconstruction #If we actually have ASR values in the analysis if opts.reconstruction_confidence: if opts.verbose: print "Loading ASR confidence data from file:",\ opts.reconstruction_confidence print "Assuming confidence data is of type:", opts.confidence_format asr_confidence_output = open(opts.reconstruction_confidence) asr_min_vals,asr_max_vals, params,column_mapping =\ parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format) if 'sigma' in params: brownian_motion_parameter = params['sigma'][0] else: brownian_motion_parameter = None if opts.verbose: print "Done. Loaded %i confidence interval values." % ( len(asr_max_vals)) print "Brownian motion parameter:", brownian_motion_parameter else: brownian_motion_parameter = None #load the trait table into a dict with organism names as keys and arrays as functions table_headers,genome_traits =\ update_trait_dict_from_file(opts.observed_trait_table,table_headers) #Combine the trait tables overwriting the asr ones if they exist in the genome trait table. traits.update(genome_traits) # Specify the attribute where we'll store the reconstructions trait_label = "Reconstruction" if opts.verbose: print "Assigning traits to tree..." 
# Decorate tree using the traits tree = assign_traits_to_tree(traits, tree, trait_label=trait_label) if opts.reconstruction_confidence: if opts.verbose: print "Assigning trait confidence intervals to tree..." tree = assign_traits_to_tree(asr_min_vals,tree,\ trait_label="lower_bound") tree = assign_traits_to_tree(asr_max_vals,tree,\ trait_label="upper_bound") if brownian_motion_parameter is None: if opts.verbose: print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..." brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\ upper_bound_trait_label="upper_bound",\ lower_bound_trait_label="lower_bound",\ trait_label=trait_label,\ confidence=0.95) if opts.verbose: print "Inferred the following rate parameters:", brownian_motion_parameter if opts.verbose: print "Collecting list of nodes to predict..." #Start by predict all tip nodes. nodes_to_predict = [tip.Name for tip in tree.tips()] if opts.verbose: print "Found %i nodes to predict." % len(nodes_to_predict) if opts.limit_predictions_to_organisms: organism_id_str = opts.limit_predictions_to_organisms ok_organism_ids = organism_id_str.split(',') ok_organism_ids = [n.strip() for n in ok_organism_ids] for f in set_label_conversion_fns(True, True): ok_organism_ids = [f(i) for i in ok_organism_ids] if opts.verbose: print "Limiting predictions to user-specified ids:",\ ",".join(ok_organism_ids) if not ok_organism_ids: raise RuntimeError(\ "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\ % opts.limit_predictions_to_organisms) nodes_to_predict =\ [n for n in nodes_to_predict if n in ok_organism_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the commmand-line and tree ids in the same format? 
Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0])) if opts.verbose: print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" % ( len(nodes_to_predict)) if opts.limit_predictions_by_otu_table: if opts.verbose: print "Limiting predictions to ids in user-specified OTU table:",\ opts.limit_predictions_by_otu_table otu_table = open(opts.limit_predictions_by_otu_table, "U") #Parse OTU table for ids otu_ids =\ extract_ids_from_table(otu_table.readlines(),delimiter="\t") if not otu_ids: raise RuntimeError(\ "Found no valid ids in input OTU table: %s. Is the path correct?"\ % opts.limit_predictions_by_otu_table) nodes_to_predict =\ [n for n in nodes_to_predict if n in otu_ids] if not nodes_to_predict: raise RuntimeError(\ "Filtering by OTU table resulted in an empty set of nodes to predict. Are the OTU ids and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0])) if opts.verbose: print "After filtering by OTU table, %i nodes remain to be predicted" % ( len(nodes_to_predict)) # Calculate accuracy of PICRUST for the given tree, sequenced genomes # and set of ndoes to predict accuracy_metrics = ['NSTI'] accuracy_metric_results = None if opts.calculate_accuracy_metrics: if opts.verbose: print "Calculating accuracy metrics: %s" % ( [",".join(accuracy_metrics)]) accuracy_metric_results = {} if 'NSTI' in accuracy_metrics: nsti_result,min_distances =\ calc_nearest_sequenced_taxon_index(tree,\ limit_to_tips = nodes_to_predict,\ trait_label = trait_label, verbose=opts.verbose) #accuracy_metric_results['NSTI'] = nsti_result for organism in min_distances.keys(): accuracy_metric_results[organism] = { 'NSTI': min_distances[organism] } if opts.verbose: print "NSTI:", nsti_result if opts.output_accuracy_metrics_only: #Write accuracy metrics to file if opts.verbose: print "Writing accuracy 
metrics to file:", opts.output_accuracy_metrics f = open(opts.output_accuracy_metrics_only, 'w+') f.write("metric\torganism\tvalue\n") lines = [] for organism in accuracy_metric_results.keys(): for metric in accuracy_metric_results[organism].keys(): lines.append('\t'.join([metric,organism,\ str(accuracy_metric_results[organism][metric])])+'\n') f.writelines(sorted(lines)) f.close() exit() if opts.verbose: print "Generating predictions using method:", opts.prediction_method if opts.weighting_method == 'exponential': #For now, use exponential weighting weight_fn = make_neg_exponential_weight_fn(e) variances = None #Overwritten by methods that calc variance confidence_intervals = None #Overwritten by methods that calc variance if opts.prediction_method == 'asr_and_weighting': # Perform predictions using reconstructed ancestral states if opts.reconstruction_confidence: predictions,variances,confidence_intervals =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ lower_bound_trait_label="lower_bound",\ upper_bound_trait_label="upper_bound",\ calc_confidence_intervals = True,\ brownian_motion_parameter=brownian_motion_parameter,\ weight_fn =weight_fn,verbose=opts.verbose) else: predictions =\ predict_traits_from_ancestors(tree,nodes_to_predict,\ trait_label=trait_label,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'weighting_only': #Ignore ancestral information predictions =\ weighted_average_tip_prediction(tree,nodes_to_predict,\ trait_label=trait_label,\ weight_fn =weight_fn,verbose=opts.verbose) elif opts.prediction_method == 'nearest_neighbor': predictions = predict_nearest_neighbor(tree,nodes_to_predict,\ trait_label=trait_label,tips_only = True) elif opts.prediction_method == 'random_neighbor': predictions = predict_random_neighbor(tree,\ nodes_to_predict,trait_label=trait_label) if opts.verbose: print "Done making predictions." 
make_output_dir_for_file(opts.output_trait_table) out_fh = open(opts.output_trait_table, 'w') #Generate the table of biom predictions if opts.verbose: print "Converting results to .biom format for output..." biom_predictions=biom_table_from_predictions(predictions,table_headers,\ observation_metadata=None,\ sample_metadata=accuracy_metric_results,convert_to_int=False) if opts.verbose: print "Writing prediction results to file: ", opts.output_trait_table if opts.output_precalc_file_in_biom: #write biom table to file write_biom_table(biom_predictions, opts.output_trait_table) else: #convert to precalc (tab-delimited) format out_fh = open(opts.output_trait_table, 'w') out_fh.write(convert_biom_to_precalc(biom_predictions)) out_fh.close() #Write out variance information to file if variances: if opts.verbose: print "Converting variances to BIOM format" if opts.output_precalc_file_in_biom: suffix = '.biom' else: suffix = '.tab' biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base, extension = splitext(opts.output_trait_table) variance_outfile = outfile_base + "_variances" + suffix make_output_dir_for_file(variance_outfile) if opts.verbose: print "Writing variance information to file:", variance_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_variances, variance_outfile) else: open(variance_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_variances)) if confidence_intervals: if opts.verbose: print "Converting upper confidence interval values to BIOM format" biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base, extension = splitext(opts.output_trait_table) upper_CI_outfile = outfile_base + "_upper_CI" + suffix 
make_output_dir_for_file(upper_CI_outfile) if opts.verbose: print "Writing upper confidence limit information to file:", upper_CI_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_upper_CI, upper_CI_outfile) else: open(upper_CI_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_upper_CI)) biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\ observation_metadata=None,\ sample_metadata=None,convert_to_int=False) outfile_base, extension = splitext(opts.output_trait_table) lower_CI_outfile = outfile_base + "_lower_CI" + suffix make_output_dir_for_file(lower_CI_outfile) if opts.verbose: print "Writing lower confidence limit information to file", lower_CI_outfile if opts.output_precalc_file_in_biom: write_biom_table(biom_prediction_lower_CI, lower_CI_outfile) else: open(lower_CI_outfile,'w').write(\ convert_biom_to_precalc(biom_prediction_lower_CI))
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) otu_table = load_table(opts.input_otu_fp) ids_to_load = otu_table.ids(axis='observation') if(opts.input_count_fp is None): #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz) precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz']) input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name) else: input_count_table=opts.input_count_fp if opts.verbose: print "Loading trait table: ", input_count_table ext=path.splitext(input_count_table)[1] if (ext == '.gz'): count_table_fh = gzip.open(input_count_table,'rb') else: count_table_fh = open(input_count_table,'U') if opts.load_precalc_file_in_biom: count_table = load_table(count_table_fh) else: count_table = convert_precalc_to_biom(count_table_fh, ids_to_load) #Need to only keep data relevant to our otu list ids=[] for x in otu_table.iter(axis='observation'): ids.append(str(x[1])) ob_id=count_table.ids(axis='observation')[0] filtered_otus=[] filtered_values=[] for x in ids: if count_table.exists(x, axis='sample'): filtered_otus.append(x) filtered_values.append(otu_table.data(x, axis='observation')) filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids()) copy_numbers_filtered={} for x in filtered_otus: value = count_table.get_value_by_ids(ob_id,x) try: #data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError,\ "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." 
copy_numbers_filtered[x]={opts.metadata_identifer:value} filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation') def metadata_norm(v, i, md): return v / float(md[opts.metadata_identifer]) normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation') #move Observation Metadata from original to filtered OTU table normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation') make_output_dir_for_file(opts.output_otu_fp) write_biom_table(normalized_table, opts.output_otu_fp)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) verbose=opts.verbose min_args = 1 if len(args) < min_args: option_parser.error('One or more predicted biom files must be provided.') observed_files=args make_output_dir_for_file(opts.output_fp) out_fh=open(opts.output_fp,'w') if verbose: print "Loading expected trait table file:",opts.exp_trait_table_fp exp_table =parse_biom_table(open(opts.exp_trait_table_fp,'U')) header_printed=False header_keys=[] delimiter="\t" for observed_file in observed_files: observed_file_name=basename(observed_file) if verbose: print "Loading predicted trait table file:",observed_file_name obs_table =parse_biom_table(open(observed_file,'U')) if opts.compare_observations: if verbose: print "Transposing tables to allow evaluation of observations (instead of samples)..." obs_table=transpose_biom(obs_table) exp_table=transpose_biom(exp_table) if verbose: print "Matching predicted and expected tables..." obs,exp=match_biom_tables(obs_table,exp_table,verbose=verbose,limit_to_expected_observations=opts.limit_to_expected_observations,limit_to_observed_observations=opts.limit_to_observed_observations,normalize=opts.normalize,shuffle_samples=opts.shuffle_samples) if verbose: print "Calculating accuracy stats for all observations..." #import pdb; pdb.set_trace() for i in obs: if verbose: print "Calculating stats for: ",i if opts.not_relative_abundance_scores: results=calculate_accuracy_stats_from_observations(obs[i],exp[i],success_criterion='binary') else: results=calculate_accuracy_stats_from_observations(obs[i],exp[i],success_criterion='ra_exact') #If first pass then print out header if not header_printed: header_printed=True header_keys=sorted(results.keys()) out_fh.write(delimiter.join(['file','label']+header_keys)+"\n") #print results using same order as header values=[observed_file_name,i]+['{0:.3g}'.format(results[x]) for x in header_keys] out_str=delimiter.join(map(str,values))+"\n" out_fh.write(out_str)
def main():
    """Write one shell command line per test dataset and submit them as jobs.

    Test datasets are discovered from the 'exp_traits--*' files in the input
    directory. For each dataset a command line is written to a temporary
    jobs file: either a predict_traits.py call alone (nearest_neighbor
    method) or an ancestral_state_reconstruction.py call followed by a
    predict_traits.py call. Datasets whose prediction file already exists
    are skipped unless --force is given. The jobs file is then passed to
    submit_jobs together with the script selected by the parallel method.

    Raises ValueError for the option combinations checked below and
    RuntimeError for a parallel method other than 'sge', 'multithreaded'
    or 'torque'.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir=opts.input_dir
    # output falls back to the input dir, tmp falls back to the output dir
    output_dir=opts.output_dir or input_dir
    tmp_dir=opts.tmp_dir or output_dir
    parallel_method=opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method

    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError('You probably dont want to run multithreaded evaluations with a large num_jobs. Please adjust options num_jobs and or parallel_method')

    if opts.with_confidence and asr_method not in ['ace_ml','ace_reml']:
        raise ValueError("PICRUST currently only supports confidence intervals with the ace_ml and ace_reml ASR methods")

    if opts.verbose:
        print "Reconstruction method:",asr_method
        print "Prediction method:",predict_traits_method
        print "Parallel method:",parallel_method
        print "num_jobs:",opts.num_jobs
        print "\nOutput will be saved here:'%s'" %output_dir

    #create the output directory unless it already exists
    make_output_dir(output_dir)

    # pick the job-launching script that matches the parallel method
    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    #get the test datasets to run in the input directory (based on exp_traits files)
    expect_test_files=glob(join(input_dir,'exp_traits--*'))

    test_datasets={}
    for file_name in expect_test_files:
        # the test id is whatever follows the 'exp_traits--' prefix
        test_id=file_name.replace(join(input_dir,'exp_traits--'),'',1)
        #create a dict with the test files as values in the ref list
        # order: [test trait table, test tree, expected traits]
        test_datasets[test_id]=[ join(input_dir,'test_trait_table--'+test_id),join(input_dir,'test_tree--'+test_id),join(input_dir,'exp_traits--'+test_id)]

    created_tmp_files=[]
    output_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    #get location of scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    #run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp=join(output_dir,'asr--'+asr_method+'--'+test_id)
        asr_params_out_fp=join(output_dir,'--'.join(['asr',asr_method,'asr_params',test_id]))
        created_tmp_files.append(asr_out_fp)

        if opts.check_for_null_files and exists(asr_out_fp) and file_contains_nulls(asr_out_fp):
            #remove file so that the existence check below triggers a re-run
            if opts.verbose:
                print "Existing ASR file contains null characters. Will run ASR again after removing: "+asr_out_fp
            remove(asr_out_fp)

        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(asr_out_fp)
            # a harmless echo stands in for the ASR step in the job line
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" %(test_id,asr_out_fp)
        else:
            #create the asr command
            asr_cmd= """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(asr_script_fp, test_datasets[test_id][0], test_datasets[test_id][1], asr_method, asr_out_fp, asr_params_out_fp)

        predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
            opts.weighting_method,test_id]))

        if opts.with_accuracy:
            predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
                opts.weighting_method,'accuracy_metrics',test_id]))

        if opts.check_for_null_files and exists(predict_traits_out_fp) and file_contains_nulls(predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. Will run it again after removing: "+predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(predict_traits_out_fp)
            # nothing is written to the jobs file for this dataset
            continue

        output_files.append(predict_traits_out_fp)

        # NOTE(review): `split` is called with the separator first, so it is
        # presumably re.split rather than os.path.split -- confirm against
        # the file's imports. The third '--'-separated field of the test id
        # is used as the genome id.
        genome_id=split('--',test_id)[2]

        if predict_traits_method == 'nearest_neighbor':
            #don't do asr step
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(predict_traits_script_fp, test_datasets[test_id][0], opts.ref_tree, genome_id, predict_traits_out_fp,predict_traits_method)
            jobs.write(predict_traits_cmd+"\n")
        else:
            #create the predict traits command
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\
                test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method)

            #Instruct predict_traits to use confidence intervals output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' %(asr_params_out_fp)
                predict_traits_cmd = predict_traits_cmd + confidence_param

            #Instruct predict traits to output the NSTI measure of distance to
            #nearby sequences.
            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' %(predict_traits_accuracy_out_fp)
                predict_traits_cmd = predict_traits_cmd + accuracy_param

            #add job command to the jobs file; ASR and prediction run in
            #sequence on one line, separated by ';'
            jobs.write(asr_cmd+';'+predict_traits_cmd+"\n")

    jobs.close()

    #submit the jobs
    job_prefix='eval_'

    if opts.verbose:
        print "Submitting jobs:",cluster_jobs_fp,jobs_fp,job_prefix,opts.num_jobs
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs)
def main():
    """Script entry point: parse the command line described by script_info.

    The parsed option parser, options and positional arguments are bound
    locally; nothing further is done with them in this function.
    """
    parsed = parse_command_line_parameters(**script_info)
    option_parser, opts, args = parsed
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.limit_to_function: limit_to_functions = opts.limit_to_function.split(',') if opts.verbose: print "Limiting output to only functions:",limit_to_functions else: limit_to_functions = [] if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = load_table(opts.input_otu_table) ids_to_load = otu_table.ids(axis='observation') if(opts.input_count_table is None): #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz) precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz']) input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name) else: input_count_table=opts.input_count_table if opts.verbose: print "Loading trait table: ", input_count_table ext=path.splitext(input_count_table)[1] if opts.verbose: print "Loading count table: ", input_count_table if (ext == '.gz'): genome_table_fh = gzip.open(input_count_table,'rb') else: genome_table_fh = open(input_count_table,'U') #In the genome/trait table genomes are the samples and #genes are the observations if opts.load_precalc_file_in_biom: if not opts.suppress_subset_loading: #Now we want to use the OTU table information #to load only rows in the count table corresponding #to relevant OTUs if opts.verbose: print "Loading traits for %i organisms from the trait table" %len(ids_to_load) genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples') else: if opts.verbose: print "Loading *full* count table because --suppress_subset_loading was passed. 
This may result in high memory usage" genome_table = load_table(genome_table_fh) else: genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load) ok_functional_categories = None metadata_type = None if opts.limit_to_functional_categories: ok_functional_categories = opts.limit_to_functional_categories.split("|") if opts.verbose: print "Limiting to functional categories: %s" %(str(ok_functional_categories)) # Either KEGG_Pathways or COG_Category needs # to be assigned to metadata_key to limit to # functional categories (not needed for # individual functions) if opts.type_of_prediction == "ko": metadata_type = "KEGG_Pathways" elif opts.type_of_prediction == "cog": metadata_type = "COG_Category" elif opts.type_of_prediction == "rfam": exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)") partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions,\ limit_to_functional_categories = ok_functional_categories , metadata_key = metadata_type ) output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes]) if opts.verbose: print "Writing results to output file: ",opts.output_fp make_output_dir_for_file(opts.output_fp) open(opts.output_fp,'w').write(output_text)