def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.parallel:
        tmp_dir = 'jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(
            tree=opts.input_tree_fp,
            table=opts.input_trait_table_fp,
            asr_method=opts.asr_method,
            parallel_method=opts.parallel_method,
            num_jobs=opts.num_jobs,
            tmp_dir=tmp_dir,
            verbose=opts.verbose)
    else:
        # call the appropriate ASR app controller
        if opts.asr_method == 'wagner':
            asr_table = wagner_for_picrust(opts.input_tree_fp,
                                           opts.input_trait_table_fp,
                                           HALT_EXEC=opts.debug)
        elif opts.asr_method == 'bayestraits':
            # not yet implemented: asr_table is left undefined on this path
            pass
        elif opts.asr_method == 'ace_ml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'ML', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_pic':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'pic', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_reml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'REML', HALT_EXEC=opts.debug)

    # output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp, sep='\t')

    # output the CI table (wagner does not produce confidence intervals)
    if opts.asr_method != 'wagner':
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp, sep='\t')
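# The if/elif chain above can equivalently be expressed as a dispatch table.
# A minimal sketch of that refactor (not part of the original script;
# 'bayestraits' is omitted because it is unimplemented above, and wagner
# returns None in place of a CI table):
ACE_METHODS = {'ace_ml': 'ML', 'ace_pic': 'pic', 'ace_reml': 'REML'}

def run_asr(method, tree_fp, table_fp, debug=False):
    """Dispatch to the appropriate ASR app controller."""
    if method == 'wagner':
        return wagner_for_picrust(tree_fp, table_fp, HALT_EXEC=debug), None
    if method in ACE_METHODS:
        return ace_for_picrust(tree_fp, table_fp, ACE_METHODS[method],
                               HALT_EXEC=debug)
    raise ValueError("Unknown ASR method: %s" % method)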
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # set some defaults for the options
    input_dir = opts.input_dir
    output_dir = opts.output_dir or input_dir
    tmp_dir = opts.tmp_dir or output_dir
    parallel_method = opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method

    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError(
            "You probably don't want to run multithreaded evaluations with a "
            "large num_jobs. Please adjust the num_jobs and/or parallel_method options.")

    if opts.with_confidence and asr_method not in ['ace_ml', 'ace_reml']:
        raise ValueError(
            "PICRUSt currently only supports confidence intervals with the "
            "ace_ml and ace_reml ASR methods")

    if opts.verbose:
        print "Reconstruction method:", asr_method
        print "Prediction method:", predict_traits_method
        print "Parallel method:", parallel_method
        print "num_jobs:", opts.num_jobs
        print "\nOutput will be saved here: '%s'" % output_dir

    # create the output directory unless it already exists
    make_output_dir(output_dir)

    if parallel_method == 'sge':
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif parallel_method == 'multithreaded':
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif parallel_method == 'torque':
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel method: %s" % parallel_method)

    # get the test datasets to run from the input directory
    # (based on the exp_traits files)
    expect_test_files = glob(join(input_dir, 'exp_traits--*'))

    test_datasets = {}
    for file_name in expect_test_files:
        test_id = file_name.replace(join(input_dir, 'exp_traits--'), '', 1)
        # create a dict with the test files as values in the ref list
        test_datasets[test_id] = [
            join(input_dir, 'test_trait_table--' + test_id),
            join(input_dir, 'test_tree--' + test_id),
            join(input_dir, 'exp_traits--' + test_id)
        ]

    created_tmp_files = []
    output_files = []

    # create a tmp file to store the job commands
    # (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    # get the location of the scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(), 'scripts',
                         'ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(), 'scripts',
                                    'predict_traits.py')

    # run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp = join(output_dir, 'asr--' + asr_method + '--' + test_id)
        asr_params_out_fp = join(
            output_dir, '--'.join(['asr', asr_method, 'asr_params', test_id]))
        created_tmp_files.append(asr_out_fp)

        if opts.check_for_null_files and exists(asr_out_fp) \
                and file_contains_nulls(asr_out_fp):
            # remove the damaged file so ASR is re-run for it
            if opts.verbose:
                print "Existing ASR file contains null characters. " \
                      "Will run ASR again after removing: " + asr_out_fp
            remove(asr_out_fp)

        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(asr_out_fp)
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" % (
                test_id, asr_out_fp)
        else:
            # create the asr command
            asr_cmd = """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(
                asr_script_fp, test_datasets[test_id][0],
                test_datasets[test_id][1], asr_method, asr_out_fp,
                asr_params_out_fp)

        predict_traits_out_fp = join(
            output_dir, '--'.join(['predict_traits', predict_traits_method,
                                   opts.weighting_method, test_id]))

        if opts.with_accuracy:
            predict_traits_accuracy_out_fp = join(
                output_dir, '--'.join(['predict_traits', predict_traits_method,
                                       opts.weighting_method,
                                       'accuracy_metrics', test_id]))

        if opts.check_for_null_files and exists(predict_traits_out_fp) \
                and file_contains_nulls(predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. " \
                      "Will run it again after removing: " + predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism.".format(predict_traits_out_fp)
            continue

        output_files.append(predict_traits_out_fp)

        genome_id = split('--', test_id)[2]

        if predict_traits_method == 'nearest_neighbor':
            # nearest neighbor prediction doesn't need the ASR step
            predict_traits_cmd = """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(
                predict_traits_script_fp, test_datasets[test_id][0],
                opts.ref_tree, genome_id, predict_traits_out_fp,
                predict_traits_method)
            jobs.write(predict_traits_cmd + "\n")
        else:
            # create the predict traits command
            predict_traits_cmd = """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(
                predict_traits_script_fp, test_datasets[test_id][0],
                opts.ref_tree, asr_out_fp, genome_id, predict_traits_out_fp,
                predict_traits_method, opts.weighting_method)

            # instruct predict_traits to use the confidence intervals
            # output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' % asr_params_out_fp
                predict_traits_cmd = predict_traits_cmd + confidence_param

            # instruct predict_traits to output the NSTI measure of distance
            # to nearby sequences
            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' % predict_traits_accuracy_out_fp
                predict_traits_cmd = predict_traits_cmd + accuracy_param

            # add the job command to the jobs file
            jobs.write(asr_cmd + ';' + predict_traits_cmd + "\n")

    jobs.close()

    # submit the jobs
    job_prefix = 'eval_'
    if opts.verbose:
        print "Submitting jobs:", cluster_jobs_fp, jobs_fp, job_prefix, opts.num_jobs
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs)
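# file_contains_nulls() is imported from the PICRUSt library; a minimal
# sketch of the kind of check it performs is shown below. This is an
# illustrative assumption, not the project's actual implementation.
def file_contains_nulls_sketch(fp, chunk_size=65536):
    """Return True if the file at fp contains any NUL (0x00) bytes."""
    with open(fp, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return False
            if '\x00' in chunk:
                return True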
def main():
    # Parse input to get parameters
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    tree_file = opts.input_tree
    trait_table_fp = opts.input_trait_table
    verbose = opts.verbose

    # Set output base file names
    trait_table_base = 'trait_table.tab'
    pruned_tree_base = 'pruned_tree.newick'
    reference_tree_base = 'reference_tree.newick'

    output_dir = make_output_dir(opts.output_dir, strict=False)
    output_table_fp = join(output_dir, trait_table_base)
    output_tree_fp = join(output_dir, pruned_tree_base)
    output_reference_tree_fp = join(output_dir, reference_tree_base)

    # Handle parameters with more complex defaults
    delimiter_map = {"space": " ", "tab": "\t", "comma": ","}
    input_delimiter = delimiter_map[opts.input_table_delimiter]
    output_delimiter = delimiter_map[opts.output_table_delimiter]

    if verbose:
        print "Running with options:"
        print "\t%s:%s" % ("Tree file", tree_file)
        print "\t%s:%s" % ("Trait table", trait_table_fp)
        print "\t%s:%s" % ("Output tree", output_tree_fp)
        print "\t%s:%s" % ("Output reference tree", output_reference_tree_fp)
        print "\t%s:%s" % ("Output trait table", output_table_fp)
        print "\t%s:%s" % ("Add branch length to root",
                           opts.add_branch_length_to_root)
        print "\t%s:%s" % ("Convert to NEXUS?", opts.convert_to_nexus)
        print "\t%s:%s" % ("Input trait table delimiter",
                           opts.input_table_delimiter)
        print "\t%s:%s" % ("Output trait table delimiter",
                           opts.output_table_delimiter)

    # Begin reformatting
    root_name = "root"

    if opts.no_minimum_branch_length:
        min_branch_length = None
    else:
        min_branch_length = 0.0001

    # Load inputs
    if verbose:
        print "Loading tree..."
    input_tree = DndParser(open(tree_file))

    if verbose:
        print "Loading trait table..."
    trait_table = open(trait_table_fp, "U")
    trait_table_lines = trait_table.readlines()

    if not trait_table_lines:
        raise IOError("No lines could be loaded from file %s. "
                      "Please check the input file." % trait_table_fp)

    # Get id mappings from the mapping file
    if opts.tree_to_trait_mapping:
        if verbose:
            print "Loading tree to trait table mapping file..."
        mapping_file = open(opts.tree_to_trait_mapping, "U")
        trait_to_tree_mapping =\
            make_id_mapping_dict(parse_id_mapping_file(mapping_file))
    else:
        if verbose:
            print "No tree to trait mapping file specified. Assuming tree tip names and trait table names will match exactly."
        trait_to_tree_mapping = None

    # Call the reformatting function with the specified parameters
    # to get the reference tree
    if opts.verbose:
        print "**BUILDING REFERENCE TREE (without respect to trait table)**"

    new_reference_tree, not_useful_trait_table_lines =\
        reformat_tree_and_trait_table(
            tree=input_tree,
            trait_table_lines=[],
            trait_to_tree_mapping=None,
            input_trait_table_delimiter=None,
            output_trait_table_delimiter=None,
            filter_table_by_tree_tips=False,
            convert_trait_floats_to_ints=False,
            filter_tree_by_table_entries=False,
            convert_to_bifurcating=True,
            add_branch_length_to_root=False,
            name_unnamed_nodes=True,
            min_branch_length=min_branch_length,
            verbose=opts.verbose)

    # Make a copy
    new_reference_tree_copy = new_reference_tree.deepcopy()

    if opts.verbose:
        print "**BUILDING PRUNED TREE AND TRAIT TABLE**"

    # Call the reformatting function again to build the pruned tree
    # and the matching trait table
    new_tree, new_trait_table_lines = \
        reformat_tree_and_trait_table(
            tree=new_reference_tree_copy,
            trait_table_lines=trait_table_lines,
            trait_to_tree_mapping=trait_to_tree_mapping,
            input_trait_table_delimiter=input_delimiter,
            output_trait_table_delimiter=output_delimiter,
            filter_table_by_tree_tips=True,
            convert_trait_floats_to_ints=False,
            filter_tree_by_table_entries=True,
            convert_to_bifurcating=False,
            add_branch_length_to_root=False,
            name_unnamed_nodes=False,
            min_branch_length=min_branch_length,
            verbose=opts.verbose)

    # Alter the reference tree to contain only tips in the OTU table
    # (and, of course, the trait table)
    if opts.limit_tree_to_otus_fp:
        if opts.verbose:
            print "Pruning reference tree to contain only tips in OTU table (and trait table)..."
        otu_table = open(opts.limit_tree_to_otus_fp, "U")
        otu_table_lines = otu_table.readlines()
        header_line, otu_table_fields = parse_trait_table(
            otu_table_lines, delimiter=input_delimiter, has_header=False)
        header_line, trait_table_fields =\
            parse_trait_table(new_trait_table_lines, delimiter=input_delimiter)

        tips_to_keep = list(otu_table_fields) + list(trait_table_fields)
        tips_to_keep_in_tree = filter_table_by_presence_in_tree(
            new_reference_tree_copy, tips_to_keep)
        new_reference_tree = filter_tree_tips_by_presence_in_table(
            new_reference_tree_copy, tips_to_keep_in_tree,
            verbose=opts.verbose)

    if opts.verbose:
        print "Almost finished. Writing trees and trait table to files..."

    # Write results to files

    # Open output files
    output_trait_table_file = open(output_table_fp, "w+")
    output_tree_file = open(output_tree_fp, "w+")
    output_reference_tree_file = open(output_reference_tree_fp, "w+")

    # Output the trait table file
    if opts.verbose:
        print "Writing trait table to:", output_table_fp
    output_trait_table_file.write("\n".join(new_trait_table_lines))
    trait_table.close()
    output_trait_table_file.close()

    # Output the tree file
    if opts.verbose:
        print "Writing pruned tree to:", output_tree_fp
    if opts.convert_to_nexus is True:
        lines = nexus_lines_from_tree(new_tree)
        output_tree_file.write("\n".join(map(str, lines)))
    else:
        output_tree_file.write(new_tree.getNewick(with_distances=True))
    output_tree_file.close()

    # Output the reference tree file
    if opts.verbose:
        print "Writing reference tree to:", output_reference_tree_fp
    output_reference_tree_file.write(
        new_reference_tree.getNewick(with_distances=True))
    output_reference_tree_file.close()
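# make_id_mapping_dict() and parse_id_mapping_file() are imported from the
# PICRUSt library. A minimal sketch of the mapping step, assuming the mapping
# file is tab-delimited with one "trait_table_id<TAB>tree_id" pair per line
# (an assumption for illustration; the real parsers may differ):
def parse_id_mapping_file_sketch(lines):
    """Yield (trait_table_id, tree_id) tuples from a tab-delimited file."""
    for line in lines:
        fields = line.strip().split("\t")
        if len(fields) >= 2:
            yield fields[0], fields[1]

def make_id_mapping_dict_sketch(pairs):
    """Build a dict mapping trait table ids to tree tip ids."""
    return dict(pairs)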
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    pool_by = opts.pool_by.split(',')

    # create the output directory
    make_output_dir(opts.output_dir)

    # Example of a fully specified field order:
    # file_name_field_order = {'file_type': 0, "prediction_method": 1,
    #                          "weighting_method": 2, "holdout_method": 3,
    #                          "distance": 4, "organism": 5}

    # Construct a dict from the user-specified field order
    file_name_field_order = {}
    for i, field in enumerate(opts.field_order.split(',')):
        file_name_field_order[field] = i
    if opts.verbose:
        print "Assuming file names are in this order:", file_name_field_order

    for k in pool_by:
        # Check that we're only pooling by values that exist
        if k not in file_name_field_order.keys():
            err_text =\
                "Bad value for option '--pool_by'. Can't pool by '%s'. Valid categories are: %s" % (
                    k, ",".join(file_name_field_order.keys()))
            raise ValueError(err_text)

    if opts.verbose:
        print "Pooling results by:", pool_by

    roc_success_criteria = ['binary', 'exact', 'int_exact']

    scatter_lines, correlation_lines, roc_result_lines, roc_auc_lines =\
        evaluate_test_dataset_dir(
            opts.trait_table_dir,
            opts.exp_trait_table_dir,
            file_name_delimiter="--",
            file_name_field_order=file_name_field_order,
            pool_by=pool_by,
            roc_success_criteria=roc_success_criteria,
            verbose=opts.verbose)

    # Output scatter data
    output_fp = join(opts.output_dir, 'evaluation_scatter_data.tab')
    if opts.verbose:
        print "Writing scatter plot data to:", output_fp
    f = open(output_fp, "w+")
    f.writelines(scatter_lines)
    f.close()

    # Output correlation data
    output_fp = join(opts.output_dir, 'evaluation_correlation_data.tab')
    if opts.verbose:
        print "Writing correlation data to:", output_fp
    f = open(output_fp, "w+")
    f.writelines(correlation_lines)
    f.close()

    # Output raw ROC plot data
    if opts.verbose:
        print "Writing ROC data..."
    for c in roc_result_lines.keys():
        output_fp = join(opts.output_dir, 'evaluation_roc_data_%s.tab' % c)
        if opts.verbose:
            print "Outputting ROC data for success criterion %s to: %s" % (c, output_fp)
        f = open(output_fp, "w+")
        f.writelines(roc_result_lines[c])
        f.close()

    # Output summary ROC AUC data
    if opts.verbose:
        print "Writing ROC AUC data..."
    for c in roc_auc_lines.keys():
        output_fp = join(opts.output_dir, 'evaluation_roc_auc_data_%s.tab' % c)
        if opts.verbose:
            print "Outputting ROC AUC data for success criterion %s to: %s" % (c, output_fp)
        f = open(output_fp, "w+")
        f.writelines(roc_auc_lines[c])
        f.close()
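# The evaluation files are expected to encode metadata in their names,
# delimited by "--", with positions given by file_name_field_order. A minimal
# sketch of how such a name maps to fields (the file name and field values
# below are hypothetical):
def parse_eval_file_name(file_name, field_order, delimiter="--"):
    """Map each configured field name to its value in file_name."""
    parts = file_name.split(delimiter)
    return dict((field, parts[i]) for field, i in field_order.items())

# parse_eval_file_name("predict_traits--asr--weighted--exclude--0.03--tipA",
#                      {'file_type': 0, 'prediction_method': 1,
#                       'weighting_method': 2, 'holdout_method': 3,
#                       'distance': 4, 'organism': 5})
# -> {'file_type': 'predict_traits', ..., 'distance': '0.03',
#     'organism': 'tipA'}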
def main(): """Generate test trees given parameters""" option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading trait table..." input_trait_table = open(opts.input_trait_table,"U") if opts.verbose: print "Loading tree..." #PicrustNode seems to run into very slow/memory intentsive perfromance... #tree = DndParser(open(opts.input_tree),constructor=PicrustNode) tree = DndParser(open(opts.input_tree)) if opts.verbose: print "Parsing trait table..." #Find which taxa are to be used in tests #(by default trait table taxa) trait_table_header,trait_table_fields = \ parse_trait_table(input_trait_table) if opts.verbose: print "Ensuring tree and trait table labels are formatted consistently..." label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose) fix_tree_labels(tree,label_conversion_fns) trait_table_fields = convert_trait_table_entries(trait_table_fields,\ value_conversion_fns = [],\ label_conversion_fns = label_conversion_fns) trait_table_fields = [t for t in trait_table_fields] print "Number of trait table fields with single quotes:",\ len([t for t in trait_table_fields if "'" in t[0]]) if opts.verbose: print "Making output directory..." make_output_dir(opts.output_dir) if opts.limit_to_tips: included_tips = opts.limit_to_tips.split(",") if opts.verbose: print "Limiting test datasets to %i tips: %s" %(len(included_tips),included_tips) else: included_tips = False method_fns =\ {"exclude_tips_by_distance":\ make_distance_based_exclusion_fn,\ "randomize_tip_labels_by_distance":\ make_distance_based_tip_label_randomizer } test_fn_factory = method_fns[opts.method] if opts.verbose: print "Setting tree modification method to:", opts.method print "(%s)" % test_fn_factory.__doc__ modify_tree = True if opts.suppress_tree_modification: if opts.verbose: print "Suppressing modification of tree when making test datasets" modify_tree = False if opts.verbose: print "Starting generation of test datsets" test_datasets = \ yield_genome_test_data_by_distance(tree,trait_table_fields,\ test_fn_factory,min_dist = opts.min_dist,\ max_dist=opts.max_dist,increment=opts.dist_increment,\ modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose) if opts.verbose: print "Writing files for test datasets" for curr_dist,test_tree,tip_to_predict,\ expected_traits,test_trait_table_fields in test_datasets: if included_tips is not False: if tip_to_predict not in included_tips: if opts.verbose: print "Skipping tip %s: limiting to tip(s): %s" %(tip_to_predict,included_tips) continue #Make a safe version of tip to predict # So odd characters like | don't mess up OS safe_tip_to_predict = "'%s'"%tip_to_predict #Write tree base_name = "--".join(map(str,["test_tree",opts.method,curr_dist])) curr_filepath = write_tree(opts.output_dir,base_name,test_tree,safe_tip_to_predict) if opts.verbose: print "Wrote test tree to: %s" % curr_filepath #Write expected trait table base_name = "--".join(map(str,["exp_traits",opts.method,curr_dist,safe_tip_to_predict])) exp_trait_table_lines = [trait_table_header] exp_trait_table_lines.append("\t".join(expected_traits)+"\n") #print "Expected_trait_table_lines:",exp_trait_table_lines filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing expected trait table to:", filename f=open(filename,"w") f.write("".join(exp_trait_table_lines)) f.close() #Output a transposed, BIOM format expectation table for comparison with predict_traits output #NOTE: this is a clumsy way of getting the translated trait table # 
but more elegant, direct methods (directly feeding data to biom's table_factory) # weren't working for me readily. In the future, we should streamline this process # Leaving as is for now since this code is mostly for developers so speed/elegence # are probably not essential here. #Let the hackishness begin #Reload the tab-delimited trait table header, fields = parse_trait_table(open(filename,"U")) fields = [f for f in fields] #converts generator to list #Transpose table for .BIOM format so that Observation ids are KOs transposed_header, transposed_trait_table_lines =\ transpose_trait_table_fields(fields,header,\ id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t") #Eliminate newline in header trans_trait_table_lines = [transposed_header.strip()] trans_trait_table_lines.extend(["\t".join(r) for r in transposed_trait_table_lines]) trans_trait_table = '\n'.join(trans_trait_table_lines) #Write BIOM format expected trait table base_name = "--".join(map(str,["exp_biom_traits",opts.method,curr_dist,safe_tip_to_predict])) expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\ table_format = "tab-delimited") #print "Expected_trait_table_lines:",exp_trait_table_lines filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing BIOM-format expected trait table to:", filename f=open(filename,"w") f.write(format_biom_table(expected_biom_table)) f.close() #Write test trait table test_trait_table_fields = test_trait_table_fields if expected_traits in test_trait_table_fields: test_trait_table_fields.remove(expected_traits) test_trait_table_lines = [trait_table_header] test_trait_table_lines.extend(["\t".join(r)+"\n" for r in test_trait_table_fields]) #print "Test_trait_table_lines:",test_trait_table_lines base_name = "--".join(map(str,["test_trait_table",opts.method,curr_dist,safe_tip_to_predict])) filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing test trait table to:", filename f=open(filename,"w") f.write("".join(test_trait_table_lines)) f.close() if opts.verbose: print "Done generating test datasets"
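# transpose_trait_table_fields() is imported from the PICRUSt library; the
# core operation is a row/column transpose. A minimal sketch of the idea
# using zip (an illustration, not the library's implementation):
def transpose_rows_sketch(header_fields, rows):
    """Transpose a table given as a header list plus rows of equal length."""
    table = [list(header_fields)] + [list(r) for r in rows]
    return [list(col) for col in zip(*table)]

# transpose_rows_sketch(['id', 'K00001', 'K00002'],
#                       [['tipA', '1', '0'], ['tipB', '2', '3']])
# -> [['id', 'tipA', 'tipB'], ['K00001', '1', '2'], ['K00002', '0', '3']]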
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    # Run the jobs
    script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py')

    if opts.parallel_method == 'sge':
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif opts.parallel_method == 'multithreaded':
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif opts.parallel_method == 'torque':
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel method: %s" % opts.parallel_method)

    if opts.verbose:
        print "Loading tree..."

    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print "Total number of possible tips to predict: {0}".format(len(all_tips))

    created_tmp_files = []
    output_files = {}
    output_files['counts'] = []
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(len(all_tips))

    # create a tmp file to store the job commands
    # (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    if opts.verbose:
        print "Creating temporary input files in:", tmp_dir

    # batch the tips into chunks of 1000 per job
    num_tips_per_job = 1000
    for tips_to_predict in [all_tips[i:i + num_tips_per_job]
                            for i in range(0, len(all_tips),
                                           num_tips_per_job)]:

        # create tmp output files
        tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str = ','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base, extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base + "_variances.tab")
            output_files['upper_CI'].append(outfile_base + "_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base + "_lower_CI.tab")

            # create the job command
            cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, opts.reconstruction_confidence,
                tip_to_predict_str, tmp_output_fp)
        else:
            cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, tip_to_predict_str,
                tmp_output_fp)

        # NOTE: Calculating NSTI this way is convenient, but it would probably
        # be faster to run the NSTI calculation separately (using
        # --output_accuracy_metrics_only) and add it to the output file later.
        if opts.calculate_accuracy_metrics:
            cmd = cmd + " -a"

        # add the job command to the jobs file
        jobs.write(cmd + "\n")

    jobs.close()

    # add all output files to the tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])

    if opts.verbose:
        print "Launching parallel jobs."

    # run the job commands
    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix,
                num_jobs=opts.num_jobs, delay=opts.delay)

    if opts.verbose:
        print "Jobs are now running. Will wait until finished."

    # wait until all jobs have finished (simple polling)
    wait_for_output_files(output_files['counts'])

    if opts.verbose:
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base, extension = splitext(opts.output_trait_table)

    for predict_type in sorted(output_files):
        # Combine the output files
        if opts.verbose:
            print "Combining all output files for " + predict_type
        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print "Writing combined file for " + predict_type

        if predict_type == 'counts':
            # Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table, 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table, 'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base + "_" + predict_type + ".biom", 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base + "_" + predict_type + ".tab", 'w').write(
                    combined_predictions)

    # clean up all tmp files
    for file in created_tmp_files:
        remove(file)
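# wait_for_output_files() is imported from the PICRUSt parallel utilities; a
# minimal polling sketch of the behavior relied on above (an assumption for
# illustration, not the project's actual implementation):
import time
import os.path

def wait_for_output_files_sketch(file_paths, poll_interval=60):
    """Block until every path in file_paths exists on disk."""
    pending = list(file_paths)
    while pending:
        pending = [fp for fp in pending if not os.path.exists(fp)]
        if pending:
            time.sleep(poll_interval)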