def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',
                        tmp_dir='jobs/', num_jobs=100, verbose=False):
    """Run one ancestral state reconstruction (ASR) job per trait, in parallel.

    The trait table is split into single-trait tables (column 0 plus one
    trait column each). For every trait a command line invoking
    scripts/ancestral_state_reconstruction.py is written to a temporary jobs
    file, which is then submitted through the launcher script selected by
    `parallel_method`. After wait_for_output_files() returns, the per-trait
    outputs are combined and all temporary files are removed.

    tree: value passed to the ASR script's -t option.
    table: filename of a tab-separated trait table with a header row.
    asr_method: value passed to the ASR script's -m option.
    parallel_method: one of 'sge', 'multithreaded' or 'torque'.
    tmp_dir: directory handed to get_tmp_filename() for every temporary file.
    num_jobs: forwarded to submit_jobs().
    verbose: if True, print progress messages.

    Returns a (combined_table, combined_ci_table) tuple of Table objects.

    Raises RuntimeError if `parallel_method` is not one of the values above.
    """
    # NOTE(review): other launchers in this file reference
    # 'start_parallel_jobs_*.py' (without 'picrust'); confirm which script
    # names actually exist in the scripts directory.
    launcher_scripts = {
        'sge': 'start_parallel_picrust_jobs_sge.py',
        'multithreaded': 'start_parallel_picrust_jobs.py',
        'torque': 'start_parallel_picrust_jobs_torque.py',
    }
    if parallel_method not in launcher_scripts:
        raise RuntimeError(
            "Unknown parallel_method {0!r}; expected one of: {1}".format(
                parallel_method, ', '.join(sorted(launcher_scripts))))

    scripts_dir = join(get_picrust_project_dir(), 'scripts')
    asr_script_fp = join(scripts_dir, 'ancestral_state_reconstruction.py')
    cluster_jobs_fp = join(scripts_dir, launcher_scripts[parallel_method])

    # Progress messages are printed as a single parenthesised string so the
    # same text is produced by the Python 2 print statement and the Python 3
    # print function.
    if verbose:
        print("Loading trait table...")

    table = LoadTable(filename=table, header=True, sep='\t')
    num_columns = table.Shape[1]

    created_tmp_files = []
    output_files = []
    ci_files = []

    # The jobs file holds one command per line and is passed to the
    # parallel launcher script.
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_asr_')
    created_tmp_files.append(jobs_fp)

    if verbose:
        print("{0} {1}".format("Creating temporary input files in: ",
                               tmp_dir))

    # The context manager guarantees the jobs file is closed (and flushed)
    # even if splitting or writing one of the trait tables fails.
    with open(jobs_fp, 'w') as jobs:
        # Column 0 is kept in every single-trait table; traits start at 1.
        for col_index in range(1, num_columns):
            single_col_table = table.getColumns([0, col_index])

            single_col_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='in_asr_')
            single_col_table.writeToFile(single_col_fp, sep='\t')
            created_tmp_files.append(single_col_fp)

            tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='out_asr_')
            output_files.append(tmp_output_fp)
            tmp_ci_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_asr_ci_')
            ci_files.append(tmp_ci_fp)

            cmd = "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(
                asr_script_fp, single_col_fp, tree, asr_method,
                tmp_output_fp, tmp_ci_fp)
            jobs.write(cmd + "\n")

    created_tmp_files.extend(output_files)
    created_tmp_files.extend(ci_files)

    if verbose:
        print("Launching parallel jobs.")

    job_prefix = 'asr'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=num_jobs)

    if verbose:
        print("Jobs are now running. Will wait until finished.")

    # NOTE(review): only the reconstruction outputs are waited on, not the
    # CI files; confirm each CI file is complete by the time this returns.
    wait_for_output_files(output_files)

    if verbose:
        print("Jobs are done running. Now combining all tmp files.")

    combined_rows = combine_asr_tables(output_files)
    combined_ci_rows = combine_asr_tables(ci_files)

    # The first combined row is used as the header, the rest as data rows.
    combined_table = Table(header=combined_rows[0], rows=combined_rows[1:])
    combined_ci_table = Table(header=combined_ci_rows[0],
                              rows=combined_ci_rows[1:])

    # Clean up every temporary file created or expected above.
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)

    return combined_table, combined_ci_table
def main():
    """Predict traits for the tree's tips using parallel predict_traits.py jobs.

    Tips are taken from the tree given by opts.tree (minus any already listed
    in opts.already_calculated) and split into batches of 1000. One
    scripts/predict_traits.py command per batch is written to a temporary
    jobs file under 'jobs/', which is submitted through the launcher script
    selected by opts.parallel_method. After wait_for_output_files() returns,
    the per-batch outputs are combined and written to opts.output_trait_table
    (plus '<base>_<type>.tab' or '.biom' files for variances and confidence
    intervals when opts.reconstruction_confidence is set), and the temporary
    files are removed.

    Raises RuntimeError if opts.parallel_method is not 'sge', 'multithreaded'
    or 'torque'.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    launcher_scripts = {
        'sge': 'start_parallel_jobs_sge.py',
        'multithreaded': 'start_parallel_jobs.py',
        'torque': 'start_parallel_jobs_torque.py',
    }
    if opts.parallel_method not in launcher_scripts:
        raise RuntimeError(
            "Unknown parallel method {0!r}; expected one of: {1}".format(
                opts.parallel_method, ', '.join(sorted(launcher_scripts))))

    scripts_dir = join(get_picrust_project_dir(), 'scripts')
    script_fp = join(scripts_dir, 'predict_traits.py')
    cluster_jobs_fp = join(scripts_dir, launcher_scripts[opts.parallel_method])

    # Progress messages are printed as a single parenthesised string so the
    # same text is produced by the Python 2 print statement and the Python 3
    # print function.
    if opts.verbose:
        print("Loading tree...")

    tree = load_picrust_tree(opts.tree, opts.verbose)
    all_tips = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print("Total number of possible tips to predict: {0}".format(
            len(all_tips)))

    created_tmp_files = []
    # Maps prediction type -> list of per-batch temporary output files.
    output_files = {'counts': []}
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print("After taking into account tips already predicted, the number of tips left to predict is: {0}".format(
                len(all_tips)))

    # The jobs file holds one command per line and is passed to the
    # parallel launcher script.
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    created_tmp_files.append(jobs_fp)

    if opts.verbose:
        print("{0} {1}".format("Creating temporary input files in: ",
                               tmp_dir))

    num_tips_per_job = 1000
    # The context manager guarantees the jobs file is closed (and flushed)
    # before it is submitted, even if building a command fails.
    with open(jobs_fp, 'w') as jobs:
        for start in range(0, len(all_tips), num_tips_per_job):
            tips_to_predict = all_tips[start:start + num_tips_per_job]

            tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='out_predict_traits_')
            output_files['counts'].append(tmp_output_fp)
            tip_to_predict_str = ','.join(tips_to_predict)

            if opts.reconstruction_confidence:
                # Extra per-batch files expected next to the counts file.
                tmp_output_base = splitext(tmp_output_fp)[0]
                output_files['variances'].append(
                    tmp_output_base + "_variances.tab")
                output_files['upper_CI'].append(
                    tmp_output_base + "_upper_CI.tab")
                output_files['lower_CI'].append(
                    tmp_output_base + "_lower_CI.tab")

                cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                    script_fp, opts.observed_trait_table, opts.tree,
                    opts.reconstructed_trait_table,
                    opts.reconstruction_confidence, tip_to_predict_str,
                    tmp_output_fp)
            else:
                cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                    script_fp, opts.observed_trait_table, opts.tree,
                    opts.reconstructed_trait_table, tip_to_predict_str,
                    tmp_output_fp)

            # NOTE: Calculating NSTI this way is convenient, but would
            # probably be faster if we ran the NSTI calculation separately
            # (using --output_accuracy_metrics_only) and added it to the
            # output file later on.
            if opts.calculate_accuracy_metrics:
                cmd = cmd + " -a"

            jobs.write(cmd + "\n")

    # Every per-batch output is temporary and gets deleted at the end.
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])

    if opts.verbose:
        print("Launching parallel jobs.")

    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix,
                num_jobs=opts.num_jobs, delay=opts.delay)

    if opts.verbose:
        print("Jobs are now running. Will wait until finished.")

    # NOTE(review): only the 'counts' files are waited on; confirm the
    # variance/CI files are complete by the time this returns.
    wait_for_output_files(output_files['counts'])

    if opts.verbose:
        print("Jobs are done running.")

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base = splitext(opts.output_trait_table)[0]

    for predict_type in sorted(output_files):
        if opts.verbose:
            print("Combining all output files for " + predict_type)

        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print("Writing combined file for " + predict_type)

        # Counts go to the user-supplied path; every other prediction type
        # gets a sibling file named after its type and format.
        if predict_type == 'counts':
            out_fp = opts.output_trait_table
        elif opts.output_precalc_file_in_biom:
            out_fp = outfile_base + "_" + predict_type + ".biom"
        else:
            out_fp = outfile_base + "_" + predict_type + ".tab"

        if opts.output_precalc_file_in_biom:
            content = format_biom_table(
                convert_precalc_to_biom(combined_predictions))
        else:
            content = combined_predictions

        # Explicitly close each output instead of relying on garbage
        # collection to flush an anonymous file object.
        with open(out_fp, 'w') as out_f:
            out_f.write(content)

    # Clean up every temporary file created or expected above.
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)
def main():
    """Predict traits for the tree's tips using parallel predict_traits.py jobs.

    Tips are taken from the tree given by opts.tree (minus any already listed
    in opts.already_calculated) and split into batches of 1000. One
    scripts/predict_traits.py command per batch is written to a temporary
    jobs file under 'jobs/', which is submitted through the launcher script
    selected by opts.parallel_method. After wait_for_output_files() returns,
    the per-batch outputs are combined and written to opts.output_trait_table
    (plus '<base>_<type>.tab' or '.biom' files for variances and confidence
    intervals when opts.reconstruction_confidence is set), and the temporary
    files are removed.

    Raises RuntimeError if opts.parallel_method is not 'sge', 'multithreaded'
    or 'torque'.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    launcher_scripts = {
        'sge': 'start_parallel_jobs_sge.py',
        'multithreaded': 'start_parallel_jobs.py',
        'torque': 'start_parallel_jobs_torque.py',
    }
    if opts.parallel_method not in launcher_scripts:
        raise RuntimeError(
            "Unknown parallel method {0!r}; expected one of: {1}".format(
                opts.parallel_method, ', '.join(sorted(launcher_scripts))))

    scripts_dir = join(get_picrust_project_dir(), 'scripts')
    script_fp = join(scripts_dir, 'predict_traits.py')
    cluster_jobs_fp = join(scripts_dir, launcher_scripts[opts.parallel_method])

    # Progress messages are printed as a single parenthesised string so the
    # same text is produced by the Python 2 print statement and the Python 3
    # print function.
    if opts.verbose:
        print("Loading tree...")

    tree = load_picrust_tree(opts.tree, opts.verbose)
    all_tips = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print("Total number of possible tips to predict: {0}".format(
            len(all_tips)))

    created_tmp_files = []
    # Maps prediction type -> list of per-batch temporary output files.
    output_files = {'counts': []}
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print("After taking into account tips already predicted, the number of tips left to predict is: {0}".format(
                len(all_tips)))

    # The jobs file holds one command per line and is passed to the
    # parallel launcher script.
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    created_tmp_files.append(jobs_fp)

    if opts.verbose:
        print("{0} {1}".format("Creating temporary input files in: ",
                               tmp_dir))

    num_tips_per_job = 1000
    # The context manager guarantees the jobs file is closed (and flushed)
    # before it is submitted, even if building a command fails.
    with open(jobs_fp, 'w') as jobs:
        for start in range(0, len(all_tips), num_tips_per_job):
            tips_to_predict = all_tips[start:start + num_tips_per_job]

            tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='out_predict_traits_')
            output_files['counts'].append(tmp_output_fp)
            tip_to_predict_str = ','.join(tips_to_predict)

            if opts.reconstruction_confidence:
                # Extra per-batch files expected next to the counts file.
                tmp_output_base = splitext(tmp_output_fp)[0]
                output_files['variances'].append(
                    tmp_output_base + "_variances.tab")
                output_files['upper_CI'].append(
                    tmp_output_base + "_upper_CI.tab")
                output_files['lower_CI'].append(
                    tmp_output_base + "_lower_CI.tab")

                cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                    script_fp, opts.observed_trait_table, opts.tree,
                    opts.reconstructed_trait_table,
                    opts.reconstruction_confidence, tip_to_predict_str,
                    tmp_output_fp)
            else:
                cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                    script_fp, opts.observed_trait_table, opts.tree,
                    opts.reconstructed_trait_table, tip_to_predict_str,
                    tmp_output_fp)

            # NOTE: Calculating NSTI this way is convenient, but would
            # probably be faster if we ran the NSTI calculation separately
            # (using --output_accuracy_metrics_only) and added it to the
            # output file later on.
            if opts.calculate_accuracy_metrics:
                cmd = cmd + " -a"

            jobs.write(cmd + "\n")

    # Every per-batch output is temporary and gets deleted at the end.
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])

    if opts.verbose:
        print("Launching parallel jobs.")

    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix,
                num_jobs=opts.num_jobs, delay=opts.delay)

    if opts.verbose:
        print("Jobs are now running. Will wait until finished.")

    # NOTE(review): only the 'counts' files are waited on; confirm the
    # variance/CI files are complete by the time this returns.
    wait_for_output_files(output_files['counts'])

    if opts.verbose:
        print("Jobs are done running.")

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base = splitext(opts.output_trait_table)[0]

    for predict_type in sorted(output_files):
        if opts.verbose:
            print("Combining all output files for " + predict_type)

        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print("Writing combined file for " + predict_type)

        # Counts go to the user-supplied path; every other prediction type
        # gets a sibling file named after its type and format.
        if predict_type == 'counts':
            out_fp = opts.output_trait_table
        elif opts.output_precalc_file_in_biom:
            out_fp = outfile_base + "_" + predict_type + ".biom"
        else:
            out_fp = outfile_base + "_" + predict_type + ".tab"

        if opts.output_precalc_file_in_biom:
            content = format_biom_table(
                convert_precalc_to_biom(combined_predictions))
        else:
            content = combined_predictions

        # Explicitly close each output instead of relying on garbage
        # collection to flush an anonymous file object.
        with open(out_fp, 'w') as out_f:
            out_f.write(content)

    # Clean up every temporary file created or expected above.
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)
def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',
                        tmp_dir='jobs/', num_jobs=100, verbose=False):
    """Run one ancestral state reconstruction (ASR) job per trait, in parallel.

    The trait table is split into single-trait tables (column 0 plus one
    trait column each). For every trait a command line invoking
    scripts/ancestral_state_reconstruction.py is written to a temporary jobs
    file, which is then submitted through the launcher script selected by
    `parallel_method`. After wait_for_output_files() returns, the per-trait
    outputs are combined and all temporary files are removed.

    tree: value passed to the ASR script's -t option.
    table: filename of a tab-separated trait table with a header row.
    asr_method: value passed to the ASR script's -m option.
    parallel_method: one of 'sge', 'multithreaded' or 'torque'.
    tmp_dir: directory handed to get_tmp_filename() for every temporary file.
    num_jobs: forwarded to submit_jobs().
    verbose: if True, print progress messages.

    Returns a (combined_table, combined_ci_table) tuple of Table objects.

    Raises RuntimeError if `parallel_method` is not one of the values above.
    """
    launcher_scripts = {
        'sge': 'start_parallel_jobs_sge.py',
        'multithreaded': 'start_parallel_jobs.py',
        'torque': 'start_parallel_jobs_torque.py',
    }
    if parallel_method not in launcher_scripts:
        raise RuntimeError(
            "Unknown parallel_method {0!r}; expected one of: {1}".format(
                parallel_method, ', '.join(sorted(launcher_scripts))))

    scripts_dir = join(get_picrust_project_dir(), 'scripts')
    asr_script_fp = join(scripts_dir, 'ancestral_state_reconstruction.py')
    cluster_jobs_fp = join(scripts_dir, launcher_scripts[parallel_method])

    # Progress messages are printed as a single parenthesised string so the
    # same text is produced by the Python 2 print statement and the Python 3
    # print function.
    if verbose:
        print("Loading trait table...")

    table = LoadTable(filename=table, header=True, sep='\t')
    num_columns = table.Shape[1]

    created_tmp_files = []
    output_files = []
    ci_files = []

    # The jobs file holds one command per line and is passed to the
    # parallel launcher script.
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_asr_')
    created_tmp_files.append(jobs_fp)

    if verbose:
        print("{0} {1}".format("Creating temporary input files in: ",
                               tmp_dir))

    # The context manager guarantees the jobs file is closed (and flushed)
    # even if splitting or writing one of the trait tables fails.
    with open(jobs_fp, 'w') as jobs:
        # Column 0 is kept in every single-trait table; traits start at 1.
        for col_index in range(1, num_columns):
            single_col_table = table.getColumns([0, col_index])

            single_col_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='in_asr_')
            single_col_table.writeToFile(single_col_fp, sep='\t')
            created_tmp_files.append(single_col_fp)

            tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='out_asr_')
            output_files.append(tmp_output_fp)
            tmp_ci_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_asr_ci_')
            ci_files.append(tmp_ci_fp)

            cmd = "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(
                asr_script_fp, single_col_fp, tree, asr_method,
                tmp_output_fp, tmp_ci_fp)
            jobs.write(cmd + "\n")

    created_tmp_files.extend(output_files)
    created_tmp_files.extend(ci_files)

    if verbose:
        print("Launching parallel jobs.")

    job_prefix = 'asr'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=num_jobs)

    if verbose:
        print("Jobs are now running. Will wait until finished.")

    # NOTE(review): only the reconstruction outputs are waited on, not the
    # CI files; confirm each CI file is complete by the time this returns.
    wait_for_output_files(output_files)

    if verbose:
        print("Jobs are done running. Now combining all tmp files.")

    combined_rows = combine_asr_tables(output_files)
    combined_ci_rows = combine_asr_tables(ci_files)

    # The first combined row is used as the header, the rest as data rows.
    combined_table = Table(header=combined_rows[0], rows=combined_rows[1:])
    combined_ci_table = Table(header=combined_ci_rows[0],
                              rows=combined_ci_rows[1:])

    # Clean up every temporary file created or expected above.
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)

    return combined_table, combined_ci_table