def test_get_picrust_project_dir(self):
    """get_picrust_project_dir returns the directory two levels above picrust/util.py"""

    def fold_case(fp, ignore_case):
        # Lowercase the path only when the file system ignores case.
        return fp.lower() if ignore_case else fp

    # Explicitly probe whether the file system holding this test file is
    # case insensitive: on such a system both an upper-cased and a
    # lower-cased spelling of the path resolve to an existing file.
    ignore_case = exists(__file__.upper()) and exists(__file__.lower())

    observed = get_picrust_project_dir()

    # The expectation is anchored on the imported location of
    # picrust/util.py so the test holds both when PICRUSt is on the
    # PYTHONPATH and when it was installed with setup.py. If util.py moves,
    # this test fails on purpose: get_picrust_project_dir() would need to
    # be updated as well.
    import picrust.util
    util_location = abspath(picrust.util.__file__)
    project_root = dirname(dirname(util_location))

    self.assertEqual(fold_case(observed, ignore_case),
                     fold_case(project_root, ignore_case))
def test_get_picrust_project_dir(self):
    """get_picrust_project_dir functions as expected"""
    # A case-insensitive file system is detected by checking that both the
    # upper-cased and the lower-cased spelling of this file's path exist.
    this_file = __file__
    fs_ignores_case = exists(this_file.upper()) and exists(this_file.lower())

    observed = get_picrust_project_dir()

    # Derive the expected directory from where picrust/util.py was imported
    # from; this covers both a PYTHONPATH checkout and a setup.py install.
    # Should util.py ever move, this test is meant to fail so that
    # get_picrust_project_dir() gets revisited.
    import picrust.util
    util_location = abspath(abspath(picrust.util.__file__))
    project_root = dirname(dirname(util_location))

    if fs_ignores_case:
        # compare case-folded paths when case carries no meaning
        observed, project_root = observed.lower(), project_root.lower()

    self.assertEqual(observed, project_root)
class Count(CommandLineApplication):
    """Application controller for Count, an ASR tool."""

    # Count.jar is looked up inside the project's support files (an earlier
    # revision read the location from the COUNT_JAR environment variable).
    count_fp = join(get_picrust_project_dir(),
                    'picrust', 'support_files', 'jar', 'Count.jar')

    # The jar is run through java, so the "command" is the whole invocation.
    _command = ' '.join([
        'java -Xmx1024M -cp',
        count_fp,
        'ca.umontreal.iro.evolution.genecontent.AsymmetricWagner',
    ])

    _parameters = {
        '-gain': ValuedParameter(Prefix='-', Name='gain', Delimiter=' '),
        '-max_paralogs': ValuedParameter(Prefix='-', Name='max_paralogs',
                                         Delimiter=' '),
    }

    _input_handler = '_input_as_paths'
    _suppress_stdout = False
    _suppress_stderr = False

    def _error_on_missing_application(self, params):
        """Do nothing: the command is a java invocation, not an executable,
        so this method is overridden to skip the check."""
class Ace(CommandLineApplication):
    """Application controller for the 'ace' function within the 'ape' R package."""

    # Location of the R script that wraps ape's ace function.
    ace_script_fp = join(get_picrust_project_dir(),
                         'picrust', 'support_files', 'R', 'ace.R')
    _command = ace_script_fp
    _input_handler = '_input_as_string'
    _suppress_stdout = False
    _suppress_stderr = False

    # Overridden to call the script with R rather than directly - this is
    # useful because permissions on the script are set to 644 when PICRUSt is
    # installed with setup.py. This is fine if we're executing it with R, but
    # not if we're trying to execute it directly.
    def _get_base_command(self):
        """Return the full command string used to run ace.R through R.

        The command has the form
        ``cd <WorkingDir>; R -f <script> --args <parameters>``, with the
        pieces joined by ``self._command_delimiter``. Parameters whose
        string form is empty are left out.

        Raises ApplicationError if ``_command`` has not been set.
        """
        # Prepend a change of directory so the command runs in
        # self.WorkingDir.
        # NOTE(review): no quoting is added here; presumably
        # str(self.WorkingDir) already yields a quoted path -- confirm.
        cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])

        if self._command is None:
            raise ApplicationError('_command has not been set.')

        parameter_str = self._command_delimiter.join(
            filter(None, map(str, self.Parameters.values())))

        command_parts = [cd_command, "R", "-f", self._command,
                         "--args", parameter_str]
        return self._command_delimiter.join(command_parts).strip()

    BaseCommand = property(_get_base_command)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading OTU table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ids_to_load = otu_table.ObservationIds if opts.verbose: print "Done loading OTU table containing %i samples and %i OTUs." \ %(len(otu_table.SampleIds),len(otu_table.ObservationIds)) #Hardcoded loaction of the precalculated datasets for PICRUSt, #relative to the project directory precalc_data_dir=join(get_picrust_project_dir(),'picrust','data') # Load a table of gene counts by OTUs. #This can be either user-specified or precalculated genome_table_fp = determine_data_table_fp(precalc_data_dir,\ opts.type_of_prediction,opts.gg_version,\ user_specified_table=opts.input_count_table,verbose=opts.verbose) if opts.verbose: print "Loading gene count data from file: %s" %genome_table_fp genome_table= load_data_table(genome_table_fp,\ load_data_table_in_biom=opts.load_precalc_file_in_biom,\ suppress_subset_loading=opts.suppress_subset_loading,\ ids_to_load=ids_to_load,verbose=opts.verbose,transpose=True) if opts.verbose: print "Loaded %i genes across %i OTUs from gene count table" \ %(len(genome_table.ObservationIds),len(genome_table.SampleIds)) if opts.with_confidence: if opts.input_variance_table: variance_table_fp = opts.input_variance_table else: variance_table_fp = determine_data_table_fp(precalc_data_dir,\ opts.type_of_prediction,opts.gg_version,\ precalc_file_suffix='precalculated_variances.tab.gz',\ user_specified_table=opts.input_count_table) if opts.verbose: print "Loading variance information from table: %s" \ %variance_table_fp variance_table= load_data_table(variance_table_fp,\ load_data_table_in_biom=opts.load_precalc_file_in_biom,\ suppress_subset_loading=opts.suppress_subset_loading,\ ids_to_load=ids_to_load,transpose=True) if opts.verbose: print "Loaded %i genes across %i OTUs from variance table" \ 
%(len(variance_table.ObservationIds),len(variance_table.SampleIds)) #Raise an error if the genome table and variance table differ #in the genomes they contain. #better to find out now than have something obscure happen latter on if opts.verbose: print "Checking that genome table and variance table are consistent" try: assert set(variance_table.ObservationIds) == set(genome_table.ObservationIds) except AssertionError,e: for var_id in variance_table.ObservationIds: if var_id not in genome_table.ObservationIds: print "Variance table ObsId %s not in genome_table ObsIds" %var_id raise AssertionError("Variance table and genome table contain different gene ids") try: assert set(variance_table.SampleIds) == set(genome_table.SampleIds) except AssertionError,e: for var_id in variance_table.SampleIds: if var_id not in genome_table.SampleIds: print "Variance table SampleId %s not in genome_table SampleIds" %var_id raise AssertionError("Variance table and genome table contain different OTU ids")
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.limit_to_function: limit_to_functions = opts.limit_to_function.split(',') if opts.verbose: print "Limiting output to only functions:",limit_to_functions else: limit_to_functions = [] if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = load_table(opts.input_otu_table) ids_to_load = otu_table.ids(axis='observation') if(opts.input_count_table is None): #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz) precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz']) input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name) else: input_count_table=opts.input_count_table if opts.verbose: print "Loading trait table: ", input_count_table ext=path.splitext(input_count_table)[1] if opts.verbose: print "Loading count table: ", input_count_table if (ext == '.gz'): genome_table_fh = gzip.open(input_count_table,'rb') else: genome_table_fh = open(input_count_table,'U') #In the genome/trait table genomes are the samples and #genes are the observations if opts.load_precalc_file_in_biom: if not opts.suppress_subset_loading: #Now we want to use the OTU table information #to load only rows in the count table corresponding #to relevant OTUs if opts.verbose: print "Loading traits for %i organisms from the trait table" %len(ids_to_load) genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples') else: if opts.verbose: print "Loading *full* count table because --suppress_subset_loading was passed. 
This may result in high memory usage" genome_table = load_table(genome_table_fh) else: genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load) ok_functional_categories = None metadata_type = None if opts.limit_to_functional_categories: ok_functional_categories = opts.limit_to_functional_categories.split("|") if opts.verbose: print "Limiting to functional categories: %s" %(str(ok_functional_categories)) # Either KEGG_Pathways or COG_Category needs # to be assigned to metadata_key to limit to # functional categories (not needed for # individual functions) if opts.type_of_prediction == "ko": metadata_type = "KEGG_Pathways" elif opts.type_of_prediction == "cog": metadata_type = "COG_Category" elif opts.type_of_prediction == "rfam": exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)") partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions,\ limit_to_functional_categories = ok_functional_categories , metadata_key = metadata_type ) output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes]) if opts.verbose: print "Writing results to output file: ",opts.output_fp make_output_dir_for_file(opts.output_fp) open(opts.output_fp,'w').write(output_text)
import sys

# Script metadata handed to parse_command_line_parameters(**script_info) in
# main(): descriptions, usage examples and the command-line options.
script_info = {}
script_info['brief_description'] = "Normalize an OTU table by marker gene copy number"
script_info['script_description'] = ""
# NOTE(review): the second usage example writes to
# "predicted_metagenomes.biom", which looks copied from another script --
# confirm whether it should read normalized_otus.biom.
script_info['script_usage'] = [
    ("","Normalize the counts in raw_otus.biom. Write the resulting table to normalized_otus.biom.","%prog -i raw_otus.biom -o normalized_otus.biom"),
    ("","Input tab-delimited OTU table:","%prog -f -i raw_otus.tab -o predicted_metagenomes.biom")
    ]
script_info['output_description']= "A normalized OTU table"
script_info['required_options'] = [
    make_option('-i','--input_otu_fp',type="existing_filepath",help='the input otu table filepath in biom format'),
    make_option('-o','--output_otu_fp',type="new_filepath",help='the output otu table filepath in biom format'),
]
# NOTE(review): "--metadata_identifer" is misspelled, but it is part of the
# command-line interface (and of the opts attribute name), so renaming it
# would break existing callers.
script_info['optional_options'] = [
    make_option('-c','--input_count_fp',default=join(get_picrust_project_dir(),'picrust','data','16S_precalculated.biom.gz'),type="existing_filepath",help='the input marker gene counts on per otu basis in biom format (can be gzipped) [default: %default]'),
    make_option('--metadata_identifer', default='CopyNumber', help='identifier for copy number entry as observation metadata [default: %default]'),
    make_option('-f','--input_format_classic', action="store_true", default=False, help='input otu table (--input_otu_fp) is in classic Qiime format [default: %default]'),
]
script_info['version'] = __version__


def main():
    """Parse the command line and, for classic-format input, load the OTU table.

    NOTE(review): as visible here the function stops after the classic-format
    branch; nothing is loaded for BIOM input and nothing is written.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # NOTE(review): input_ext is computed but not used in the visible code.
    input_ext=path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        # Classic (tab-delimited) table parsed into a DenseOTUTable; the three
        # None arguments are passed positionally to the parser.
        otu_table=parse_classic_table_to_rich_table(open(opts.input_otu_fp,'U'),None,None,None,DenseOTUTable)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading OTU table: ", opts.input_otu_table otu_table = load_table(opts.input_otu_table) ids_to_load = otu_table.ids(axis='observation').tolist() # Determine whether user wants predictions round to nearest whole # number or not. if opts.no_round: round_flag = False else: round_flag = True if opts.verbose: print "Done loading OTU table containing %i samples and %i OTUs." \ %(len(otu_table.ids()),len(otu_table.ids(axis='observation'))) #Hardcoded loaction of the precalculated datasets for PICRUSt, #relative to the project directory precalc_data_dir = join(get_picrust_project_dir(), 'picrust', 'data') # Load a table of gene counts by OTUs. #This can be either user-specified or precalculated genome_table_fp = determine_data_table_fp(precalc_data_dir,\ opts.type_of_prediction,opts.gg_version,\ user_specified_table=opts.input_count_table,verbose=opts.verbose) if opts.verbose: print "Loading gene count data from file: %s" % genome_table_fp genome_table= load_data_table(genome_table_fp,\ load_data_table_in_biom=opts.load_precalc_file_in_biom,\ suppress_subset_loading=opts.suppress_subset_loading,\ ids_to_load=ids_to_load,verbose=opts.verbose,transpose=True) if opts.verbose: print "Loaded %i genes across %i OTUs from gene count table" \ %(len(genome_table.ids(axis='observation')),len(genome_table.ids())) if opts.with_confidence: if opts.input_variance_table: variance_table_fp = opts.input_variance_table else: variance_table_fp = determine_data_table_fp(precalc_data_dir,\ opts.type_of_prediction,opts.gg_version,\ precalc_file_suffix='precalculated_variances.tab.gz',\ user_specified_table=opts.input_count_table) if opts.verbose: print "Loading variance information from table: %s" \ %variance_table_fp variance_table= load_data_table(variance_table_fp,\ load_data_table_in_biom=opts.load_precalc_file_in_biom,\ suppress_subset_loading=opts.suppress_subset_loading,\ 
ids_to_load=ids_to_load,transpose=True) if opts.verbose: print "Loaded %i genes across %i OTUs from variance table" \ %(len(variance_table.ids(axis='observation')),len(variance_table.ids())) #Raise an error if the genome table and variance table differ #in the genomes they contain. #better to find out now than have something obscure happen latter on if opts.verbose: print "Checking that genome table and variance table are consistent" try: assert set(variance_table.ids(axis='observation')) == set( genome_table.ids(axis='observation')) except AssertionError, e: for var_id in variance_table.ids(axis='observation'): if var_id not in genome_table.ids(axis='observation'): print "Variance table ObsId %s not in genome_table ObsIds" % var_id raise AssertionError( "Variance table and genome table contain different gene ids") try: assert set(variance_table.ids()) == set(genome_table.ids()) except AssertionError, e: for var_id in variance_table.ids(): if var_id not in genome_table.ids(): print "Variance table SampleId %s not in genome_table SampleIds" % var_id raise AssertionError( "Variance table and genome table contain different OTU ids")
def main():
    """Write one shell command line per test dataset to a jobs file and
    submit that file to the selected parallel job launcher.

    Test datasets are discovered from the 'exp_traits--*' files in the input
    directory. For each dataset the job line runs ancestral state
    reconstruction followed by predict_traits (or predict_traits alone for
    the 'nearest_neighbor' method). Datasets whose prediction output already
    exists are skipped unless --force is given.

    Raises ValueError for unsupported option combinations and RuntimeError
    for an unrecognized parallel method.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir=opts.input_dir
    output_dir=opts.output_dir or input_dir
    tmp_dir=opts.tmp_dir or output_dir
    parallel_method=opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method

    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError('You probably dont want to run multithreaded evaluations with a large num_jobs. Please adjust options num_jobs and or parallel_method')

    if opts.with_confidence and asr_method not in ['ace_ml','ace_reml']:
        raise ValueError("PICRUST currently only supports confidence intervals with the ace_ml and ace_reml ASR methods")

    if opts.verbose:
        print "Reconstruction method:",asr_method
        print "Prediction method:",predict_traits_method
        print "Parallel method:",parallel_method
        print "num_jobs:",opts.num_jobs
        print "\nOutput will be saved here:'%s'" %output_dir

    #create the output directory unless it already exists
    make_output_dir(output_dir)

    # Pick the launcher script that matches the requested parallel method.
    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    #get the test datasets to run in the input directory (based on exp_traits files)
    expect_test_files=glob(join(input_dir,'exp_traits--*'))

    test_datasets={}
    for file_name in expect_test_files:
        # The test id is whatever follows the 'exp_traits--' prefix.
        test_id=file_name.replace(join(input_dir,'exp_traits--'),'',1)
        #create a dict with the test files as values in the ref list
        # (order: test trait table, test tree, expected traits)
        test_datasets[test_id]=[ join(input_dir,'test_trait_table--'+test_id),join(input_dir,'test_tree--'+test_id),join(input_dir,'exp_traits--'+test_id)]

    # NOTE(review): created_tmp_files and output_files are filled in below but
    # never read in the visible code.
    created_tmp_files=[]
    output_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    #get location of scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    #run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp=join(output_dir,'asr--'+asr_method+'--'+test_id)
        asr_params_out_fp=join(output_dir,'--'.join(['asr',asr_method,'asr_params',test_id]))
        created_tmp_files.append(asr_out_fp)

        if opts.check_for_null_files and exists(asr_out_fp) and file_contains_nulls(asr_out_fp):
            #remove file
            if opts.verbose:
                print "Existing ASR file contains null characters. Will run ASR again after removing: "+asr_out_fp
            remove(asr_out_fp)

        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(asr_out_fp)
            # Keep a placeholder command so the job line below stays well formed.
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" %(test_id,asr_out_fp)
        else:
            #create the asr command
            asr_cmd= """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(asr_script_fp, test_datasets[test_id][0], test_datasets[test_id][1], asr_method, asr_out_fp, asr_params_out_fp)

        predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
          opts.weighting_method,test_id]))

        if opts.with_accuracy:
            predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
              opts.weighting_method,'accuracy_metrics',test_id]))

        if opts.check_for_null_files and exists(predict_traits_out_fp) and file_contains_nulls(predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. Will run it again after removing: "+predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(predict_traits_out_fp)
            continue

        output_files.append(predict_traits_out_fp)

        # The genome id is the third '--'-separated field of the test id.
        genome_id=split('--',test_id)[2]

        if predict_traits_method == 'nearest_neighbor':
            #don't do asr step
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(predict_traits_script_fp, test_datasets[test_id][0], opts.ref_tree, genome_id, predict_traits_out_fp,predict_traits_method)
            jobs.write(predict_traits_cmd+"\n")
        else:
            #create the predict traits command
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\
              test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method)

            #Instruct predict_traits to use confidence intervals output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' %(asr_params_out_fp)
                predict_traits_cmd = predict_traits_cmd + confidence_param

            #Instruct predict traits to output the NTSI measure of distance to
            #nearby sequences.
            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' %(predict_traits_accuracy_out_fp)
                predict_traits_cmd = predict_traits_cmd + accuracy_param

            #add job command to the the jobs file
            # (ASR and prediction are chained on one line with ';')
            jobs.write(asr_cmd+';'+predict_traits_cmd+"\n")

    jobs.close()

    #created_tmp_files.extend(output_files)

    #submit the jobs
    job_prefix='eval_'

    if opts.verbose:
        print "Submitting jobs:",cluster_jobs_fp,jobs_fp,job_prefix,opts.num_jobs
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) tmp_dir = 'jobs/' make_output_dir(tmp_dir) #Run the jobs script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py') if (opts.parallel_method == 'sge'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs_sge.py') elif (opts.parallel_method == 'multithreaded'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs.py') elif (opts.parallel_method == 'torque'): cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts', 'start_parallel_jobs_torque.py') else: raise RuntimeError if (opts.verbose): print "Loading tree..." tree = load_picrust_tree(opts.tree, opts.verbose) all_tips = [tip.Name for tip in tree.tips()] if (opts.verbose): print "Total number of possible tips to predict: {0}".format( len(all_tips)) created_tmp_files = [] output_files = {} output_files['counts'] = [] if opts.reconstruction_confidence: output_files['variances'] = [] output_files['upper_CI'] = [] output_files['lower_CI'] = [] if opts.already_calculated: all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated) if opts.verbose: print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format( len(all_tips)) #create a tmp file to store the job commands (which we will pass to our parallel script to run) jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_') jobs = open(jobs_fp, 'w') created_tmp_files.append(jobs_fp) if (opts.verbose): print "Creating temporary input files in: ", tmp_dir num_tips_per_job = 1000 for tips_to_predict in [ all_tips[i:i + num_tips_per_job] for i in range(0, len(all_tips), num_tips_per_job) ]: #create tmp output files tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='out_predict_traits_') output_files['counts'].append(tmp_output_fp) tip_to_predict_str = ','.join(list(tips_to_predict)) if opts.reconstruction_confidence: outfile_base, extension = 
splitext(tmp_output_fp) output_files['variances'].append(outfile_base + "_variances.tab") output_files['upper_CI'].append(outfile_base + "_upper_CI.tab") output_files['lower_CI'].append(outfile_base + "_lower_CI.tab") #create the job command cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format( script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, opts.reconstruction_confidence, tip_to_predict_str, tmp_output_fp) else: cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format( script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, tip_to_predict_str, tmp_output_fp) #NOTE: Calculating NSTI this way is convenient, #but would probably be faster if we ran the NSTI calculation separate (using the --output_accuracy_metrics_only) and added it to the output file later on. if opts.calculate_accuracy_metrics: cmd = cmd + " -a" #add job command to the the jobs file jobs.write(cmd + "\n") jobs.close() #add all output files to tmp list (used later for deletion) for predict_type in output_files: created_tmp_files.extend(output_files[predict_type]) if (opts.verbose): print "Launching parallel jobs." #run the job command job_prefix = 'picrust' submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs, delay=opts.delay) if (opts.verbose): print "Jobs are now running. Will wait until finished." #wait until all jobs finished (e.g. simple poller) wait_for_output_files(output_files['counts']) if (opts.verbose): print "Jobs are done running." 
make_output_dir_for_file(opts.output_trait_table) outfile_base, extension = splitext(opts.output_trait_table) for predict_type in sorted(output_files): #Combine output files if opts.verbose: print "Combining all output files for " + predict_type combined_predictions = combine_predict_trait_output( output_files[predict_type]) if opts.verbose: print "Writing combined file for " + predict_type if predict_type == 'counts': #Output in whatever format the user wants if opts.output_precalc_file_in_biom: open(opts.output_trait_table, 'w').write( format_biom_table( convert_precalc_to_biom(combined_predictions))) else: open(opts.output_trait_table, 'w').write(combined_predictions) else: if opts.output_precalc_file_in_biom: open(outfile_base + "_" + predict_type + ".biom", 'w').write( format_biom_table( convert_precalc_to_biom(combined_predictions))) else: open(outfile_base + "_" + predict_type + ".tab", 'w').write(combined_predictions) #clean up all tmp files for file in created_tmp_files: remove(file)
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) otu_table = load_table(opts.input_otu_fp) ids_to_load = otu_table.ids(axis="observation") if opts.input_count_fp is None: # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz) precalc_file_name = "_".join(["16S", opts.gg_version, "precalculated.tab.gz"]) input_count_table = join(get_picrust_project_dir(), "picrust", "data", precalc_file_name) else: input_count_table = opts.input_count_fp if opts.verbose: print "Loading trait table: ", input_count_table ext = path.splitext(input_count_table)[1] if ext == ".gz": count_table_fh = gzip.open(input_count_table, "rb") else: count_table_fh = open(input_count_table, "U") if opts.load_precalc_file_in_biom: count_table = load_table(count_table_fh) else: count_table = convert_precalc_to_biom(count_table_fh, ids_to_load) # Need to only keep data relevant to our otu list ids = [] for x in otu_table.iter(axis="observation"): ids.append(str(x[1])) ob_id = count_table.ids(axis="observation")[0] filtered_otus = [] filtered_values = [] for x in ids: if count_table.exists(x, axis="sample"): filtered_otus.append(x) filtered_values.append(otu_table.data(x, axis="observation")) filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids()) copy_numbers_filtered = {} for x in filtered_otus: value = count_table.get_value_by_ids(ob_id, x) try: # data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError, "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." 
copy_numbers_filtered[x] = {opts.metadata_identifer: value} filtered_otu_table.add_metadata(copy_numbers_filtered, axis="observation") def metadata_norm(v, i, md): return v / float(md[opts.metadata_identifer]) normalized_table = filtered_otu_table.transform(metadata_norm, axis="observation") # move Observation Metadata from original to filtered OTU table normalized_table = transfer_observation_metadata(otu_table, normalized_table, "observation") make_output_dir_for_file(opts.output_otu_fp) write_biom_table(normalized_table, opts.output_otu_fp)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) input_ext=path.splitext(opts.input_otu_fp)[1] if opts.input_format_classic: otu_table=parse_classic_table_to_rich_table(open(opts.input_otu_fp,'U'),None,None,None,DenseOTUTable) else: try: otu_table = parse_biom_table(open(opts.input_otu_fp,'U')) except ValueError: raise ValueError("Error loading OTU table! If not in BIOM format use '-f' option.\n") ids_to_load = otu_table.ObservationIds if(opts.input_count_fp is None): #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz) precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz']) input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name) else: input_count_table=opts.input_count_fp if opts.verbose: print "Loading trait table: ", input_count_table ext=path.splitext(input_count_table)[1] if (ext == '.gz'): count_table_fh = gzip.open(input_count_table,'rb') else: count_table_fh = open(input_count_table,'U') if opts.load_precalc_file_in_biom: count_table = parse_biom_table(count_table_fh.read()) else: count_table = convert_precalc_to_biom(count_table_fh,ids_to_load) #Need to only keep data relevant to our otu list ids=[] for x in otu_table.iterObservations(): ids.append(str(x[1])) ob_id=count_table.ObservationIds[0] filtered_otus=[] filtered_values=[] for x in ids: if count_table.sampleExists(x): filtered_otus.append(x) filtered_values.append(otu_table.observationData(x)) #filtered_values = map(list,zip(*filtered_values)) filtered_otu_table=table_factory(filtered_values,otu_table.SampleIds,filtered_otus, constructor=DenseOTUTable) copy_numbers_filtered={} for x in filtered_otus: value = count_table.getValueByIds(ob_id,x) try: #data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError,\ "Invalid type passed as copy number for OTU ID %s. Must be int-able." 
% (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." copy_numbers_filtered[x]={opts.metadata_identifer:value} filtered_otu_table.addObservationMetadata(copy_numbers_filtered) normalized_table = filtered_otu_table.normObservationByMetadata(opts.metadata_identifer) #move Observation Metadata from original to filtered OTU table normalized_table = transfer_observation_metadata(otu_table,normalized_table,'ObservationMetadata') normalized_otu_table = transfer_sample_metadata(otu_table,normalized_table,'SampleMetadata') make_output_dir_for_file(opts.output_otu_fp) open(opts.output_otu_fp,'w').write(format_biom_table(normalized_table))
def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',tmp_dir='jobs/',num_jobs=100, verbose=False): '''Runs the ancestral state reconstructions in parallel''' asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py') if(parallel_method=='sge'): cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py') elif(parallel_method=='multithreaded'): cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py') elif(parallel_method=='torque'): cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py') else: raise RuntimeError if(verbose): print "Loading trait table..." #foreach trait in the table, create a new tmp file with just that trait, and create the job command and add it a tmp jobs file table=LoadTable(filename=table, header=True, sep='\t') #get dimensions of the table dim=table.Shape created_tmp_files=[] output_files=[] ci_files=[] #create a tmp file to store the job commands (which we will pass to our parallel script to run) jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_asr_') jobs=open(jobs_fp,'w') created_tmp_files.append(jobs_fp) if(verbose): print "Creating temporary input files in: ",tmp_dir #iterate over each column for i in range(1,dim[1]): #create a new table with only a single trait single_col_table=table.getColumns([0,i]) #write the new table to a tmp file single_col_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='in_asr_') single_col_table.writeToFile(single_col_fp,sep='\t') created_tmp_files.append(single_col_fp) #create tmp output files tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_') output_files.append(tmp_output_fp) tmp_ci_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_ci_') ci_files.append(tmp_ci_fp) #create the job command cmd= "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(asr_script_fp, single_col_fp, tree, asr_method, tmp_output_fp, tmp_ci_fp) #add job command to the the jobs file 
jobs.write(cmd+"\n") jobs.close() created_tmp_files.extend(output_files) created_tmp_files.extend(ci_files) if(verbose): print "Launching parallel jobs." #run the job command job_prefix='asr' submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=num_jobs) if(verbose): print "Jobs are now running. Will wait until finished." #wait until all jobs finished (e.g. simple poller) wait_for_output_files(output_files) if(verbose): print "Jobs are done running. Now combining all tmp files." #Combine output files combined_table=combine_asr_tables(output_files) combined_ci_table=combine_asr_tables(ci_files) #create a Table object combined_table=Table(header=combined_table[0],rows=combined_table[1:]) combined_ci_table=Table(header=combined_ci_table[0],rows=combined_ci_table[1:]) #clean up all tmp files for file in created_tmp_files: remove(file) #return the combined table return combined_table,combined_ci_table
def main():
    """Predict a metagenome from an OTU table and a genome/trait count table.

    Options come from parse_command_line_parameters(**script_info). The OTU
    table and the (optionally gzipped) count table are loaded, a weighted
    NSTI report is written when opts.accuracy_metrics is set, and the
    predicted metagenome is written to opts.output_metagenome_table in
    BIOM or tab-delimited format.

    Exits through option_parser.error if no count table was given and no
    precalculated table exists for opts.type_of_prediction.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    #NOTE: doubled spaces in some messages below reproduce the separator
    #that the former 'print a, b' statements emitted
    if opts.verbose:
        print("Loading OTU table:  %s" % (opts.input_otu_table,))

    with open(opts.input_otu_table, 'U') as otu_table_fh:
        otu_table = parse_biom_table(otu_table_fh)

    if opts.verbose:
        print("Done loading OTU table containing %i samples and %i OTUs."
              % (len(otu_table.SampleIds), len(otu_table.ObservationIds)))

    if opts.input_count_table is None:
        #fall back to the precalculated table under picrust/data
        precalc_names = {'KO': 'ko_precalculated.biom.gz',
                         'COG': 'cog_precalculated.biom.gz'}
        if opts.type_of_prediction not in precalc_names:
            #without this check input_count_table would be left unbound
            option_parser.error(
                "No precalculated count table is available for "
                "type_of_prediction %r (expected one of: %s); "
                "please supply an input count table."
                % (opts.type_of_prediction,
                   ', '.join(sorted(precalc_names))))
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalc_names[opts.type_of_prediction])
    else:
        input_count_table = opts.input_count_table

    if opts.verbose:
        print("Loading trait table:  %s" % (input_count_table,))

    #gzipped tables are recognised by their file extension only
    if path.splitext(input_count_table)[1] == '.gz':
        genome_table_fh = gzip.open(input_count_table, 'rb')
    else:
        genome_table_fh = open(input_count_table, 'U')
    #try/finally rather than 'with': gzip handles are not context managers
    #on every Python version this script may run under
    try:
        genome_table_str = genome_table_fh.read()
    finally:
        genome_table_fh.close()

    #In the genome/trait table genomes are the samples and
    #genes are the observations
    if not opts.suppress_subset_loading:
        #use the OTU table to load only the count table entries
        #corresponding to relevant OTUs
        ids_to_load = otu_table.ObservationIds

        if opts.verbose:
            print("Loading traits for %i organisms from the trait table"
                  % len(ids_to_load))

        genome_table = load_subset_from_biom_str(genome_table_str,
                                                 ids_to_load, axis='samples')
    else:
        if opts.verbose:
            print("Loading *full* trait table because --suppress_subset_loading was passed. This may result in high memory usage.")
        genome_table = parse_biom_table(genome_table_str)

    if opts.verbose:
        print("Done loading trait table containing %i functions for %i organisms."
              % (len(genome_table.ObservationIds), len(genome_table.SampleIds)))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        #calc_nsti's result is indexed as (sample ids, NSTI values)
        weighted_nsti = calc_nsti(otu_table, genome_table, weighted=True)
        samples = weighted_nsti[0]
        nstis = list(weighted_nsti[1])

        lines = ["#Sample\tMetric\tValue\n"]
        lines.extend("%s\tWeighted NSTI\t%s\n" % (sample, str(nsti))
                     for sample, nsti in zip(samples, nstis))
        #sort once; the same ordering is used for the screen and the file
        report_lines = sorted(lines)

        if opts.verbose:
            for report_line in report_lines:
                print(report_line)
            print("Writing accuracy information to file: %s"
                  % (opts.accuracy_metrics,))

        with open(opts.accuracy_metrics, 'w') as accuracy_fh:
            accuracy_fh.writelines(report_lines)

    if opts.verbose:
        print("Predicting the metagenome...")

    predicted_metagenomes = predict_metagenomes(otu_table, genome_table)

    if opts.verbose:
        print("Writing results to output file:  %s"
              % (opts.output_metagenome_table,))

    #render before opening so a formatting failure leaves no truncated file
    if opts.format_tab_delimited:
        output_text = predicted_metagenomes.delimitedSelf(
            header_key="KEGG Pathways",
            header_value="KEGG Pathways",
            metadata_formatter=lambda s: '|'.join(['; '.join(l) for l in s]))
    else:
        output_text = format_biom_table(predicted_metagenomes)

    with open(opts.output_metagenome_table, 'w') as output_fh:
        output_fh.write(output_text)
def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',
                        tmp_dir='jobs/', num_jobs=100, verbose=False):
    '''Runs the ancestral state reconstructions in parallel, one job per trait

    The trait table is split into single-trait tables (column 0 plus one
    trait column each). One ancestral_state_reconstruction.py command per
    trait is written to a jobs file, which is handed to the
    start_parallel_picrust_jobs* script selected by parallel_method. After
    wait_for_output_files returns, the per-trait outputs are combined and
    all temporary files are removed.

    tree: value passed to the ASR script through its -t option
    table: filepath of the tab-delimited trait table (loaded with a header)
    asr_method: value passed to the ASR script through its -m option
    parallel_method: one of 'sge', 'multithreaded' or 'torque'
    tmp_dir: directory in which every temporary file is created
    num_jobs: forwarded unchanged to submit_jobs
    verbose: if True, progress messages are printed to stdout

    Returns a tuple (combined_table, combined_ci_table) of Table objects
    built from the combined -o and -c job outputs, respectively.

    Raises RuntimeError if parallel_method is not a supported value.
    '''
    #map each supported parallel method onto its job submission script
    cluster_scripts = {'sge': 'start_parallel_picrust_jobs_sge.py',
                       'multithreaded': 'start_parallel_picrust_jobs.py',
                       'torque': 'start_parallel_picrust_jobs_torque.py'}
    #validate first so that a bad method fails before any work is done
    if parallel_method not in cluster_scripts:
        raise RuntimeError(
            "Unknown parallel_method %r; expected one of: %s"
            % (parallel_method, ', '.join(sorted(cluster_scripts))))

    scripts_dir = join(get_picrust_project_dir(), 'scripts')
    asr_script_fp = join(scripts_dir, 'ancestral_state_reconstruction.py')
    cluster_jobs_fp = join(scripts_dir, cluster_scripts[parallel_method])

    if verbose:
        print("Loading trait table...")

    trait_table = LoadTable(filename=table, header=True, sep='\t')
    #second entry of Shape bounds the column indices handed to getColumns
    num_columns = trait_table.Shape[1]

    created_tmp_files = []
    output_files = []
    ci_files = []

    #tmp file holding one job command per line (passed to the parallel script)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_asr_')
    created_tmp_files.append(jobs_fp)

    if verbose:
        #the doubled space reproduces the separator that the former
        #'print a, b' statement emitted
        print("Creating temporary input files in:  %s" % (tmp_dir,))

    #the with-block closes the jobs file even if writing an input table fails
    with open(jobs_fp, 'w') as jobs:
        #column 0 is kept in every single-trait table; traits start at column 1
        for i in range(1, num_columns):
            single_col_table = trait_table.getColumns([0, i])

            #write the single-trait table to its own tmp file
            single_col_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='in_asr_')
            single_col_table.writeToFile(single_col_fp, sep='\t')
            created_tmp_files.append(single_col_fp)

            #reserve tmp names for the two outputs of this job
            tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='out_asr_')
            output_files.append(tmp_output_fp)
            tmp_ci_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='out_asr_ci_')
            ci_files.append(tmp_ci_fp)

            cmd = "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(
                asr_script_fp, single_col_fp, tree, asr_method,
                tmp_output_fp, tmp_ci_fp)
            jobs.write(cmd + "\n")

    created_tmp_files.extend(output_files)
    created_tmp_files.extend(ci_files)

    if verbose:
        print("Launching parallel jobs.")

    job_prefix = 'asr'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=num_jobs)

    if verbose:
        print("Jobs are now running. Will wait until finished.")

    #only the -o outputs are waited on; the -c outputs are read right after
    wait_for_output_files(output_files)

    if verbose:
        print("Jobs are done running. Now combining all tmp files.")

    combined_rows = combine_asr_tables(output_files)
    combined_ci_rows = combine_asr_tables(ci_files)

    #first row of each combined result is used as the header
    combined_table = Table(header=combined_rows[0], rows=combined_rows[1:])
    combined_ci_table = Table(header=combined_ci_rows[0],
                              rows=combined_ci_rows[1:])

    #clean up all tmp files
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)

    return combined_table, combined_ci_table
def main():
    """Normalize an OTU table by the predicted marker gene copy numbers.

    Options come from parse_command_line_parameters(**script_info). OTUs
    without an entry in the count table are dropped; the abundances of each
    remaining OTU are divided by its (rounded) copy number, which is also
    stored as observation metadata under opts.metadata_identifer. The
    result is written to opts.output_otu_fp.

    Raises ValueError if a copy number cannot be converted to a number or
    rounds to less than 1.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)
    ids_to_load = otu_table.ids(axis='observation')

    if opts.input_count_fp is None:
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(['16S', opts.gg_version,
                                      'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        #the doubled space reproduces the separator that the former
        #'print a, b' statement emitted
        print("Loading trait table:  %s" % (input_count_table,))

    #gzipped tables are recognised by their file extension only
    if path.splitext(input_count_table)[1] == '.gz':
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')
    #make sure the handle is released once the table has been parsed
    try:
        if opts.load_precalc_file_in_biom:
            count_table = load_table(count_table_fh)
        else:
            count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)
    finally:
        count_table_fh.close()

    #Need to only keep data relevant to our otu list
    #(the second item yielded by iter() is used as the observation id)
    ids = [str(entry[1]) for entry in otu_table.iter(axis='observation')]

    #copy numbers are read from the first observation of the count table
    ob_id = count_table.ids(axis='observation')[0]

    filtered_otus = []
    filtered_values = []
    for otu_id in ids:
        #OTUs are looked up on the sample axis of the count table
        if count_table.exists(otu_id, axis='sample'):
            filtered_otus.append(otu_id)
            filtered_values.append(otu_table.data(otu_id, axis='observation'))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for otu_id in filtered_otus:
        raw_value = count_table.get_value_by_ids(ob_id, otu_id)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(raw_value)))
        except ValueError:
            #report the OTU id (the message used to interpolate the value)
            raise ValueError(
                "Invalid type passed as copy number for OTU ID %s: %r. "
                "Must be int-able." % (otu_id, raw_value))
        if value < 1:
            raise ValueError("Copy numbers must be greater than or equal to 1.")

        copy_numbers_filtered[otu_id] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation')

    def metadata_norm(v, i, md):
        #divide each observation vector by its copy number metadata
        return v / float(md[opts.metadata_identifer])

    normalized_table = filtered_otu_table.transform(metadata_norm,
                                                    axis='observation')

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table,
                                                     normalized_table,
                                                     'observation')

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
def main():
    """Run PICRUSt's unit tests and script usage tests and summarize them.

    Unit tests are the test_*.py files found under this file's directory
    (or matching opts.unittest_glob); each one is run in a subprocess and
    classified from its stderr. Script usage tests are run through QIIME's
    run_script_usage_tests when QIIME can be imported.

    Returns 0 if no unit test failed, none reported a missing application
    and the script usage tests (if run) had no failures; 1 otherwise.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.suppress_unit_tests and opts.suppress_script_usage_tests:
        option_parser.error("You're suppressing both test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    #a test file passed when its stderr ends with unittest's "OK"
    unittest_good_pattern = re.compile(r'OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of PICRUSt's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, _dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print("Testing %s:\n" % unittest_name)
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = system_call(command)
            print(stderr)
            if not unittest_good_pattern.search(stderr):
                #separate "tool not installed" failures from real ones
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    if not opts.suppress_script_usage_tests:
        try:
            from qiime.test import run_script_usage_tests
        except ImportError:
            print("QIIME not installed so not running script tests.")
            #treated as suppressed from here on, so the summary and the
            #return code ignore the script usage tests
            opts.suppress_script_usage_tests = True
        else:
            test_data_dir = join(get_picrust_project_dir(), 'picrust_test_data')
            scripts_dir = join(get_picrust_project_dir(), 'scripts')
            if opts.script_usage_tests is not None:
                script_usage_tests = opts.script_usage_tests.split(',')
            else:
                script_usage_tests = None

            # Run the script usage testing functionality
            script_usage_result_summary, num_script_usage_example_failures = \
                run_script_usage_tests(
                    test_data_dir=test_data_dir,
                    scripts_dir=scripts_dir,
                    working_dir='/tmp/',
                    verbose=True,
                    tests=script_usage_tests,
                    force_overwrite=True,
                    timeout=300)

    print("==============\nResult summary\n==============")

    if not opts.suppress_unit_tests:
        print("\nUnit test result summary\n------------------------\n")
        if bad_tests:
            print("\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests))

        if missing_application_tests:
            print("\nFailed the following unit tests, in part or whole due "
                  "to missing external applications.\nDepending on the PICRUSt features "
                  "you plan to use, this may not be critical.\n%s"
                  % '\n'.join(missing_application_tests))

        if not (missing_application_tests or bad_tests):
            print("\nAll unit tests passed.\n\n")

    if not opts.suppress_script_usage_tests:
        print("\nScript usage test result summary\n------------------------------------\n")
        print(script_usage_result_summary)
        print("")

    # If script usage tests weren't suppressed, we can't have any failures.
    # (the failure count is only defined when those tests actually ran, so
    # the suppress flag must be checked first)
    script_usage_tests_success = (opts.suppress_script_usage_tests or
                                  num_script_usage_example_failures == 0)

    # If any of the unit tests or script usage tests fail, or if we have any
    # missing application errors, use return code 1 (as python's unittest
    # module does to indicate one or more failures).
    all_passed = (not bad_tests and
                  not missing_application_tests and
                  script_usage_tests_success)
    return 0 if all_passed else 1
def main():
    """Run predict_traits.py in parallel over batches of tree tips.

    Options come from parse_command_line_parameters(**script_info). The tips
    of opts.tree (minus those already present in opts.already_calculated)
    are split into batches of 1000; one predict_traits.py command per batch
    is written to a jobs file under 'jobs/' and submitted through the script
    selected by opts.parallel_method. When the jobs have finished, the
    per-batch outputs are combined per prediction type and written next to
    opts.output_trait_table, and the temporary files are removed.

    Raises RuntimeError if opts.parallel_method is not supported.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    scripts_dir = join(get_picrust_project_dir(), 'scripts')
    script_fp = join(scripts_dir, 'predict_traits.py')

    #map each supported parallel method onto its job submission script
    cluster_scripts = {'sge': 'start_parallel_jobs_sge.py',
                       'multithreaded': 'start_parallel_jobs.py',
                       'torque': 'start_parallel_jobs_torque.py'}
    if opts.parallel_method not in cluster_scripts:
        raise RuntimeError(
            "Unknown parallel_method %r; expected one of: %s"
            % (opts.parallel_method, ', '.join(sorted(cluster_scripts))))
    cluster_jobs_fp = join(scripts_dir, cluster_scripts[opts.parallel_method])

    if opts.verbose:
        print("Loading tree...")

    tree = load_picrust_tree(opts.tree, opts.verbose)
    all_tips = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print("Total number of possible tips to predict: {0}".format(len(all_tips)))

    created_tmp_files = []
    #per prediction type, the tmp files the jobs are expected to produce
    output_files = {'counts': []}
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print("After taking into account tips already predicted, the number of tips left to predict is: {0}".format(len(all_tips)))

    #tmp file holding one job command per line (passed to the parallel script)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    created_tmp_files.append(jobs_fp)

    if opts.verbose:
        #the doubled space reproduces the separator that the former
        #'print a, b' statement emitted
        print("Creating temporary input files in:  %s" % (tmp_dir,))

    num_tips_per_job = 1000
    #the with-block closes the jobs file even if building a command fails
    with open(jobs_fp, 'w') as jobs:
        for start in range(0, len(all_tips), num_tips_per_job):
            tips_to_predict = all_tips[start:start + num_tips_per_job]

            tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                             prefix='out_predict_traits_')
            output_files['counts'].append(tmp_output_fp)

            tip_to_predict_str = ','.join(tips_to_predict)

            if opts.reconstruction_confidence:
                #extra outputs are expected next to the counts file
                tmp_base, _ext = splitext(tmp_output_fp)
                output_files['variances'].append(tmp_base + "_variances.tab")
                output_files['upper_CI'].append(tmp_base + "_upper_CI.tab")
                output_files['lower_CI'].append(tmp_base + "_lower_CI.tab")

                cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                    script_fp, opts.observed_trait_table, opts.tree,
                    opts.reconstructed_trait_table,
                    opts.reconstruction_confidence, tip_to_predict_str,
                    tmp_output_fp)
            else:
                cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                    script_fp, opts.observed_trait_table, opts.tree,
                    opts.reconstructed_trait_table, tip_to_predict_str,
                    tmp_output_fp)

            #NOTE: Calculating NSTI this way is convenient, but would
            #probably be faster if we ran the NSTI calculation separate
            #(using the --output_accuracy_metrics_only) and added it to the
            #output file later on.
            if opts.calculate_accuracy_metrics:
                cmd = cmd + " -a"

            jobs.write(cmd + "\n")

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])

    if opts.verbose:
        print("Launching parallel jobs.")

    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix,
                num_jobs=opts.num_jobs, delay=opts.delay)

    if opts.verbose:
        print("Jobs are now running. Will wait until finished.")

    #only the counts files are waited on
    wait_for_output_files(output_files['counts'])

    if opts.verbose:
        print("Jobs are done running.")

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base, _ext = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        if opts.verbose:
            print("Combining all output files for " + predict_type)

        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print("Writing combined file for " + predict_type)

        #counts go to the requested path; other types get a suffixed name
        if predict_type == 'counts':
            out_fp = opts.output_trait_table
        elif opts.output_precalc_file_in_biom:
            out_fp = outfile_base + "_" + predict_type + ".biom"
        else:
            out_fp = outfile_base + "_" + predict_type + ".tab"

        #render before opening so a conversion failure leaves no empty file
        if opts.output_precalc_file_in_biom:
            output_text = format_biom_table(
                convert_precalc_to_biom(combined_predictions))
        else:
            output_text = combined_predictions

        with open(out_fp, 'w') as out_fh:
            out_fh.write(output_text)

    #clean up all tmp files
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)
def main():
    """Normalize an OTU table by the predicted marker gene copy numbers.

    Options come from parse_command_line_parameters(**script_info). The OTU
    table is read in classic or BIOM format, OTUs without an entry in the
    count table are dropped, and the remaining OTUs are normalized by their
    (rounded) copy number, which is stored as observation metadata under
    opts.metadata_identifer. The result is written to opts.output_otu_fp.

    Raises ValueError if the OTU table cannot be parsed as BIOM, or if a
    copy number cannot be converted to a number or rounds to less than 1.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    with open(opts.input_otu_fp, 'U') as otu_table_fh:
        if opts.input_format_classic:
            otu_table = parse_classic_table_to_rich_table(
                otu_table_fh, None, None, None, DenseOTUTable)
        else:
            try:
                otu_table = parse_biom_table(otu_table_fh)
            except ValueError:
                raise ValueError(
                    "Error loading OTU table! If not in BIOM format use '-f' option.\n"
                )

    ids_to_load = otu_table.ObservationIds

    if opts.input_count_fp is None:
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            ['16S', opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        #the doubled space reproduces the separator that the former
        #'print a, b' statement emitted
        print("Loading trait table:  %s" % (input_count_table,))

    #gzipped tables are recognised by their file extension only
    if path.splitext(input_count_table)[1] == '.gz':
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')
    #make sure the handle is released once the table has been parsed
    try:
        if opts.load_precalc_file_in_biom:
            count_table = parse_biom_table(count_table_fh.read())
        else:
            count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)
    finally:
        count_table_fh.close()

    #Need to only keep data relevant to our otu list
    #(the second item yielded by iterObservations is used as the id)
    ids = [str(entry[1]) for entry in otu_table.iterObservations()]

    #copy numbers are read from the first observation of the count table
    ob_id = count_table.ObservationIds[0]

    filtered_otus = []
    filtered_values = []
    for otu_id in ids:
        #OTUs are looked up among the samples of the count table
        if count_table.sampleExists(otu_id):
            filtered_otus.append(otu_id)
            filtered_values.append(otu_table.observationData(otu_id))

    filtered_otu_table = table_factory(filtered_values, otu_table.SampleIds,
                                       filtered_otus,
                                       constructor=DenseOTUTable)

    copy_numbers_filtered = {}
    for otu_id in filtered_otus:
        raw_value = count_table.getValueByIds(ob_id, otu_id)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(raw_value)))
        except ValueError:
            #report the OTU id (the message used to interpolate the value)
            raise ValueError(
                "Invalid type passed as copy number for OTU ID %s: %r. "
                "Must be int-able." % (otu_id, raw_value))
        if value < 1:
            raise ValueError("Copy numbers must be greater than or equal to 1.")

        copy_numbers_filtered[otu_id] = {opts.metadata_identifer: value}

    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)

    normalized_table = filtered_otu_table.normObservationByMetadata(
        opts.metadata_identifer)

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table,
                                                     normalized_table,
                                                     'ObservationMetadata')
    #NOTE(review): the return value was never used when writing the output,
    #so this presumably updates normalized_table in place -- confirm
    transfer_sample_metadata(otu_table, normalized_table, 'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    #render before opening so a formatting failure leaves no truncated file
    output_text = format_biom_table(normalized_table)
    with open(opts.output_otu_fp, 'w') as output_fh:
        output_fh.write(output_text)
def main():
    """Run PICRUSt's unit tests and script usage tests and summarize them.

    Unit tests are the test_*.py files found under this file's directory
    (or matching opts.unittest_glob); each one is run in a subprocess and
    classified from its stderr. Script usage tests are run through QIIME's
    run_script_usage_tests when QIIME can be imported.

    Returns 0 if no unit test failed, none reported a missing application
    and the script usage tests (if run) had no failures; 1 otherwise.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.suppress_unit_tests and opts.suppress_script_usage_tests:
        option_parser.error("You're suppressing both test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    #a test file passed when its stderr ends with unittest's "OK"
    unittest_good_pattern = re.compile(r'OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of PICRUSt's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, _dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print("Testing %s:\n" % unittest_name)
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = system_call(command)
            print(stderr)
            if not unittest_good_pattern.search(stderr):
                #separate "tool not installed" failures from real ones
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    if not opts.suppress_script_usage_tests:
        try:
            from qiime.test import run_script_usage_tests
        except ImportError:
            print("QIIME not installed so not running script tests.")
            #treated as suppressed from here on, so the summary and the
            #return code ignore the script usage tests
            opts.suppress_script_usage_tests = True
        else:
            test_data_dir = join(get_picrust_project_dir(), 'picrust_test_data')
            scripts_dir = join(get_picrust_project_dir(), 'scripts')
            if opts.script_usage_tests is not None:
                script_usage_tests = opts.script_usage_tests.split(',')
            else:
                script_usage_tests = None

            # Run the script usage testing functionality
            script_usage_result_summary, num_script_usage_example_failures = \
                run_script_usage_tests(
                    qiime_test_data_dir=test_data_dir,
                    qiime_scripts_dir=scripts_dir,
                    working_dir='/tmp/',
                    verbose=True,
                    tests=script_usage_tests,
                    failure_log_fp=None,
                    force_overwrite=True)

    print("==============\nResult summary\n==============")

    if not opts.suppress_unit_tests:
        print("\nUnit test result summary\n------------------------\n")
        if bad_tests:
            print("\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests))

        if missing_application_tests:
            print("\nFailed the following unit tests, in part or whole due "
                  "to missing external applications.\nDepending on the PICRUSt features "
                  "you plan to use, this may not be critical.\n%s"
                  % '\n'.join(missing_application_tests))

        if not (missing_application_tests or bad_tests):
            print("\nAll unit tests passed.\n\n")

    if not opts.suppress_script_usage_tests:
        print("\nScript usage test result summary\n------------------------------------\n")
        print(script_usage_result_summary)
        print("")

    # If script usage tests weren't suppressed, we can't have any failures.
    # (the failure count is only defined when those tests actually ran, so
    # the suppress flag must be checked first)
    script_usage_tests_success = (opts.suppress_script_usage_tests or
                                  num_script_usage_example_failures == 0)

    # If any of the unit tests or script usage tests fail, or if we have any
    # missing application errors, use return code 1 (as python's unittest
    # module does to indicate one or more failures).
    all_passed = (not bad_tests and
                  not missing_application_tests and
                  script_usage_tests_success)
    return 0 if all_passed else 1
def main():
    """Partition predicted metagenome contributions by OTU and write them.

    Options come from parse_command_line_parameters(**script_info). The OTU
    table and the (optionally gzipped) genome/trait count table are loaded,
    partition_metagenome_contributions is run (optionally limited to the
    comma-separated functions in opts.limit_to_function), and its rows are
    written tab-delimited, one per line, to opts.output_fp.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    #NOTE: doubled spaces in some messages below reproduce the separator
    #that the former 'print a, b' statements emitted
    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print("Limiting output to only functions: %s"
                  % (limit_to_functions,))
    else:
        limit_to_functions = []

    if opts.verbose:
        print("Loading otu table:  %s" % (opts.input_otu_table,))

    with open(opts.input_otu_table, 'U') as otu_table_fh:
        otu_table = parse_biom_table(otu_table_fh)
    ids_to_load = otu_table.ObservationIds

    if opts.input_count_table is None:
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join([opts.type_of_prediction,
                                      opts.gg_version,
                                      'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalc_file_name)
    else:
        input_count_table = opts.input_count_table

    if opts.verbose:
        print("Loading trait table:  %s" % (input_count_table,))
        print("Loading count table:  %s" % (input_count_table,))

    #gzipped tables are recognised by their file extension only
    if path.splitext(input_count_table)[1] == '.gz':
        genome_table_fh = gzip.open(input_count_table, 'rb')
    else:
        genome_table_fh = open(input_count_table, 'U')

    #In the genome/trait table genomes are the samples and
    #genes are the observations
    #(try/finally releases the handle whichever loader is used)
    try:
        if opts.load_precalc_file_in_biom:
            if not opts.suppress_subset_loading:
                #use the OTU table to load only the count table entries
                #corresponding to relevant OTUs
                if opts.verbose:
                    print("Loading traits for %i organisms from the trait table"
                          % len(ids_to_load))
                genome_table = load_subset_from_biom_str(
                    genome_table_fh.read(), ids_to_load, axis='samples')
            else:
                if opts.verbose:
                    print("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
                genome_table = parse_biom_table(genome_table_fh.read())
        else:
            genome_table = convert_precalc_to_biom(genome_table_fh,
                                                   ids_to_load)
    finally:
        genome_table_fh.close()

    partitioned_metagenomes = partition_metagenome_contributions(
        otu_table, genome_table, limit_to_functions=limit_to_functions)
    #one tab-delimited line per row; no trailing newline is written
    output_text = "\n".join("\t".join(map(str, row))
                            for row in partitioned_metagenomes)

    if opts.verbose:
        print("Writing results to output file:  %s" % (opts.output_fp,))

    make_output_dir_for_file(opts.output_fp)
    with open(opts.output_fp, 'w') as output_fh:
        output_fh.write(output_text)
def main():
    """Submit ASR + trait prediction jobs for every test dataset in a folder.

    Options come from parse_command_line_parameters(**script_info). Test
    datasets are discovered from the 'exp_traits--*' files in
    opts.input_dir. For each dataset one line is written to a jobs file:
    the ancestral state reconstruction command (or an 'echo' placeholder if
    its output already exists) followed by the predict_traits.py command;
    for the 'nearest_neighbor' method only the prediction command is
    written. Datasets whose prediction file already exists are skipped
    unless opts.force is set. The jobs file is then submitted through the
    script selected by opts.parallel_method.

    Raises ValueError for more than 20 multithreaded jobs, or if confidence
    intervals are requested with an ASR method other than ace_ml/ace_reml.
    Raises RuntimeError if opts.parallel_method is not supported.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir = opts.input_dir
    output_dir = opts.output_dir or input_dir
    tmp_dir = opts.tmp_dir or output_dir
    parallel_method = opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method

    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError(
            'You probably dont want to run multithreaded evaluations with a large num_jobs. Please adjust options num_jobs and or parallel_method'
        )

    if opts.with_confidence and asr_method not in ['ace_ml', 'ace_reml']:
        raise ValueError(
            "PICRUST currently only supports confidence intervals with the ace_ml and ace_reml ASR methods"
        )

    if opts.verbose:
        print("Reconstruction method: %s" % (asr_method,))
        print("Prediction method: %s" % (predict_traits_method,))
        print("Parallel method: %s" % (parallel_method,))
        print("num_jobs: %s" % (opts.num_jobs,))
        print("\nOutput will be saved here:'%s'" % output_dir)

    #create the output directory unless it already exists
    make_output_dir(output_dir)

    scripts_dir = join(get_picrust_project_dir(), 'scripts')

    #map each supported parallel method onto its job submission script
    cluster_scripts = {'sge': 'start_parallel_jobs_sge.py',
                       'multithreaded': 'start_parallel_jobs.py',
                       'torque': 'start_parallel_jobs_torque.py'}
    if parallel_method not in cluster_scripts:
        raise RuntimeError(
            "Unknown parallel_method %r; expected one of: %s"
            % (parallel_method, ', '.join(sorted(cluster_scripts))))
    cluster_jobs_fp = join(scripts_dir, cluster_scripts[parallel_method])

    #get the test datasets to run in the input directory
    #(based on exp_traits files)
    exp_traits_prefix = join(input_dir, 'exp_traits--')
    test_datasets = {}
    for file_name in glob(exp_traits_prefix + '*'):
        test_id = file_name.replace(exp_traits_prefix, '', 1)
        #per test id: [trait table, tree, expected traits]
        test_datasets[test_id] = [
            join(input_dir, 'test_trait_table--' + test_id),
            join(input_dir, 'test_tree--' + test_id),
            join(input_dir, 'exp_traits--' + test_id)
        ]

    #tmp file holding one job command per line (passed to the parallel script)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')

    #get location of scripts we need to run
    asr_script_fp = join(scripts_dir, 'ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(scripts_dir, 'predict_traits.py')

    #the with-block closes the jobs file even if building a command fails
    with open(jobs_fp, 'w') as jobs:
        #run each test dataset through the pipeline
        for test_id in test_datasets:
            trait_table_fp, tree_fp = test_datasets[test_id][:2]

            asr_out_fp = join(output_dir,
                              'asr--' + asr_method + '--' + test_id)
            asr_params_out_fp = join(
                output_dir,
                '--'.join(['asr', asr_method, 'asr_params', test_id]))

            if opts.check_for_null_files and exists(asr_out_fp) \
                    and file_contains_nulls(asr_out_fp):
                #remove the file so the ASR step below is run again
                if opts.verbose:
                    print("Existing ASR file contains null characters. Will run ASR again after removing: " + asr_out_fp)
                remove(asr_out_fp)

            if exists(asr_out_fp) and not opts.force:
                if opts.verbose:
                    print("Output file: {0} already exists, so we will skip it.".format(asr_out_fp))
                #placeholder so the job line still has the 'asr;predict' form
                asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" % (
                    test_id, asr_out_fp)
            else:
                #create the asr command
                asr_cmd = """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(
                    asr_script_fp, trait_table_fp, tree_fp, asr_method,
                    asr_out_fp, asr_params_out_fp)

            predict_traits_out_fp = join(
                output_dir,
                '--'.join(['predict_traits', predict_traits_method,
                           opts.weighting_method, test_id]))

            if opts.with_accuracy:
                predict_traits_accuracy_out_fp = join(
                    output_dir,
                    '--'.join(['predict_traits', predict_traits_method,
                               opts.weighting_method, 'accuracy_metrics',
                               test_id]))

            if opts.check_for_null_files and exists(predict_traits_out_fp) \
                    and file_contains_nulls(predict_traits_out_fp):
                if opts.verbose:
                    print("Existing trait predictions file contains null characters. Will run it again after removing: " + predict_traits_out_fp)
                remove(predict_traits_out_fp)

            if exists(predict_traits_out_fp) and not opts.force:
                if opts.verbose:
                    print("Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(predict_traits_out_fp))
                continue

            #the third '--'-delimited field of the test id is the genome id
            #(split is called as split(pattern, string) -- presumably
            #re.split; confirm against the file's imports)
            genome_id = split('--', test_id)[2]

            if predict_traits_method == 'nearest_neighbor':
                #don't do asr step
                predict_traits_cmd = """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(
                    predict_traits_script_fp, trait_table_fp, opts.ref_tree,
                    genome_id, predict_traits_out_fp, predict_traits_method)
                jobs.write(predict_traits_cmd + "\n")
            else:
                #create the predict traits command
                predict_traits_cmd = """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(
                    predict_traits_script_fp, trait_table_fp, opts.ref_tree,
                    asr_out_fp, genome_id, predict_traits_out_fp,
                    predict_traits_method, opts.weighting_method)

                #Instruct predict_traits to use confidence intervals
                #output by ASR
                if opts.with_confidence:
                    predict_traits_cmd += ' -c "%s"' % (asr_params_out_fp)

                #Instruct predict traits to output the NSTI measure of
                #distance to nearby sequences.
                if opts.with_accuracy:
                    predict_traits_cmd += ' -a "%s"' % (
                        predict_traits_accuracy_out_fp)

                #ASR must finish before prediction, so both commands share
                #one job line
                jobs.write(asr_cmd + ';' + predict_traits_cmd + "\n")

    #submit the jobs
    job_prefix = 'eval_'

    if opts.verbose:
        print("Submitting jobs: %s %s %s %s"
              % (cluster_jobs_fp, jobs_fp, job_prefix, opts.num_jobs))
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs)
from os.path import join from picrust.util import get_picrust_project_dir import gzip script_info = {} script_info['brief_description'] = "This script produces the actual metagenome functional predictions for a given OTU table." script_info['script_description'] = "" script_info['script_usage'] = [("","Predict metagenomes from genomes.biom and otus.biom.","%prog -i normalized_otus.biom -o predicted_metagenomes.biom"), ("","Change output format to plain tab-delimited:","%prog -f -i normalized_otus.biom -o predicted_metagenomes.tab")] script_info['output_description']= "Output is a table of function counts (e.g. KEGG KOs) by sample ids." script_info['required_options'] = [ make_option('-i','--input_otu_table',type='existing_filepath',help='the input otu table in biom format'), make_option('-o','--output_metagenome_table',type="new_filepath",help='the output file for the predicted metagenome') ] script_info['optional_options'] = [\ make_option('-c','--input_count_table',default=join(get_picrust_project_dir(),'picrust','data','ko_precalculated.biom.gz'),type="existing_filepath",help='Precalculated function predictions on per otu basis in biom format (can be gzipped) [default: %default]'), make_option('-a','--accuracy_metrics',default=None,type="new_filepath",help='If provided, calculate accuracy metrics for the predicted metagenome. NOTE: requires that per-genome accuracy metrics were calculated using predict_traits.py during genome prediction (e.g. there are "NSTI" values in the genome .biom file metadata)'), make_option('--suppress_subset_loading',default=False,action="store_true",help='Normally, only counts for OTUs present in the sample are loaded. If this flag is passed, the full biom table is loaded. 
This makes no difference for the analysis, but may result in faster load times (at the cost of more memory usage)'), make_option('-f','--format_tab_delimited',action="store_true",default=False,help='output the predicted metagenome table in tab-delimited format [default: %default]')] script_info['version'] = __version__ def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading otu table: ",opts.input_otu_table otu_table = parse_biom_table(open(opts.input_otu_table,'U')) ext=path.splitext(opts.input_count_table)[1] if not opts.suppress_subset_loading: