Example #1
    def test_get_picrust_project_dir(self):
        """get_picrust_project_dir functions as expected"""

        # Do an explicit check on whether the file system containing
        # the current file is case insensitive.
        case_insensitive_filesystem = \
         exists(__file__.upper()) and exists(__file__.lower())

        actual = get_picrust_project_dir()
        # I base the expected value on the imported location of
        # picrust/util.py, to handle cases where the user either has
        # PICRUSt in their PYTHONPATH or has installed it with
        # setup.py.
        # If util.py moves this test will fail -- that
        # is what we want in this case, as the get_picrust_project_dir()
        # function would need to be modified.
        import picrust.util
        util_py_filepath = abspath(picrust.util.__file__)
        expected = dirname(dirname(util_py_filepath))

        if case_insensitive_filesystem:
            # make both lowercase if the file system is case insensitive
            actual = actual.lower()
            expected = expected.lower()
        self.assertEqual(actual, expected)
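For reference, a minimal sketch of what get_picrust_project_dir() could look like under the assumption this test encodes (the project directory sits two levels above picrust/util.py). This is an illustration only, not necessarily the shipped implementation:

from os.path import abspath, dirname

def get_picrust_project_dir():
    # Sketch only: resolve the project root as the directory two
    # levels above the imported picrust/util.py module.
    import picrust.util
    return dirname(dirname(abspath(picrust.util.__file__)))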
Example #3
class Count(CommandLineApplication):
    """ Application controller for Count an ASR tool."""

    #count_fp = environ['COUNT_JAR']
    count_fp = join(get_picrust_project_dir(),'picrust','support_files','jar','Count.jar')
    _command = 'java -Xmx1024M -cp ' + count_fp + ' ca.umontreal.iro.evolution.genecontent.AsymmetricWagner'
    _parameters = {
        '-gain': ValuedParameter(Prefix='-', Name='gain', Delimiter=' '),
        '-max_paralogs': ValuedParameter(Prefix='-', Name='max_paralogs', Delimiter=' ')}
    _input_handler = '_input_as_paths'
    _suppress_stdout = False
    _suppress_stderr = False

    # Need to override this method since the command is not itself an executable file.
    def _error_on_missing_application(self, params):
        pass
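A hedged usage sketch for this controller, following PyCogent's usual CommandLineApplication conventions; the parameter value and input paths below are hypothetical:

def run_count_asr(table_fp, tree_fp):
    # Hypothetical usage: switch on a parameter, invoke the controller
    # with a list of input paths (per _input_as_paths), and collect
    # the captured stdout.
    count = Count()
    count.Parameters['-gain'].on('1')
    result = count([table_fp, tree_fp])
    output = result['StdOut'].read()
    result.cleanUp()
    return output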
Example #4
class Ace(CommandLineApplication):
    """ Application controller for 'ace' fucntion within the 'ape' R package."""

    ace_script_fp = join(get_picrust_project_dir(), 'picrust', 'support_files',
                         'R', 'ace.R')
    _command = ace_script_fp
    _input_handler = '_input_as_string'
    _suppress_stdout = False
    _suppress_stderr = False

    # Overridden to call script with R rather than directly - this is useful
    # because permissions on the script are set to 644 when PICRUSt is installed
    # with setup.py. This is fine if we're executing it with R, but not if we're
    # trying to execute it directly.
    def _get_base_command(self):
        """ Returns the full command string

            input_arg: the argument to the command which represents the input
                to the program, this will be a string, either
                representing input or a filename to get input from
         """
        command_parts = []
        # Append a change directory to the beginning of the command to change
        # to self.WorkingDir before running the command
        # WorkingDir should be in quotes -- filenames might contain spaces
        cd_command = ''.join(['cd "', str(self.WorkingDir), '";'])
        if self._command is None:
            raise ApplicationError, '_command has not been set.'
        command = self._command
        parameters = self.Parameters

        command_parts.append(cd_command)
        command_parts.append("R")
        command_parts.append("-f")
        command_parts.append(command)
        command_parts.append("--args")
        command_parts.append(self._command_delimiter.join(filter(\
            None,(map(str,parameters.values())))))

        return self._command_delimiter.join(command_parts).strip()

    BaseCommand = property(_get_base_command)
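For illustration, the BaseCommand assembled above ends up shaped roughly like this (working directory and argument values are hypothetical):

cd "/tmp/working_dir/"; R -f /opt/picrust/picrust/support_files/R/ace.R --args <parameter values>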
Example #5
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ids_to_load = otu_table.ObservationIds
    
    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." \
          %(len(otu_table.SampleIds),len(otu_table.ObservationIds))
    
    #Hardcoded location of the precalculated datasets for PICRUSt,
    #relative to the project directory
    precalc_data_dir=join(get_picrust_project_dir(),'picrust','data')

    # Load a table of gene counts by OTUs.
    #This can be either user-specified or precalculated
    genome_table_fp = determine_data_table_fp(precalc_data_dir,\
      opts.type_of_prediction,opts.gg_version,\
      user_specified_table=opts.input_count_table,verbose=opts.verbose)

    if opts.verbose:
        print "Loading gene count data from file: %s" %genome_table_fp
    
    genome_table= load_data_table(genome_table_fp,\
      load_data_table_in_biom=opts.load_precalc_file_in_biom,\
      suppress_subset_loading=opts.suppress_subset_loading,\
      ids_to_load=ids_to_load,verbose=opts.verbose,transpose=True)
  
    if opts.verbose:
        print "Loaded %i genes across %i OTUs from gene count table" \
          %(len(genome_table.ObservationIds),len(genome_table.SampleIds))
    
    if opts.with_confidence:
        if opts.input_variance_table:
            variance_table_fp = opts.input_variance_table
        else:
            variance_table_fp = determine_data_table_fp(precalc_data_dir,\
              opts.type_of_prediction,opts.gg_version,\
              precalc_file_suffix='precalculated_variances.tab.gz',\
              user_specified_table=opts.input_count_table)

        if opts.verbose:
            print "Loading variance information from table: %s" \
            %variance_table_fp
        
        variance_table= load_data_table(variance_table_fp,\
          load_data_table_in_biom=opts.load_precalc_file_in_biom,\
          suppress_subset_loading=opts.suppress_subset_loading,\
          ids_to_load=ids_to_load,transpose=True)
        
        if opts.verbose:
            print "Loaded %i genes across %i OTUs from variance table" \
              %(len(variance_table.ObservationIds),len(variance_table.SampleIds))
        #Raise an error if the genome table and variance table differ
        #in the genomes they contain.
        #better to find out now than have something obscure happen later on
        if opts.verbose:
            print "Checking that genome table and variance table are consistent"
        try:
            assert set(variance_table.ObservationIds) == set(genome_table.ObservationIds) 
        except AssertionError,e:
            for var_id in variance_table.ObservationIds:
                if var_id not in genome_table.ObservationIds:
                    print "Variance table ObsId %s not in genome_table ObsIds" %var_id
            raise AssertionError("Variance table and genome table contain different gene ids")
        try:
            assert set(variance_table.SampleIds) == set(genome_table.SampleIds)
        except AssertionError,e:
            for var_id in variance_table.SampleIds:
                if var_id not in genome_table.SampleIds:
                    print "Variance table SampleId %s not in genome_table SampleIds" %var_id
            raise AssertionError("Variance table and genome table contain different OTU ids")
Example #6
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)


    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table

    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')

    #In the genome/trait table genomes are the samples and
    #genes are the observations


    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs

            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(genome_table_fh)
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    ok_functional_categories = None

    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        if opts.verbose:
            print "Limiting to functional categories: %s" %(str(ok_functional_categories))

        # Either KEGG_Pathways or COG_Category needs
        # to be assigned to metadata_key to limit to
        # functional categories (not needed for 
        # individual functions) 

        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")
              
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions,\
      limit_to_functional_categories=ok_functional_categories, metadata_key=metadata_type)

    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp

    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
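The precalculated-table filename convention used above ('<prediction type>_<gg version>_precalculated.tab.gz' under picrust/data/) could be captured in a small helper; a sketch, with a hypothetical helper name:

from os.path import join

def default_precalc_fp(trait_type, gg_version,
                       suffix='precalculated.tab.gz'):
    # Hypothetical helper: build the default precalculated-table path,
    # e.g. ko_13_5_precalculated.tab.gz inside picrust/data/.
    file_name = '_'.join([trait_type, gg_version, suffix])
    return join(get_picrust_project_dir(), 'picrust', 'data', file_name)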
Example #7
import sys

script_info = {}
script_info['brief_description'] = "Normalize an OTU table by marker gene copy number"
script_info['script_description'] = ""
script_info['script_usage'] = [
("","Normalize the counts in raw_otus.biom. Write the resulting table to normalized_otus.biom.","%prog -i raw_otus.biom -o normalized_otus.biom"),
("","Input tab-delimited OTU table:","%prog -f -i raw_otus.tab -o predicted_metagenomes.biom")
]
script_info['output_description']= "A normalized OTU table"
script_info['required_options'] = [
 make_option('-i','--input_otu_fp',type="existing_filepath",help='the input otu table filepath in biom format'),
 make_option('-o','--output_otu_fp',type="new_filepath",help='the output otu table filepath in biom format'),
]
script_info['optional_options'] = [
 make_option('-c','--input_count_fp',default=join(get_picrust_project_dir(),'picrust','data','16S_precalculated.biom.gz'),type="existing_filepath",help='the input marker gene counts on per otu basis in biom format (can be gzipped) [default: %default]'),
 make_option('--metadata_identifer',
             default='CopyNumber',
             help='identifier for copy number entry as observation metadata [default: %default]'),
 make_option('-f','--input_format_classic', action="store_true", default=False, help='input otu table (--input_otu_fp) is in classic Qiime format [default: %default]'),
]
script_info['version'] = __version__


def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext=path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table=parse_classic_table_to_rich_table(open(opts.input_otu_fp,'U'),None,None,None,DenseOTUTable)
Example #8
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ", opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation').tolist()

    # Determine whether the user wants predictions rounded to the nearest
    # whole number or not.
    if opts.no_round:
        round_flag = False
    else:
        round_flag = True

    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." \
          %(len(otu_table.ids()),len(otu_table.ids(axis='observation')))

    #Hardcoded location of the precalculated datasets for PICRUSt,
    #relative to the project directory
    precalc_data_dir = join(get_picrust_project_dir(), 'picrust', 'data')

    # Load a table of gene counts by OTUs.
    #This can be either user-specified or precalculated
    genome_table_fp = determine_data_table_fp(precalc_data_dir,\
      opts.type_of_prediction,opts.gg_version,\
      user_specified_table=opts.input_count_table,verbose=opts.verbose)

    if opts.verbose:
        print "Loading gene count data from file: %s" % genome_table_fp

    genome_table= load_data_table(genome_table_fp,\
      load_data_table_in_biom=opts.load_precalc_file_in_biom,\
      suppress_subset_loading=opts.suppress_subset_loading,\
      ids_to_load=ids_to_load,verbose=opts.verbose,transpose=True)

    if opts.verbose:
        print "Loaded %i genes across %i OTUs from gene count table" \
          %(len(genome_table.ids(axis='observation')),len(genome_table.ids()))

    if opts.with_confidence:
        if opts.input_variance_table:
            variance_table_fp = opts.input_variance_table
        else:
            variance_table_fp = determine_data_table_fp(precalc_data_dir,\
              opts.type_of_prediction,opts.gg_version,\
              precalc_file_suffix='precalculated_variances.tab.gz',\
              user_specified_table=opts.input_count_table)

        if opts.verbose:
            print "Loading variance information from table: %s" \
            %variance_table_fp

        variance_table= load_data_table(variance_table_fp,\
          load_data_table_in_biom=opts.load_precalc_file_in_biom,\
          suppress_subset_loading=opts.suppress_subset_loading,\
          ids_to_load=ids_to_load,transpose=True)

        if opts.verbose:
            print "Loaded %i genes across %i OTUs from variance table" \
              %(len(variance_table.ids(axis='observation')),len(variance_table.ids()))
        #Raise an error if the genome table and variance table differ
        #in the genomes they contain.
        #better to find out now than have something obscure happen later on
        if opts.verbose:
            print "Checking that genome table and variance table are consistent"
        try:
            assert set(variance_table.ids(axis='observation')) == set(
                genome_table.ids(axis='observation'))
        except AssertionError, e:
            for var_id in variance_table.ids(axis='observation'):
                if var_id not in genome_table.ids(axis='observation'):
                    print "Variance table ObsId %s not in genome_table ObsIds" % var_id
            raise AssertionError(
                "Variance table and genome table contain different gene ids")
        try:
            assert set(variance_table.ids()) == set(genome_table.ids())
        except AssertionError, e:
            for var_id in variance_table.ids():
                if var_id not in genome_table.ids():
                    print "Variance table SampleId %s not in genome_table SampleIds" % var_id
            raise AssertionError(
                "Variance table and genome table contain different OTU ids")
Example #9
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir=opts.input_dir
    output_dir=opts.output_dir or input_dir
    tmp_dir=opts.tmp_dir or output_dir
    parallel_method=opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method
    
    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError("You probably don't want to run multithreaded evaluations with a large num_jobs. Please adjust the num_jobs and/or parallel_method options.")
        
    if opts.with_confidence and asr_method not in ['ace_ml','ace_reml']:
        raise ValueError("PICRUSt currently only supports confidence intervals with the ace_ml and ace_reml ASR methods")

    if opts.verbose:
        print "Reconstruction method:",asr_method
        print "Prediction method:",predict_traits_method
        print "Parallel method:",parallel_method
        print "num_jobs:",opts.num_jobs
        print "\nOutput will be saved here: '%s'" %output_dir
    
    #create the output directory unless it already exists
    make_output_dir(output_dir)

    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError


    #get the test datasets to run in the input directory (based on exp_traits files)
    expect_test_files=glob(join(input_dir,'exp_traits--*')) 

    test_datasets={}
    for file_name in expect_test_files:
        test_id=file_name.replace(join(input_dir,'exp_traits--'),'',1)
        #create a dict with the test files as values in the ref list
        test_datasets[test_id]=[ join(input_dir,'test_trait_table--'+test_id),join(input_dir,'test_tree--'+test_id),join(input_dir,'exp_traits--'+test_id)]
    
    created_tmp_files=[]    
    output_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    #get location of scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    #run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp=join(output_dir,'asr--'+asr_method+'--'+test_id)
        asr_params_out_fp=join(output_dir,'--'.join(['asr',asr_method,'asr_params',test_id]))
        created_tmp_files.append(asr_out_fp)

        if opts.check_for_null_files and exists(asr_out_fp) and file_contains_nulls(asr_out_fp):
            #remove file
            if opts.verbose:
                print "Existing ASR file contains null characters. Will run ASR again after removing: "+asr_out_fp
            remove(asr_out_fp)
        

        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(asr_out_fp)
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" %(test_id,asr_out_fp)
        else:
            #create the asr command
            asr_cmd= """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(asr_script_fp, test_datasets[test_id][0], test_datasets[test_id][1], asr_method, asr_out_fp, asr_params_out_fp)

        predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
          opts.weighting_method,test_id]))
        
        if opts.with_accuracy:
            predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
              opts.weighting_method,'accuracy_metrics',test_id]))

        if opts.check_for_null_files and exists(predict_traits_out_fp) and file_contains_nulls(predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. Will run it again after removing: "+predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(predict_traits_out_fp)
            continue
        
        output_files.append(predict_traits_out_fp)

        genome_id=split('--',test_id)[2]
        
        if predict_traits_method == 'nearest_neighbor':
            #don't do asr step
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(predict_traits_script_fp, test_datasets[test_id][0], opts.ref_tree, genome_id, predict_traits_out_fp,predict_traits_method)
            jobs.write(predict_traits_cmd+"\n")
        else:

            #create the predict traits command
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\
            test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method)

            #Instruct predict_traits to use confidence intervals output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' %(asr_params_out_fp)
                predict_traits_cmd = predict_traits_cmd + confidence_param
        
            #Instruct predict_traits to output the NSTI measure of distance to
            #nearby sequences.

            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' %(predict_traits_accuracy_out_fp)
                predict_traits_cmd = predict_traits_cmd + accuracy_param

            #add job command to the jobs file
            jobs.write(asr_cmd+';'+predict_traits_cmd+"\n")

    jobs.close()

    #created_tmp_files.extend(output_files)

    #submit the jobs
    job_prefix='eval_'
    
    if opts.verbose:
        print "Submitting jobs:",cluster_jobs_fp,jobs_fp,job_prefix,opts.num_jobs
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs)
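file_contains_nulls, used above to detect corrupt intermediate files, is not shown in this excerpt; a minimal sketch consistent with how it is called, assuming it simply flags null bytes left by truncated writes:

def file_contains_nulls(fp):
    # Sketch only: null bytes in an output file usually indicate a
    # truncated or corrupted write (e.g. on a shared filesystem).
    return '\x00' in open(fp, 'rb').read()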
Example #10
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py')

    if (opts.parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif (opts.parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif (opts.parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    if (opts.verbose):
        print "Loading tree..."

    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]

    if (opts.verbose):
        print "Total number of possible tips to predict: {0}".format(
            len(all_tips))

    created_tmp_files = []
    output_files = {}
    output_files['counts'] = []
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(
                len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    if (opts.verbose):
        print "Creating temporary input files in: ", tmp_dir

    num_tips_per_job = 1000
    for tips_to_predict in [
            all_tips[i:i + num_tips_per_job]
            for i in range(0, len(all_tips), num_tips_per_job)
    ]:

        #create tmp output files
        tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str = ','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base, extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base + "_variances.tab")
            output_files['upper_CI'].append(outfile_base + "_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base + "_lower_CI.tab")

            #create the job command
            cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, opts.reconstruction_confidence,
                tip_to_predict_str, tmp_output_fp)

        else:
            cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, tip_to_predict_str,
                tmp_output_fp)

        #NOTE: Calculating NSTI this way is convenient, but it would
        #probably be faster to run the NSTI calculation separately
        #(using --output_accuracy_metrics_only) and add it to the
        #output file later on.
        if opts.calculate_accuracy_metrics:
            cmd = cmd + " -a"

        #add job command to the jobs file
        jobs.write(cmd + "\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if (opts.verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp,
                jobs_fp,
                job_prefix,
                num_jobs=opts.num_jobs,
                delay=opts.delay)

    if (opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if (opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base, extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for " + predict_type

        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print "Writing combined file for " + predict_type

        if predict_type == 'counts':
            #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table, 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table, 'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base + "_" + predict_type + ".biom", 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base + "_" + predict_type + ".tab",
                     'w').write(combined_predictions)

    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)
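wait_for_output_files is the "simple poller" the comments refer to; a minimal sketch of such a poller (the real helper may also check file sizes or apply a timeout):

from os.path import exists
from time import sleep

def wait_for_output_files(filepaths, poll_interval=60):
    # Sketch only: block until every expected output file exists.
    while not all(exists(fp) for fp in filepaths):
        sleep(poll_interval)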
Example #11
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis="observation")

    if opts.input_count_fp is None:
        # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = "_".join(["16S", opts.gg_version, "precalculated.tab.gz"])
        input_count_table = join(get_picrust_project_dir(), "picrust", "data", precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if ext == ".gz":
        count_table_fh = gzip.open(input_count_table, "rb")
    else:
        count_table_fh = open(input_count_table, "U")

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    # Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iter(axis="observation"):
        ids.append(str(x[1]))

    ob_id = count_table.ids(axis="observation")[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.exists(x, axis="sample"):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis="observation"))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError, "Invalid copy number '%s' passed for OTU ID %s. Must be int-able." % (value, x)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis="observation")

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])

    normalized_table = filtered_otu_table.transform(metadata_norm, axis="observation")

    # move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, "observation")

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
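The transform above divides each OTU's counts by its marker-gene copy number; a small self-contained illustration of that arithmetic with made-up data:

# Hypothetical data: per-sample counts and 16S copy numbers per OTU.
raw_counts = {'OTU_1': [10.0, 4.0], 'OTU_2': [9.0, 0.0]}
copy_number = {'OTU_1': 2, 'OTU_2': 3}

normalized = dict((otu, [c / copy_number[otu] for c in counts])
                  for otu, counts in raw_counts.items())
print normalized  # {'OTU_1': [5.0, 2.0], 'OTU_2': [3.0, 0.0]}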
Example #12
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext=path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table=parse_classic_table_to_rich_table(open(opts.input_otu_fp,'U'),None,None,None,DenseOTUTable)
    else:
        try:
            otu_table = parse_biom_table(open(opts.input_otu_fp,'U'))
        except ValueError:
            raise ValueError("Error loading OTU table! If not in BIOM format use '-f' option.\n")

    ids_to_load = otu_table.ObservationIds
    
    if(opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]
    
    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table,'rb')
    else:
        count_table_fh = open(input_count_table,'U')
       
    if opts.load_precalc_file_in_biom:
        count_table = parse_biom_table(count_table_fh.read())
    else:
        count_table = convert_precalc_to_biom(count_table_fh,ids_to_load)

    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    ob_id=count_table.ObservationIds[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table=table_factory(filtered_values,otu_table.SampleIds,filtered_otus, constructor=DenseOTUTable)

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))
            
        except ValueError:
            raise ValueError,\
                  "Invalid copy number '%s' passed for OTU ID %s. Must be int-able." % (value, x)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}
        
    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)
            

    normalized_table = filtered_otu_table.normObservationByMetadata(opts.metadata_identifer)
    
    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table,normalized_table,'ObservationMetadata')
    normalized_table = transfer_sample_metadata(otu_table,normalized_table,'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp,'w').write(format_biom_table(normalized_table))
Example #13
def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',tmp_dir='jobs/',num_jobs=100, verbose=False):
    '''Runs the ancestral state reconstructions in parallel'''

    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')

    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    if(verbose):
        print "Loading trait table..."

    #for each trait in the table, create a new tmp file with just that trait, create the job command, and add it to a tmp jobs file
    table=LoadTable(filename=table, header=True, sep='\t')

    #get dimensions of the table
    dim=table.Shape

    created_tmp_files=[]
    output_files=[]
    ci_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_asr_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(verbose):
        print "Creating temporary input files in: ",tmp_dir

    #iterate over each column
    for i in range(1,dim[1]):
        #create a new table with only a single trait
        single_col_table=table.getColumns([0,i])

        #write the new table to a tmp file
        single_col_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='in_asr_')
        single_col_table.writeToFile(single_col_fp,sep='\t')
        created_tmp_files.append(single_col_fp)

        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_')
        output_files.append(tmp_output_fp)
        tmp_ci_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_ci_')
        ci_files.append(tmp_ci_fp)

        #create the job command
        cmd= "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(asr_script_fp, single_col_fp, tree, asr_method, tmp_output_fp, tmp_ci_fp)

        #add job command to the jobs file
        jobs.write(cmd+"\n")

    jobs.close()
    created_tmp_files.extend(output_files)
    created_tmp_files.extend(ci_files)

    if(verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix='asr'
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=num_jobs)

    if(verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files)

    if(verbose):
        print "Jobs are done running. Now combining all tmp files."
    #Combine output files
    combined_table=combine_asr_tables(output_files)
    combined_ci_table=combine_asr_tables(ci_files)

    #create a Table object
    combined_table=Table(header=combined_table[0],rows=combined_table[1:])
    combined_ci_table=Table(header=combined_ci_table[0],rows=combined_ci_table[1:])

    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)

    #return the combined table
    return combined_table,combined_ci_table
Example #14
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))

    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." %(len(otu_table.SampleIds),len(otu_table.ObservationIds))
    if(opts.input_count_table is None):
        if(opts.type_of_prediction == 'KO'):
            input_count_table=join(get_picrust_project_dir(),'picrust','data','ko_precalculated.biom.gz')
        elif(opts.type_of_prediction == 'COG'):
            input_count_table=join(get_picrust_project_dir(),'picrust','data','cog_precalculated.biom.gz')
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    
    ext=path.splitext(input_count_table)[1]
    
    if (ext == '.gz'):
        genome_table_str = gzip.open(input_count_table,'rb').read()
    else:
        genome_table_str = open(input_count_table,'U').read()
    
    #In the genome/trait table genomes are the samples and 
    #genes are the observations
    
    if not opts.suppress_subset_loading:
        #Now we want to use the OTU table information
        #to load only rows in the count table corresponding
        #to relevant OTUs
        ids_to_load = otu_table.ObservationIds

        if opts.verbose:
            print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

        genome_table = load_subset_from_biom_str(genome_table_str,ids_to_load,axis='samples')
    else:
        if opts.verbose:
            print "Loading *full* trait table because --suppress_subset_loading was passed. This may result in high memory usage."
        genome_table = parse_biom_table(genome_table_str)
    
    if opts.verbose:
        print "Done loading trait table containing %i functions for %i organisms." %(len(genome_table.ObservationIds),len(genome_table.SampleIds))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False)
        #print "Unweighted NSTI:", unweighted_nsti
        
        weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True)
        samples= weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        #print "Samples:",samples
        #print "NSTIs:",nstis
        samples_and_nstis = zip(samples,nstis)
        #print "Samples and NSTIs:",samples_and_nstis
        lines = ["#Sample\tMetric\tValue\n"]
        #print weighted_nsti
        for sample,nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti))
            lines.append(line)

        if opts.verbose:
            for l in sorted(lines):
                print l
        if opts.verbose:
            print "Writing accuracy information to file:", opts.accuracy_metrics
        open(opts.accuracy_metrics,'w').writelines(sorted(lines))

    if opts.verbose:
        print "Predicting the metagenome..."
        
    predicted_metagenomes = predict_metagenomes(otu_table,genome_table)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    if(opts.format_tab_delimited):
        open(opts.output_metagenome_table,'w').write(predicted_metagenomes.delimitedSelf(header_key="KEGG Pathways",header_value="KEGG Pathways",metadata_formatter=lambda s: '|'.join(['; '.join(l) for l in s])))
    else:
        open(opts.output_metagenome_table,'w').write(format_biom_table(predicted_metagenomes))
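calc_nsti with weighted=True computes, per sample, an abundance-weighted mean branch-length distance from each OTU to its nearest sequenced relative; a minimal sketch of that calculation for a single sample (function name and inputs are illustrative):

def weighted_nsti(abundances, nsti_distances):
    # Sketch only: abundance-weighted mean of per-OTU distances to
    # the nearest sequenced taxon.
    total = float(sum(abundances))
    return sum(a * d for a, d in zip(abundances, nsti_distances)) / total

print weighted_nsti([10, 5], [0.03, 0.12])  # 0.06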
Example #16
def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',tmp_dir='jobs/',num_jobs=100, verbose=False):
    '''Runs the ancestral state reconstructions in parallel'''

    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')

    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_picrust_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_picrust_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_picrust_jobs_torque.py')
    else:
        raise RuntimeError

    if(verbose):
        print "Loading trait table..."

    #for each trait in the table, create a new tmp file with just that trait, create the job command, and add it to a tmp jobs file
    table=LoadTable(filename=table, header=True, sep='\t')

    #get dimensions of the table
    dim=table.Shape

    created_tmp_files=[]
    output_files=[]
    ci_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_asr_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(verbose):
        print "Creating temporary input files in: ",tmp_dir

    #iterate over each column
    for i in range(1,dim[1]):
        #create a new table with only a single trait
        single_col_table=table.getColumns([0,i])

        #write the new table to a tmp file
        single_col_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='in_asr_')
        single_col_table.writeToFile(single_col_fp,sep='\t')
        created_tmp_files.append(single_col_fp)

        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_')
        output_files.append(tmp_output_fp)
        tmp_ci_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_ci_')
        ci_files.append(tmp_ci_fp)

        #create the job command
        cmd= "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(asr_script_fp, single_col_fp, tree, asr_method, tmp_output_fp, tmp_ci_fp)

        #add job command to the jobs file
        jobs.write(cmd+"\n")

    jobs.close()
    created_tmp_files.extend(output_files)
    created_tmp_files.extend(ci_files)

    if(verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix='asr'
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=num_jobs)

    if(verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files)

    if(verbose):
        print "Jobs are done running. Now combining all tmp files."
    #Combine output files
    combined_table=combine_asr_tables(output_files)
    combined_ci_table=combine_asr_tables(ci_files)

    #create a Table object
    combined_table=Table(header=combined_table[0],rows=combined_table[1:])
    combined_ci_table=Table(header=combined_ci_table[0],rows=combined_ci_table[1:])

    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)

    #return the combined table
    return combined_table,combined_ci_table
Example #17
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table,'rb')
    else:
        count_table_fh = open(input_count_table,'U')

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iter(axis='observation'):
        ids.append(str(x[1]))

    ob_id=count_table.ids(axis='observation')[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.exists(x, axis='sample'):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis='observation'))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError,\
                  "Invalid copy number '%s' passed for OTU ID %s. Must be int-able." % (value, x)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation')

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])
    normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation')

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation')

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
Example #18
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if (opts.suppress_unit_tests and opts.suppress_script_usage_tests):
        option_parser.error("You're suppressing both test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile('OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of PICRUSt's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root,name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    if not opts.suppress_script_usage_tests:
        try:
            from qiime.test import run_script_usage_tests
        except ImportError:
            print "QIIME not installed so not running script tests."
            opts.suppress_script_usage_tests=True
        else:
            test_data_dir = join(get_picrust_project_dir(),'picrust_test_data')
            scripts_dir  = join(get_picrust_project_dir(),'scripts')
            if opts.script_usage_tests != None:
                script_usage_tests = opts.script_usage_tests.split(',')
            else:
                script_usage_tests = None

            # Run the script usage testing functionality
            script_usage_result_summary, num_script_usage_example_failures = \
                    run_script_usage_tests(
                    test_data_dir=test_data_dir,
                    scripts_dir=scripts_dir,
                    working_dir='/tmp/',
                    verbose=True,
                    tests=script_usage_tests,
                    force_overwrite=True,
                    timeout=300)

    print "==============\nResult summary\n=============="

    if not opts.suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests)

        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due "+\
            "to missing external applications.\nDepending on the PICRUSt features "+\
            "you plan to use, this may not be critical.\n%s"\
             % '\n'.join(missing_application_tests)

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not opts.suppress_script_usage_tests:
        print "\nScript usage test result summary\n------------------------------------\n"
        print script_usage_result_summary
        print ""

    # If script usage tests weren't suppressed, we can't have any failures.
    script_usage_tests_success = (opts.suppress_script_usage_tests or
                                  num_script_usage_example_failures == 0)

    # If any of the unit tests or script usage tests fail, or if we have any
    # missing application errors, use return code 1 (as python's unittest
    # module does to indicate one or more failures).
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
        script_usage_tests_success):
        return_code = 0
    return return_code
Example #19
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir='jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    if(opts.parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(opts.parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(opts.parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    if(opts.verbose):
        print "Loading tree..."
        
    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]
    
    if(opts.verbose):
        print "Total number of possible tips to predict: {0}".format(len(all_tips))

    created_tmp_files=[]
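    #Map each prediction type (counts, variances, CIs) to its list of
    #per-chunk tmp output files; these are combined after the jobs finish.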
    output_files={}
    output_files['counts']=[]
    if opts.reconstruction_confidence:
        output_files['variances']=[]
        output_files['upper_CI']=[]
        output_files['lower_CI']=[]

    if opts.already_calculated:
        all_tips=get_tips_not_in_precalc(all_tips,opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(opts.verbose):
        print "Creating temporary input files in: ",tmp_dir
    
    num_tips_per_job=1000
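    #Slice all_tips into consecutive chunks of num_tips_per_job tips; each
    #chunk becomes one predict_traits.py command in the jobs file.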
    for tips_to_predict in [all_tips[i:i+num_tips_per_job] for i in range(0, len(all_tips), num_tips_per_job)]:
        
        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str=','.join(list(tips_to_predict))

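        #When confidence intervals were requested, each job also writes
        #variance and upper/lower CI tables; their paths are derived from the
        #counts tmp filename so they can be collected and combined afterwards.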
        if opts.reconstruction_confidence:
            outfile_base,extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base+"_variances.tab")
            output_files['upper_CI'].append(outfile_base+"_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base+"_lower_CI.tab")
            
            #create the job command
            cmd= "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, opts.reconstruction_confidence, tip_to_predict_str, tmp_output_fp)

        else:
            cmd= "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, tip_to_predict_str, tmp_output_fp)
            

        #NOTE: Calculating NSTI this way is convenient, but it would probably
        #be faster to run the NSTI calculation separately (using
        #--output_accuracy_metrics_only) and add it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd=cmd+" -a"

        #add the job command to the jobs file
        jobs.write(cmd+"\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if(opts.verbose):
        print "Launching parallel jobs."
        
    #run the job command
    job_prefix='picrust'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs, delay=opts.delay)

    if(opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if(opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base,extension = splitext(opts.output_trait_table)
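    #Combine the per-chunk outputs into one file per prediction type: the
    #'counts' table goes to the user-specified output path, while variance
    #and CI tables (if any) get their own files derived from that path.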
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for "+ predict_type

        combined_predictions=combine_predict_trait_output(output_files[predict_type])
        
        if opts.verbose:
            print "Writing combined file for "+predict_type

        if predict_type == 'counts':
            #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table,'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table,'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base+"_"+predict_type+".biom",'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base+"_"+predict_type+".tab",'w').write(combined_predictions)    
        
    #clean up all tmp files
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)
Example #20
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext = path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table = parse_classic_table_to_rich_table(
            open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable)
    else:
        try:
            otu_table = parse_biom_table(open(opts.input_otu_fp, 'U'))
        except ValueError:
            raise ValueError(
                "Error loading OTU table! If not in BIOM format use '-f' option.\n"
            )

    ids_to_load = otu_table.ObservationIds
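    #The OTU IDs observed in the input table drive subset loading of the
    #precalculated copy-number table below.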

    if (opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            ['16S', opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')

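    #The precalc file is either a BIOM string or PICRUSt's tab-delimited
    #precalculated format; in the latter case, convert_precalc_to_biom is
    #passed ids_to_load so that, presumably, only the OTUs present in the
    #input table are parsed.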
    if opts.load_precalc_file_in_biom:
        count_table = parse_biom_table(count_table_fh.read())
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

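    #The copy-number table appears to hold a single observation (the 16S
    #count) with OTU IDs as its "samples", hence the [0] index below.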
    ob_id = count_table.ObservationIds[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table = table_factory(filtered_values,
                                       otu_table.SampleIds,
                                       filtered_otus,
                                       constructor=DenseOTUTable)

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id, x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError(
                "Invalid copy number (%s) passed for OTU ID %s. Must be int-able."
                % (value, x))
        if value < 1:
            raise ValueError("Copy numbers must be greater than or equal to 1.")

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)

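    #normObservationByMetadata divides each OTU's counts by the copy number
    #stored under opts.metadata_identifer, performing the actual 16S
    #copy-number normalization.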
    normalized_table = filtered_otu_table.normObservationByMetadata(
        opts.metadata_identifer)

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table,
                                                     normalized_table,
                                                     'ObservationMetadata')
    normalized_table = transfer_sample_metadata(otu_table,
                                                normalized_table,
                                                'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp, 'w').write(format_biom_table(normalized_table))
Example #21
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if (opts.suppress_unit_tests and opts.suppress_script_usage_tests):
       option_parser.error("You're suppressing both test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile('OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of PICRUSt's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root,name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    if not opts.suppress_script_usage_tests:  
        try:
            from qiime.test import run_script_usage_tests
        except ImportError:
            print "QIIME not installed so not running script tests."
            opts.suppress_script_usage_tests=True
        else:
            test_data_dir = join(get_picrust_project_dir(),'picrust_test_data')
            scripts_dir  = join(get_picrust_project_dir(),'scripts')
            if opts.script_usage_tests is not None:
                script_usage_tests = opts.script_usage_tests.split(',')
            else:
                script_usage_tests = None

            # Run the script usage testing functionality
            script_usage_result_summary, num_script_usage_example_failures = \
                run_script_usage_tests(
                    qiime_test_data_dir=test_data_dir,
                    qiime_scripts_dir=scripts_dir,
                    working_dir='/tmp/',
                    verbose=True,
                    tests=script_usage_tests,
                    failure_log_fp=None,
                    force_overwrite=True)

    print "==============\nResult summary\n=============="

    if not opts.suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests)
    
        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due "+\
            "to missing external applications.\nDepending on the PICRUSt features "+\
            "you plan to use, this may not be critical.\n%s"\
             % '\n'.join(missing_application_tests)
        
        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not opts.suppress_script_usage_tests:
        print "\nScript usage test result summary\n------------------------------------\n"
        print script_usage_result_summary
        print ""

    # If script usage tests weren't suppressed, there must be no failures.
    script_usage_tests_success = (opts.suppress_script_usage_tests or
                                  num_script_usage_example_failures == 0)

    # If any of the unit tests or script usage tests fail, or if we have any
    # missing application errors, use return code 1 (as python's unittest
    # module does to indicate one or more failures).
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
        script_usage_tests_success):
        return_code = 0
    return return_code
Example #22
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ids_to_load = otu_table.ObservationIds

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading count table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')
    
    #In the genome/trait table, genomes are the samples and
    #genes are the observations.

    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs
           
            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = parse_biom_table(genome_table_fh.read())
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    
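    #partition_metagenome_contributions reports, for each function, how much
    #each OTU contributes to its predicted abundance in every sample.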
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions)
    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp
        
    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
Example #23
0
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir = opts.input_dir
    output_dir = opts.output_dir or input_dir
    tmp_dir = opts.tmp_dir or output_dir
    parallel_method = opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method

    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError(
            "You probably don't want to run multithreaded evaluations with a "
            "large num_jobs. Please adjust the num_jobs and/or parallel_method options."
        )

    if opts.with_confidence and asr_method not in ['ace_ml', 'ace_reml']:
        raise ValueError(
            "PICRUSt currently only supports confidence intervals with the "
            "ace_ml and ace_reml ASR methods"
        )

    if opts.verbose:
        print "Reconstruction method:", asr_method
        print "Prediction method:", predict_traits_method
        print "Parallel method:", parallel_method
        print "num_jobs:", opts.num_jobs
        print "\nOutput will be saved here:'%s'" % output_dir

    #create the output directory unless it already exists
    make_output_dir(output_dir)

    if (parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif (parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif (parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel method: %s" % parallel_method)

    #get the test datasets to run in the input directory (based on exp_traits files)
    expect_test_files = glob(join(input_dir, 'exp_traits--*'))

    test_datasets = {}
    for file_name in expect_test_files:
        test_id = file_name.replace(join(input_dir, 'exp_traits--'), '', 1)
        #create a dict with the test files as values in the ref list
        test_datasets[test_id] = [
            join(input_dir, 'test_trait_table--' + test_id),
            join(input_dir, 'test_tree--' + test_id),
            join(input_dir, 'exp_traits--' + test_id)
        ]

    created_tmp_files = []
    output_files = []

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    #get location of scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(), 'scripts',
                         'ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(), 'scripts',
                                    'predict_traits.py')

    #run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp = join(output_dir, 'asr--' + asr_method + '--' + test_id)
        asr_params_out_fp = join(
            output_dir, '--'.join(['asr', asr_method, 'asr_params', test_id]))
        created_tmp_files.append(asr_out_fp)

        if opts.check_for_null_files and exists(
                asr_out_fp) and file_contains_nulls(asr_out_fp):
            #remove file
            if opts.verbose:
                print "Existing ASR file contains null characters. Will run ASR again after removing: " + asr_out_fp
            remove(asr_out_fp)

        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(
                    asr_out_fp)
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" % (
                test_id, asr_out_fp)
        else:
            #create the asr command
            asr_cmd = """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(
                asr_script_fp, test_datasets[test_id][0],
                test_datasets[test_id][1], asr_method, asr_out_fp,
                asr_params_out_fp)

        predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
          opts.weighting_method,test_id]))

        if opts.with_accuracy:
            predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
              opts.weighting_method,'accuracy_metrics',test_id]))

        if opts.check_for_null_files and exists(
                predict_traits_out_fp) and file_contains_nulls(
                    predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. Will run it again after removing: " + predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(
                    predict_traits_out_fp)
            continue

        output_files.append(predict_traits_out_fp)

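        #split() here is presumably re.split, given its (pattern, string)
        #argument order; test_id appears to embed the held-out genome ID as
        #its third '--'-separated field.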
        genome_id = split('--', test_id)[2]

        if predict_traits_method == 'nearest_neighbor':
            #don't do asr step
            predict_traits_cmd = """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(
                predict_traits_script_fp, test_datasets[test_id][0],
                opts.ref_tree, genome_id, predict_traits_out_fp,
                predict_traits_method)
            jobs.write(predict_traits_cmd + "\n")
        else:

            #create the predict traits command
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\
            test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method)

            #Instruct predict_traits to use confidence intervals output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' % (asr_params_out_fp)
                predict_traits_cmd = predict_traits_cmd + confidence_param

            #Instruct predict traits to output the NSTI measure of distance to
            #nearby sequences.

            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' % (predict_traits_accuracy_out_fp)
                predict_traits_cmd = predict_traits_cmd + accuracy_param

            #add the job command to the jobs file
            jobs.write(asr_cmd + ';' + predict_traits_cmd + "\n")

    jobs.close()

    #created_tmp_files.extend(output_files)

    #submit the jobs
    job_prefix = 'eval_'

    if opts.verbose:
        print "Submitting jobs:", cluster_jobs_fp, jobs_fp, job_prefix, opts.num_jobs
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs)
Example #24
0
from os.path import join
from picrust.util import get_picrust_project_dir
import gzip

script_info = {}
script_info['brief_description'] = "This script produces the actual metagenome functional predictions for a given OTU table."
script_info['script_description'] = ""
script_info['script_usage'] = [("","Predict metagenomes from genomes.biom and otus.biom.","%prog -i normalized_otus.biom -o predicted_metagenomes.biom"),
                               ("","Change output format to plain tab-delimited:","%prog -f -i normalized_otus.biom -o predicted_metagenomes.tab")]
script_info['output_description']= "Output is a table of function counts (e.g. KEGG KOs) by sample ids."
script_info['required_options'] = [
 make_option('-i','--input_otu_table',type='existing_filepath',help='the input otu table in biom format'),
 make_option('-o','--output_metagenome_table',type="new_filepath",help='the output file for the predicted metagenome')
]
script_info['optional_options'] = [\
    make_option('-c','--input_count_table',default=join(get_picrust_project_dir(),'picrust','data','ko_precalculated.biom.gz'),type="existing_filepath",help='Precalculated function predictions on per otu basis in biom format (can be gzipped) [default: %default]'),
    make_option('-a','--accuracy_metrics',default=None,type="new_filepath",help='If provided, calculate accuracy metrics for the predicted metagenome.  NOTE: requires that per-genome accuracy metrics were calculated using predict_traits.py during genome prediction (e.g. there are "NSTI" values in the genome .biom file metadata)'),
    make_option('--suppress_subset_loading',default=False,action="store_true",help='Normally, only counts for OTUs present in the sample are loaded.  If this flag is passed, the full biom table is loaded.  This makes no difference for the analysis, but may result in faster load times (at the cost of more memory usage)'),
    make_option('-f','--format_tab_delimited',action="store_true",default=False,help='output the predicted metagenome table in tab-delimited format [default: %default]')]
script_info['version'] = __version__

def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ext=path.splitext(opts.input_count_table)[1]
   
    if not opts.suppress_subset_loading: