Ejemplo n.º 1
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ext=path.splitext(opts.input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    if (ext == '.gz'):
        genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb'))
    else:
        genome_table = parse_biom_table(open(opts.input_count_table,'U'))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False)
        #print "Unweighted NSTI:", unweighted_nsti
        
        weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True)
        samples= weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        #print "Samples:",samples
        #print "NSTIs:",nstis
        samples_and_nstis = zip(samples,nstis)
        #print "Samples and NSTIs:",samples_and_nstis
        lines = ["#Sample\tMetric\tValue\n"]
        #print weighted_nsti
        for sample,nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti))
            lines.append(line)

        if opts.verbose:
            for l in sorted(lines):
                print l
        if opts.verbose:
            print "Writing accuracy information to file:", opts.accuracy_metrics
        open(opts.accuracy_metrics,'w').writelines(sorted(lines))

    if opts.verbose:
        print "Predicting the metagenome..."
        
    predicted_metagenomes = predict_metagenomes(otu_table,genome_table)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    if(opts.format_tab_delimited):
        open(opts.output_metagenome_table,'w').write(predicted_metagenomes.delimitedSelf())
    else:
        open(opts.output_metagenome_table,'w').write(format_biom_table(predicted_metagenomes))
Ejemplo n.º 2
0
 def test_predict_metagenomes_keeps_observation_metadata(self):
     """Observation metadata must survive the predict_metagenomes call.

     The prediction built from the metadata-bearing fixtures is compared,
     entry by entry and in both directions, with the expected table.
     """
     predicted = predict_metagenomes(self.otu_table1_with_metadata,
                                     self.genome_table1_with_metadata)
     expected = self.predicted_metagenome_table1_with_metadata

     #Each metadata entry is copied into a plain dict before comparison;
     #the original author notes that comparing the defaultdicts directly
     #fails spuriously because their lambda default factories differ.
     predicted_md = [dict(entry) for entry in sorted(predicted.ObservationMetadata)]
     expected_md = [dict(entry) for entry in sorted(expected.ObservationMetadata)]

     for position, entry in enumerate(predicted_md):
         self.assertEqualItems(entry, expected_md[position])
     for position, entry in enumerate(expected_md):
         self.assertEqualItems(entry, predicted_md[position])
Ejemplo n.º 3
0
    def test_predict_metagenomes_keeps_observation_metadata(self):
        """Check that predicted tables carry the expected observation metadata.

        Runs predict_metagenomes on the metadata-bearing fixtures and
        compares the per-observation metadata with the expected table,
        position by position and in both directions.
        """
        result_table = predict_metagenomes(self.otu_table1_with_metadata,
                                           self.genome_table1_with_metadata)
        expected_table = self.predicted_metagenome_table1_with_metadata

        #NOTE: entries are copied into plain dicts first. According to the
        #original author, the lambda default factory attached to each
        #defaultdict otherwise makes equal metadata compare as unequal.
        result_md = [dict(md) for md in
                     sorted(result_table.metadata(axis='observation'))]
        expected_md = [dict(md) for md in
                       sorted(expected_table.metadata(axis='observation'))]

        for idx, md in enumerate(result_md):
            self.assertEqualItems(md, expected_md[idx])
        for idx, md in enumerate(expected_md):
            self.assertEqualItems(md, result_md[idx])
    def test_predict_metagenomes_keeps_observation_metadata(self):
        """predict_metagenomes output has the expected observation metadata.

        Both the predicted and the expected table have their observation
        metadata sorted and converted to plain dicts, then the two lists
        are compared index by index in each direction.
        """
        def sorted_observation_dicts(table):
            #Plain dict copies are used because, per the original note,
            #the lambda default factory of each defaultdict otherwise
            #causes artifactual inequality.
            entries = list(table.metadata(axis='observation'))
            entries.sort()
            return [dict(entry) for entry in entries]

        prediction = predict_metagenomes(self.otu_table1_with_metadata,
                                         self.genome_table1_with_metadata)
        reference = self.predicted_metagenome_table1_with_metadata

        prediction_md = sorted_observation_dicts(prediction)
        reference_md = sorted_observation_dicts(reference)

        for n, item in enumerate(prediction_md):
            self.assertEqualItems(item, reference_md[n])
        for n, item in enumerate(reference_md):
            self.assertEqualItems(item, prediction_md[n])
Ejemplo n.º 5
0
    def test_predict_metagenomes_keeps_sample_metadata(self):
        """Sample metadata must survive the predict_metagenomes call.

        The prediction built from the metadata-bearing fixtures (with
        verbose output disabled) is compared with the expected table,
        entry by entry and in both directions.
        """
        #NOTE: a candidate for consolidation with the observation-metadata test.
        predicted = predict_metagenomes(self.otu_table1_with_metadata,
                                        self.genome_table1_with_metadata,
                                        verbose=False)
        expected = self.predicted_metagenome_table1_with_metadata

        #Entries are copied into plain dicts before comparison; the original
        #author notes that the lambda default factory of each defaultdict
        #otherwise causes artifactual inequality.
        predicted_md = [dict(entry) for entry in sorted(predicted.SampleMetadata)]
        expected_md = [dict(entry) for entry in sorted(expected.SampleMetadata)]

        for position, entry in enumerate(predicted_md):
            self.assertEqualItems(entry, expected_md[position])
        for position, entry in enumerate(expected_md):
            self.assertEqualItems(entry, predicted_md[position])
    def test_predict_metagenomes_keeps_sample_metadata(self):
        """predict_metagenomes output has the expected sample metadata.

        Metadata is fetched with ``metadata()`` (default axis) from both
        the predicted and the expected table, sorted, converted to plain
        dicts and compared index by index in each direction.
        """
        #NOTE: a candidate for consolidation with the observation-metadata test.
        def sorted_metadata_dicts(table):
            #Plain dict copies are used because, per the original note,
            #the lambda default factory of each defaultdict otherwise
            #causes artifactual inequality.
            entries = list(table.metadata())
            entries.sort()
            return [dict(entry) for entry in entries]

        prediction = predict_metagenomes(self.otu_table1_with_metadata,
                                         self.genome_table1_with_metadata,
                                         verbose=False)
        reference = self.predicted_metagenome_table1_with_metadata

        prediction_md = sorted_metadata_dicts(prediction)
        reference_md = sorted_metadata_dicts(reference)

        for n, item in enumerate(prediction_md):
            self.assertEqualItems(item, reference_md[n])
        for n, item in enumerate(reference_md):
            self.assertEqualItems(item, prediction_md[n])
Ejemplo n.º 7
0
 def test_predict_metagenomes(self):
     """The prediction for the basic fixtures matches the expected table.

     Tables are compared through their delimitedSelf() text output.
     """
     predicted_table = predict_metagenomes(self.otu_table1, self.genome_table1)
     predicted_text = predicted_table.delimitedSelf()
     expected_text = self.predicted_metagenome_table1.delimitedSelf()
     self.assertEqual(predicted_text, expected_text)
Ejemplo n.º 8
0
    def test_partition_metagenome_contributions(self):
        """Per-OTU gene contributions are partitioned as expected.

        Runs a series of sanity checks on the rows returned by
        partition_metagenome_contributions, compares the full output with a
        hand-checked table, and verifies the limit_to_functions filter.
        """
        #Reference data. The OTU table:
        ##OTU ID Sample1 Sample2 Sample3 Sample4
        #GG_OTU_1    1.0 2.0 3.0 5.0
        #GG_OTU_2    5.0 1.0 0.0 2.0
        #GG_OTU_3    0.0 0.0 1.0 4.0

        #The genome table:
        ##OTU ID GG_OTU_1    GG_OTU_3    GG_OTU_2
        #f1  1.0 2.0 3.0
        #f2  0.0 1.0 0.0
        #f3  0.0 0.0 1.0

        #The metagenome that predict_metagenomes should produce from them:
        ##OTU ID    Sample1 Sample2 Sample3 Sample4
        #f1  16.0    5.0 5.0 19.0
        #f2  0.0 0.0 1.0 4.0
        #f3  5.0 1.0 0.0 2.0

        #The prediction is run but its result is not inspected here
        predict_metagenomes(self.otu_table1,self.genome_table1)
        partitioned = partition_metagenome_contributions(self.otu_table1,self.genome_table1)

        #Render observed and expected tables as tab/newline-delimited text
        observed_text = "\n".join(
            "\t".join(str(field) for field in row) for row in partitioned)
        expected_rows = self.predicted_gene_partition_table.split('\n')
        expected_text = "\n".join(
            "\t".join(str(field) for field in row.split())
            for row in expected_rows)

        #Everything after the first row is data
        data_rows = partitioned[1:]

        #The last column (percent of all samples) may never exceed the
        #second-to-last column (percent within the current sample)
        for row in data_rows:
            self.assertTrue(row[-1] <= row[-2])

        def summed_count(gene_id, sample_id):
            #Total of column 5 over the rows for one gene in one sample
            return sum(row[5] for row in data_rows
                       if row[0] == gene_id and row[1] == sample_id)

        #Summed contributions must equal the metagenome table values
        self.assertFloatEqual(summed_count("f1", "Sample1"), 16.0)
        self.assertFloatEqual(summed_count("f2", "Sample1"), 0.0)
        self.assertFloatEqual(summed_count("f3", "Sample1"), 5.0)

        def assert_no_contribution(count, percent, percent_all):
            self.assertFloatEqual(count, 0.0)
            self.assertFloatEqual(percent, 0.0)
            self.assertFloatEqual(percent_all, 0.0)

        for row in data_rows:
            (gene, sample, otu, _gene_count_per_genome,
             _otu_abundance_in_sample, count, percent, percent_all) = row
            #Genomes lacking a gene contribute nothing: only GG_OTU_3
            #carries f2 ...
            if gene == "f2" and otu != "GG_OTU_3":
                assert_no_contribution(count, percent, percent_all)
            #... and only GG_OTU_2 carries f3
            if gene == "f3" and otu != "GG_OTU_2":
                assert_no_contribution(count, percent, percent_all)

            #OTUs absent from a sample contribute nothing either
            if sample == "Sample1" and otu == "GG_OTU_3":
                assert_no_contribution(count, percent, percent_all)

        #With the sanity checks done, compare against the hand-checked result
        self.assertEqual(observed_text, expected_text)

        #limit_to_functions must restrict the output to the requested gene
        f2_only = partition_metagenome_contributions(self.otu_table1,self.genome_table1,limit_to_functions=["f2"])
        for row in f2_only[1:]:
            (gene, sample, otu, _gene_count_per_genome,
             _otu_abundance_in_sample, count, percent, percent_all) = row
            self.assertEqual(gene, "f2")
Ejemplo n.º 9
0
        #If we are calculating variance, we get the prediction as part
        #of the process

        if opts.verbose:
            print "Predicting the metagenome, metagenome variance and confidence intervals for the metagenome..."

        predicted_metagenomes,predicted_metagenome_variances,\
        predicted_metagenomes_lower_CI_95,predicted_metagenomes_upper_CI_95=\
          predict_metagenome_variances(otu_table,genome_table,variance_table,whole_round=round_flag)
    else:
        #If we don't need confidence intervals, we can do a faster pure numpy prediction

        if opts.verbose:
            print "Predicting the metagenome..."
        predicted_metagenomes = predict_metagenomes(otu_table,
                                                    genome_table,
                                                    whole_round=round_flag)

    if opts.normalize_by_otu:
        #normalize (e.g. divide) the abundances by the sum of the OTUs per sample
        if opts.verbose:
            print "Normalizing functional abundances by sum of OTUs per sample"
        inverse_otu_sums = [1 / x for x in otu_table.sum(axis='sample')]
        scaling_factors = dict(zip(otu_table.ids(), inverse_otu_sums))
        predicted_metagenomes = scale_metagenomes(predicted_metagenomes,
                                                  scaling_factors)

    if opts.normalize_by_function:
        #normalize (e.g. divide) the abundances by the sum of the functions per sample
        #Sum of functional abundances per sample will equal 1 (e.g. relative abundance).
        if opts.verbose:
Ejemplo n.º 10
0
    #Predict the metagenome (with variance and confidence-interval tables
    #when requested), then apply the normalizations selected by the options.
    if opts.with_confidence:
        #predict_metagenome_variances returns the prediction itself together
        #with the variance and confidence-interval tables, so no separate
        #predict_metagenomes call is made on this path
        
        if opts.verbose:
            print "Predicting the metagenome, metagenome variance and confidence intervals for the metagenome..."
        
        predicted_metagenomes,predicted_metagenome_variances,\
        predicted_metagenomes_lower_CI_95,predicted_metagenomes_upper_CI_95=\
          predict_metagenome_variances(otu_table,genome_table,variance_table)
    else:
        #If we don't need confidence intervals, we can do a faster pure numpy prediction
        
        if opts.verbose:
            print "Predicting the metagenome..."
        predicted_metagenomes = predict_metagenomes(otu_table,genome_table)

    if opts.normalize_by_otu:
        #Normalize (i.e. divide) the abundances by the sum of the OTUs per sample
        if opts.verbose:
            print "Normalizing functional abundances by sum of OTUs per sample"
        #NOTE(review): a sample whose OTU counts sum to 0 makes 1/x raise or
        #yield inf, and integer sums would truncate to 0 under Python 2
        #division -- confirm the numeric type returned by otu_table.sum
        inverse_otu_sums = [1/x for x in otu_table.sum(axis='sample')] 
        #Maps each sample id to the factor its column is scaled by
        scaling_factors = dict(zip(otu_table.SampleIds,inverse_otu_sums))
        predicted_metagenomes = scale_metagenomes(predicted_metagenomes,scaling_factors)

    if opts.normalize_by_function:
        #Normalize (i.e. divide) the abundances by the sum of the functions per sample.
        #Sum of functional abundances per sample will equal 1 (i.e. relative abundance).
        if opts.verbose:
            print "Normalizing functional abundances by sum of functions per sample"
        predicted_metagenomes = predicted_metagenomes.normObservationBySample()
Ejemplo n.º 11
0
    #Predict the metagenome (with variance and confidence-interval tables
    #when requested) and write the prediction to the output path.
    if opts.with_confidence:
        #predict_metagenome_variances returns the prediction itself together
        #with the variance and confidence-interval tables, so no separate
        #predict_metagenomes call is made on this path

        if opts.verbose:
            print "Predicting the metagenome, metagenome variance and confidence intervals for the metagenome..."

        predicted_metagenomes,predicted_metagenome_variances,\
        predicted_metagenomes_lower_CI_95,predicted_metagenomes_upper_CI_95=\
          predict_metagenome_variances(otu_table,genome_table,variance_table)
    else:
        #If we don't need confidence intervals, we can do a faster pure numpy prediction

        if opts.verbose:
            print "Predicting the metagenome..."
        predicted_metagenomes = predict_metagenomes(otu_table, genome_table)

    #The prediction is written on both paths; the string argument is a
    #label passed to write_metagenome_to_file (presumably used in its
    #verbose messages -- confirm against that helper)
    write_metagenome_to_file(predicted_metagenomes,opts.output_metagenome_table,\
        opts.format_tab_delimited,"metagenome prediction",verbose=opts.verbose)

    if opts.with_confidence:
        output_path, output_filename = split(opts.output_metagenome_table)
        base_output_filename, ext = splitext(output_filename)
        variance_output_fp =\
          join(output_path,"%s_variances%s" %(base_output_filename,ext))
        upper_CI_95_output_fp =\
          join(output_path,"%s_upper_CI_95%s" %(base_output_filename,ext))
        lower_CI_95_output_fp =\
          join(output_path,"%s_lower_CI_95%s" %(base_output_filename,ext))

        write_metagenome_to_file(predicted_metagenome_variances,\
    def test_partition_metagenome_contributions(self):
        """partition_metagenome_contributions splits gene counts by OTU.

        Sanity-checks the returned rows (percent ordering, per-sample sums,
        zero contributions), then compares the whole output against a
        hand-checked table and exercises the limit_to_functions option.
        """
        #Input OTU table, for reference:
        ##OTU ID Sample1 Sample2 Sample3 Sample4
        #GG_OTU_1    1.0 2.0 3.0 5.0
        #GG_OTU_2    5.0 1.0 0.0 2.0
        #GG_OTU_3    0.0 0.0 1.0 4.0

        #Input genome table, for reference:
        ##OTU ID GG_OTU_1    GG_OTU_3    GG_OTU_2
        #f1  1.0 2.0 3.0
        #f2  0.0 1.0 0.0
        #f3  0.0 0.0 1.0

        #Metagenome expected from predict_metagenomes:
        ##OTU ID    Sample1 Sample2 Sample3 Sample4
        #f1  16.0    5.0 5.0 19.0
        #f2  0.0 0.0 1.0 4.0
        #f3  5.0 1.0 0.0 2.0

        otus = self.otu_table1
        genomes = self.genome_table1

        #The prediction is computed but not examined in this test
        predict_metagenomes(otus, genomes)
        contributions = partition_metagenome_contributions(otus, genomes)

        #Text renderings used for the final hand-checked comparison
        observed_lines = []
        for record in contributions:
            observed_lines.append("\t".join(map(str, record)))
        observed_text = "\n".join(observed_lines)

        expected_lines = []
        for raw_line in self.predicted_gene_partition_table.split('\n'):
            expected_lines.append("\t".join(map(str, raw_line.split())))
        expected_text = "\n".join(expected_lines)

        #Rows after the first one hold the data
        records = contributions[1:]

        #Percent of all samples (last column) must not exceed the percent
        #of the current sample (second-to-last column)
        for record in records:
            self.assertTrue(record[-1] <= record[-2])

        #Summed contributions must reproduce the metagenome table values
        sample1_expectations = [("f1", 16.0), ("f2", 0.0), ("f3", 5.0)]
        for gene_id, expected_total in sample1_expectations:
            matching_counts = [record[5] for record in records
                               if (record[0] == gene_id and
                                   record[1] == "Sample1")]
            self.assertFloatEqual(sum(matching_counts), expected_total)

        for record in records:
            gene, sample, otu_id = record[0], record[1], record[2]
            #Unpacking keeps the check that every record has eight fields
            _, _, _, _, _, count, percent, percent_all = record
            zero_fields = (count, percent, percent_all)

            #A genome without the gene contributes nothing; only GG_OTU_3
            #has f2
            if gene == "f2" and otu_id != "GG_OTU_3":
                for value in zero_fields:
                    self.assertFloatEqual(value, 0.0)
            #Likewise, only GG_OTU_2 has f3
            if gene == "f3" and otu_id != "GG_OTU_2":
                for value in zero_fields:
                    self.assertFloatEqual(value, 0.0)

            #An OTU absent from a sample contributes nothing
            if sample == "Sample1" and otu_id == "GG_OTU_3":
                for value in zero_fields:
                    self.assertFloatEqual(value, 0.0)

        #Finally compare the full output with the hand-checked result
        self.assertEqual(observed_text, expected_text)

        #limit_to_functions should return rows for the requested gene only
        limited = partition_metagenome_contributions(otus, genomes, limit_to_functions=["f2"])
        for record in limited[1:]:
            gene, _, _, _, _, _, _, _ = record
            self.assertEqual(gene, "f2")
Ejemplo n.º 13
0
 def test_predict_metagenomes(self):
     """The prediction for the basic fixtures matches the expected table.

     Tables are compared through their str() representations.
     """
     predicted_table = predict_metagenomes(self.otu_table1,self.genome_table1)
     predicted_text = str(predicted_table)
     expected_text = str(self.predicted_metagenome_table1)
     self.assertEqual(predicted_text, expected_text)
Ejemplo n.º 14
0
    #Predict the metagenome (with variance and confidence-interval tables
    #when requested), then apply the normalizations selected by the options.
    #round_flag is forwarded as whole_round to both prediction functions.
    if opts.with_confidence:
        #predict_metagenome_variances returns the prediction itself together
        #with the variance and confidence-interval tables, so no separate
        #predict_metagenomes call is made on this path

        if opts.verbose:
            print "Predicting the metagenome, metagenome variance and confidence intervals for the metagenome..."

        predicted_metagenomes,predicted_metagenome_variances,\
        predicted_metagenomes_lower_CI_95,predicted_metagenomes_upper_CI_95=\
          predict_metagenome_variances(otu_table,genome_table,variance_table,whole_round=round_flag)
    else:
        #If we don't need confidence intervals, we can do a faster pure numpy prediction

        if opts.verbose:
            print "Predicting the metagenome..."
        predicted_metagenomes = predict_metagenomes(otu_table,genome_table,whole_round=round_flag)

    if opts.normalize_by_otu:
        #Normalize (i.e. divide) the abundances by the sum of the OTUs per sample
        if opts.verbose:
            print "Normalizing functional abundances by sum of OTUs per sample"
        #NOTE(review): a sample whose OTU counts sum to 0 makes 1/x raise or
        #yield inf, and integer sums would truncate to 0 under Python 2
        #division -- confirm the numeric type returned by otu_table.sum
        inverse_otu_sums = [1/x for x in otu_table.sum(axis='sample')]
        #Pairs the ids from otu_table.ids() with the per-sample factors;
        #presumably ids() defaults to the sample axis -- TODO confirm
        scaling_factors = dict(zip(otu_table.ids(),inverse_otu_sums))
        predicted_metagenomes = scale_metagenomes(predicted_metagenomes,scaling_factors)

    if opts.normalize_by_function:
        #Normalize (i.e. divide) the abundances by the sum of the functions per sample.
        #Sum of functional abundances per sample will equal 1 (i.e. relative abundance).
        if opts.verbose:
            print "Normalizing functional abundances by sum of functions per sample"
        #inplace=False: the normalized table is the returned value that is rebound here
        predicted_metagenomes = predicted_metagenomes.norm(axis='sample', inplace=False)
Ejemplo n.º 15
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))

    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." %(len(otu_table.SampleIds),len(otu_table.ObservationIds))
    if(opts.input_count_table is None):
        if(opts.type_of_prediction == 'KO'):
            input_count_table=join(get_picrust_project_dir(),'picrust','data','ko_precalculated.biom.gz')
        elif(opts.type_of_prediction == 'COG'):
            input_count_table=join(get_picrust_project_dir(),'picrust','data','cog_precalculated.biom.gz')
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    
    ext=path.splitext(input_count_table)[1]
    
    if (ext == '.gz'):
        genome_table_str = gzip.open(input_count_table,'rb').read()
    else:
        genome_table_str = open(input_count_table,'U').read()
    
    #In the genome/trait table genomes are the samples and 
    #genes are the observations
    
    if not opts.suppress_subset_loading:
        #Now we want to use the OTU table information
        #to load only rows in the count table corresponding
        #to relevant OTUs
        ids_to_load = otu_table.ObservationIds

        if opts.verbose:
            print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

        genome_table = load_subset_from_biom_str(genome_table_str,ids_to_load,axis='samples')
    else:
        if opts.verbose:
            print "Loading *full* trait table because --suppress_subset_loading was passed. This may result in high memory usage."
        genome_table = parse_biom_table(genome_table_str)
    
    if opts.verbose:
        print "Done loading trait table containing %i functions for %i organisms." %(len(genome_table.ObservationIds),len(genome_table.SampleIds))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False)
        #print "Unweighted NSTI:", unweighted_nsti
        
        weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True)
        samples= weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        #print "Samples:",samples
        #print "NSTIs:",nstis
        samples_and_nstis = zip(samples,nstis)
        #print "Samples and NSTIs:",samples_and_nstis
        lines = ["#Sample\tMetric\tValue\n"]
        #print weighted_nsti
        for sample,nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti))
            lines.append(line)

        if opts.verbose:
            for l in sorted(lines):
                print l
        if opts.verbose:
            print "Writing accuracy information to file:", opts.accuracy_metrics
        open(opts.accuracy_metrics,'w').writelines(sorted(lines))

    if opts.verbose:
        print "Predicting the metagenome..."
        
    predicted_metagenomes = predict_metagenomes(otu_table,genome_table)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    if(opts.format_tab_delimited):
        open(opts.output_metagenome_table,'w').write(predicted_metagenomes.delimitedSelf(header_key="KEGG Pathways",header_value="KEGG Pathways",metadata_formatter=lambda s: '|'.join(['; '.join(l) for l in s])))
    else:
        open(opts.output_metagenome_table,'w').write(format_biom_table(predicted_metagenomes))