def main():
    """Predict a metagenome from an OTU table and a genome (count) table.

    Options come from ``parse_command_line_parameters(**script_info)``.
    The function loads the OTU table and the count table (transparently
    reading gzip when the count table path ends in ``.gz``), optionally
    writes a weighted-NSTI accuracy report to ``opts.accuracy_metrics``,
    and finally writes the predicted metagenome to
    ``opts.output_metagenome_table`` either tab-delimited or BIOM-formatted.
    """
    # Local import: closing() lets the gzip handle be used in a ``with``
    # block regardless of whether GzipFile itself is a context manager.
    from contextlib import closing

    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Messages use the single-argument print(...) form so the emitted text is
    # the same under Python 2's print statement and Python 3's print function.
    # The double space reproduces the separator the comma form inserted.
    if opts.verbose:
        print("Loading otu table:  %s" % opts.input_otu_table)
    with open(opts.input_otu_table, 'U') as otu_fh:
        otu_table = parse_biom_table(otu_fh)

    ext = path.splitext(opts.input_count_table)[1]
    if opts.verbose:
        print("Loading count table:  %s" % opts.input_count_table)
    # NOTE(review): assumes parse_biom_table consumes the handle eagerly, so
    # closing it right after parsing is safe -- confirm against biom-format.
    if ext == '.gz':
        with closing(gzip.open(opts.input_count_table, 'rb')) as count_fh:
            genome_table = parse_biom_table(count_fh)
    else:
        with open(opts.input_count_table, 'U') as count_fh:
            genome_table = parse_biom_table(count_fh)

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics: element 0 of the result holds the
        # samples, element 1 the matching NSTI values.
        weighted_nsti = calc_nsti(otu_table, genome_table, weighted=True)
        samples = weighted_nsti[0]
        nstis = list(weighted_nsti[1])

        lines = ["#Sample\tMetric\tValue\n"]
        for sample, nsti in zip(samples, nstis):
            lines.append("%s\tWeighted NSTI\t%s\n" % (sample, str(nsti)))
        # The report is written (and echoed) in sorted order, header included.
        report_lines = sorted(lines)

        if opts.verbose:
            for report_line in report_lines:
                print(report_line)
        if opts.verbose:
            print("Writing accuracy information to file: %s"
                  % opts.accuracy_metrics)
        with open(opts.accuracy_metrics, 'w') as metrics_fh:
            metrics_fh.writelines(report_lines)

    if opts.verbose:
        print("Predicting the metagenome...")
    predicted_metagenomes = predict_metagenomes(otu_table, genome_table)

    if opts.verbose:
        print("Writing results to output file:  %s"
              % opts.output_metagenome_table)
    # NOTE(review): repeats the earlier call; kept because the helper's
    # behavior is not visible from here.
    make_output_dir_for_file(opts.output_metagenome_table)

    # Render first, then open: a formatting failure no longer leaves a
    # truncated output file behind.
    if opts.format_tab_delimited:
        output_text = predicted_metagenomes.delimitedSelf()
    else:
        output_text = format_biom_table(predicted_metagenomes)
    with open(opts.output_metagenome_table, 'w') as out_fh:
        out_fh.write(output_text)
def test_predict_metagenomes_keeps_observation_metadata(self):
    """predict_metagenomes preserves Observation metadata in genome and otu table"""
    observed = predict_metagenomes(self.otu_table1_with_metadata,
                                   self.genome_table1_with_metadata)
    expected = self.predicted_metagenome_table1_with_metadata

    # Each metadata entry is copied into a plain dict before comparison:
    # otherwise the memory location of the lambda function associated with
    # the defaultdict causes (artifactual) inequality of results.
    observed_md = [dict(entry)
                   for entry in sorted(observed.ObservationMetadata)]
    expected_md = [dict(entry)
                   for entry in sorted(expected.ObservationMetadata)]

    def _assert_positionwise(left, right):
        # Compare every entry of ``left`` with the same-index entry of ``right``.
        for index, entry in enumerate(left):
            self.assertEqualItems(entry, right[index])

    # Checked in both directions.
    _assert_positionwise(observed_md, expected_md)
    _assert_positionwise(expected_md, observed_md)
def test_predict_metagenomes_keeps_observation_metadata(self):
    """predict_metagenomes preserves Observation metadata in genome and otu table"""
    observed = predict_metagenomes(self.otu_table1_with_metadata,
                                   self.genome_table1_with_metadata)
    expected = self.predicted_metagenome_table1_with_metadata

    # NOTE: metadata entries are copied into plain dicts below because
    # otherwise the memory location of the lambda function associated with
    # the defaultdict causes (artifactual) inequality of results.
    observed_md = [dict(entry)
                   for entry in sorted(observed.metadata(axis='observation'))]
    expected_md = [dict(entry)
                   for entry in sorted(expected.metadata(axis='observation'))]

    def _assert_positionwise(left, right):
        # Compare every entry of ``left`` with the same-index entry of ``right``.
        for index, entry in enumerate(left):
            self.assertEqualItems(entry, right[index])

    # Checked in both directions.
    _assert_positionwise(observed_md, expected_md)
    _assert_positionwise(expected_md, observed_md)
def test_predict_metagenomes_keeps_observation_metadata(self):
    """predict_metagenomes preserves Observation metadata in genome and otu table"""
    predicted = predict_metagenomes(self.otu_table1_with_metadata,
                                    self.genome_table1_with_metadata)
    reference = self.predicted_metagenome_table1_with_metadata

    # NOTE: metadata entries are converted to plain dicts because otherwise
    # the memory location of the lambda function associated with the
    # defaultdict causes (artifactual) inequality of results.
    predicted_md = [
        dict(record)
        for record in sorted(predicted.metadata(axis='observation'))
    ]
    reference_md = [
        dict(record)
        for record in sorted(reference.metadata(axis='observation'))
    ]

    # First pass walks the predicted entries, second pass the reference ones.
    for first, second in ((predicted_md, reference_md),
                          (reference_md, predicted_md)):
        for position, record in enumerate(first):
            self.assertEqualItems(record, second[position])
def test_predict_metagenomes_keeps_sample_metadata(self):
    """predict_metagenomes preserves Sample metadata in genome and otu table"""
    # NOTE: could be consolidated with the observation-metadata variant of
    # this test.
    observed = predict_metagenomes(self.otu_table1_with_metadata,
                                   self.genome_table1_with_metadata,
                                   verbose=False)
    expected = self.predicted_metagenome_table1_with_metadata

    # Entries are copied into plain dicts: otherwise the memory location of
    # the lambda function associated with the defaultdict causes
    # (artifactual) inequality of results.
    observed_md = [dict(entry) for entry in sorted(observed.SampleMetadata)]
    expected_md = [dict(entry) for entry in sorted(expected.SampleMetadata)]

    def _assert_positionwise(left, right):
        # Compare every entry of ``left`` with the same-index entry of ``right``.
        for index, entry in enumerate(left):
            self.assertEqualItems(entry, right[index])

    # Checked in both directions.
    _assert_positionwise(observed_md, expected_md)
    _assert_positionwise(expected_md, observed_md)
def test_predict_metagenomes_keeps_sample_metadata(self):
    """predict_metagenomes preserves Sample metadata in genome and otu table"""
    # NOTE: could be consolidated with the observation-metadata variant of
    # this test.
    predicted = predict_metagenomes(self.otu_table1_with_metadata,
                                    self.genome_table1_with_metadata,
                                    verbose=False)
    reference = self.predicted_metagenome_table1_with_metadata

    # Entries are converted to plain dicts: otherwise the memory location of
    # the lambda function associated with the defaultdict causes
    # (artifactual) inequality of results.
    predicted_md = [dict(record) for record in sorted(predicted.metadata())]
    reference_md = [dict(record) for record in sorted(reference.metadata())]

    # First pass walks the predicted entries, second pass the reference ones.
    for first, second in ((predicted_md, reference_md),
                          (reference_md, predicted_md)):
        for position, record in enumerate(first):
            self.assertEqualItems(record, second[position])
def test_predict_metagenomes(self):
    """ predict_metagenomes functions as expected with valid input """
    predicted = predict_metagenomes(self.otu_table1, self.genome_table1)
    # Tables are compared through their delimited text rendering.
    observed_text = predicted.delimitedSelf()
    expected_text = self.predicted_metagenome_table1.delimitedSelf()
    self.assertEqual(observed_text, expected_text)
def test_partition_metagenome_contributions(self):
    """partition_metagenome_contributions functions with valid input"""
    # For reference, the OTU table should look like this:
    #   #OTU ID    Sample1  Sample2  Sample3  Sample4
    #   GG_OTU_1   1.0      2.0      3.0      5.0
    #   GG_OTU_2   5.0      1.0      0.0      2.0
    #   GG_OTU_3   0.0      0.0      1.0      4.0
    # ...and the genome table will look like this:
    #   #OTU ID    GG_OTU_1  GG_OTU_3  GG_OTU_2
    #   f1         1.0       2.0       3.0
    #   f2         0.0       1.0       0.0
    #   f3         0.0       0.0       1.0
    # For which predict metagenomes should produce a table like this:
    #   #OTU ID    Sample1  Sample2  Sample3  Sample4
    #   f1         16.0     5.0      5.0      19.0
    #   f2         0.0      0.0      1.0      4.0
    #   f3         5.0      1.0      0.0      2.0

    # First, sanity checks.
    # The prediction is still run, but its result is not inspected here.
    predict_metagenomes(self.otu_table1, self.genome_table1)

    # We expect to see the contributions broken down by OTU.
    obs = partition_metagenome_contributions(self.otu_table1,
                                             self.genome_table1)
    obs_text = "\n".join(
        ["\t".join([str(field) for field in row]) for row in obs])
    exp_text = "\n".join(
        ["\t".join([str(token) for token in record.split()])
         for record in self.predicted_gene_partition_table.split('\n')])

    # Everything after the first row of ``obs`` is a data row.
    data_rows = obs[1:]

    # The percent of all samples is never larger than the percent of the
    # current sample (last column vs. second-to-last column).
    for row in data_rows:
        self.assertTrue(row[-1] <= row[-2])

    # The summed contributions equal the metagenome table value for Sample1.
    for gene_id, expected_total in (("f1", 16.0), ("f2", 0.0), ("f3", 5.0)):
        observed_total = sum(
            [row[5] for row in data_rows
             if (row[0] == gene_id and row[1] == "Sample1")])
        self.assertFloatEqual(observed_total, expected_total)

    for row in data_rows:
        (gene, sample, OTU, gene_count_per_genome,
         otu_abundance_in_sample, count, percent, percent_all) = row
        zero_contribution_cases = (
            # Genomes without genes don't contribute: only GG_OTU_3 has f2...
            gene == "f2" and OTU != "GG_OTU_3",
            # ...ditto for GG_OTU_2 and f3.
            gene == "f3" and OTU != "GG_OTU_2",
            # OTUs absent from a sample don't contribute.
            sample == "Sample1" and OTU == "GG_OTU_3",
        )
        for applies in zero_contribution_cases:
            if applies:
                for value in (count, percent, percent_all):
                    self.assertFloatEqual(value, 0.0)

    # Having validated that this looks OK, just compare to the
    # hand-checked result.
    self.assertEqual(obs_text, exp_text)

    # Check that "limit to functions" retrieves only the requested function.
    obs_limited = partition_metagenome_contributions(
        self.otu_table1, self.genome_table1, limit_to_functions=["f2"])
    for row in obs_limited[1:]:
        (gene, sample, OTU, gene_count_per_genome,
         otu_abundance_in_sample, count, percent, percent_all) = row
        self.assertEqual(gene, "f2")
#If we are calculating variance, we get the prediction as part #of the process if opts.verbose: print "Predicting the metagenome, metagenome variance and confidence intervals for the metagenome..." predicted_metagenomes,predicted_metagenome_variances,\ predicted_metagenomes_lower_CI_95,predicted_metagenomes_upper_CI_95=\ predict_metagenome_variances(otu_table,genome_table,variance_table,whole_round=round_flag) else: #If we don't need confidence intervals, we can do a faster pure numpy prediction if opts.verbose: print "Predicting the metagenome..." predicted_metagenomes = predict_metagenomes(otu_table, genome_table, whole_round=round_flag) if opts.normalize_by_otu: #normalize (e.g. divide) the abundances by the sum of the OTUs per sample if opts.verbose: print "Normalizing functional abundances by sum of OTUs per sample" inverse_otu_sums = [1 / x for x in otu_table.sum(axis='sample')] scaling_factors = dict(zip(otu_table.ids(), inverse_otu_sums)) predicted_metagenomes = scale_metagenomes(predicted_metagenomes, scaling_factors) if opts.normalize_by_function: #normalize (e.g. divide) the abundances by the sum of the functions per sample #Sum of functional abundances per sample will equal 1 (e.g. relative abundance). if opts.verbose:
if opts.with_confidence:
    # When variance is requested the prediction comes back as part of the
    # same call, together with the variances and the 95% confidence bounds.
    if opts.verbose:
        print("Predicting the metagenome, metagenome variance and confidence intervals for the metagenome...")
    (predicted_metagenomes, predicted_metagenome_variances,
     predicted_metagenomes_lower_CI_95,
     predicted_metagenomes_upper_CI_95) = predict_metagenome_variances(
        otu_table, genome_table, variance_table)
else:
    # If we don't need confidence intervals, we can do a faster pure numpy
    # prediction.
    if opts.verbose:
        print("Predicting the metagenome...")
    predicted_metagenomes = predict_metagenomes(otu_table, genome_table)

if opts.normalize_by_otu:
    # Normalize (i.e. divide) the abundances by the sum of the OTUs per sample.
    if opts.verbose:
        print("Normalizing functional abundances by sum of OTUs per sample")
    # 1.0 (not 1) forces true division: under Python 2, integer per-sample
    # sums would otherwise truncate every scaling factor to 0.
    # NOTE(review): a sample whose OTU sum is zero is not handled here --
    # confirm upstream filtering.
    inverse_otu_sums = [1.0 / x for x in otu_table.sum(axis='sample')]
    scaling_factors = dict(zip(otu_table.SampleIds, inverse_otu_sums))
    predicted_metagenomes = scale_metagenomes(predicted_metagenomes,
                                              scaling_factors)

if opts.normalize_by_function:
    # Normalize (i.e. divide) the abundances by the sum of the functions per
    # sample, so functional abundances per sample sum to 1 (relative
    # abundance).
    if opts.verbose:
        print("Normalizing functional abundances by sum of functions per sample")
    predicted_metagenomes = predicted_metagenomes.normObservationBySample()
if opts.with_confidence: #If we are calculating variance, we get the prediction as part #of the process if opts.verbose: print "Predicting the metagenome, metagenome variance and confidence intervals for the metagenome..." predicted_metagenomes,predicted_metagenome_variances,\ predicted_metagenomes_lower_CI_95,predicted_metagenomes_upper_CI_95=\ predict_metagenome_variances(otu_table,genome_table,variance_table) else: #If we don't need confidence intervals, we can do a faster pure numpy prediction if opts.verbose: print "Predicting the metagenome..." predicted_metagenomes = predict_metagenomes(otu_table, genome_table) write_metagenome_to_file(predicted_metagenomes,opts.output_metagenome_table,\ opts.format_tab_delimited,"metagenome prediction",verbose=opts.verbose) if opts.with_confidence: output_path, output_filename = split(opts.output_metagenome_table) base_output_filename, ext = splitext(output_filename) variance_output_fp =\ join(output_path,"%s_variances%s" %(base_output_filename,ext)) upper_CI_95_output_fp =\ join(output_path,"%s_upper_CI_95%s" %(base_output_filename,ext)) lower_CI_95_output_fp =\ join(output_path,"%s_lower_CI_95%s" %(base_output_filename,ext)) write_metagenome_to_file(predicted_metagenome_variances,\
def test_partition_metagenome_contributions(self):
    """partition_metagenome_contributions functions with valid input"""
    # For reference, the OTU table should look like this:
    #   #OTU ID    Sample1  Sample2  Sample3  Sample4
    #   GG_OTU_1   1.0      2.0      3.0      5.0
    #   GG_OTU_2   5.0      1.0      0.0      2.0
    #   GG_OTU_3   0.0      0.0      1.0      4.0
    # ...and the genome table will look like this:
    #   #OTU ID    GG_OTU_1  GG_OTU_3  GG_OTU_2
    #   f1         1.0       2.0       3.0
    #   f2         0.0       1.0       0.0
    #   f3         0.0       0.0       1.0
    # For which predict metagenomes should produce a table like this:
    #   #OTU ID    Sample1  Sample2  Sample3  Sample4
    #   f1         16.0     5.0      5.0      19.0
    #   f2         0.0      0.0      1.0      4.0
    #   f3         5.0      1.0      0.0      2.0

    def _assert_no_contribution(count, percent, percent_all):
        # All three contribution figures of a row must be zero.
        self.assertFloatEqual(count, 0.0)
        self.assertFloatEqual(percent, 0.0)
        self.assertFloatEqual(percent_all, 0.0)

    # First, sanity checks.
    # The prediction is still run, but its result is not inspected here.
    predict_metagenomes(self.otu_table1, self.genome_table1)

    # We expect to see the contributions broken down by OTU.
    obs = partition_metagenome_contributions(self.otu_table1,
                                             self.genome_table1)
    rendered_rows = []
    for row in obs:
        rendered_rows.append("\t".join([str(field) for field in row]))
    obs_text = "\n".join(rendered_rows)

    expected_rows = []
    for record in self.predicted_gene_partition_table.split('\n'):
        expected_rows.append("\t".join([str(tok) for tok in record.split()]))
    exp_text = "\n".join(expected_rows)

    # Everything after the first row of ``obs`` is a data row.
    body = obs[1:]

    # The percent of all samples is never larger than the percent of the
    # current sample (last column vs. second-to-last column).
    for row in body:
        self.assertTrue(row[-1] <= row[-2])

    # The summed contributions equal the metagenome table value for Sample1.
    expected_sample1_totals = (("f1", 16.0), ("f2", 0.0), ("f3", 5.0))
    for gene_id, expected_total in expected_sample1_totals:
        contributions = [row[5] for row in body
                         if (row[0] == gene_id and row[1] == "Sample1")]
        self.assertFloatEqual(sum(contributions), expected_total)

    for row in body:
        (gene, sample, OTU, gene_count_per_genome,
         otu_abundance_in_sample, count, percent, percent_all) = row
        # Genomes without genes don't contribute: only GG_OTU_3 has f2,
        # so every other OTU must contribute nothing to it.
        if gene == "f2" and OTU != "GG_OTU_3":
            _assert_no_contribution(count, percent, percent_all)
        # Ditto for GG_OTU_2 and f3.
        if gene == "f3" and OTU != "GG_OTU_2":
            _assert_no_contribution(count, percent, percent_all)
        # OTUs absent from a sample don't contribute.
        if sample == "Sample1" and OTU == "GG_OTU_3":
            _assert_no_contribution(count, percent, percent_all)

    # Having validated that this looks OK, just compare to the
    # hand-checked result.
    self.assertEqual(obs_text, exp_text)

    # Check that "limit to functions" retrieves only the requested function.
    obs_limited = partition_metagenome_contributions(
        self.otu_table1, self.genome_table1, limit_to_functions=["f2"])
    for row in obs_limited[1:]:
        (gene, sample, OTU, gene_count_per_genome,
         otu_abundance_in_sample, count, percent, percent_all) = row
        self.assertEqual(gene, "f2")
def test_predict_metagenomes(self):
    """ predict_metagenomes functions as expected with valid input """
    predicted = predict_metagenomes(self.otu_table1, self.genome_table1)
    # Tables are compared through their str() rendering.
    observed_text = str(predicted)
    expected_text = str(self.predicted_metagenome_table1)
    self.assertEqual(observed_text, expected_text)
if opts.with_confidence:
    # When variance is requested the prediction comes back as part of the
    # same call, together with the variances and the 95% confidence bounds.
    if opts.verbose:
        print("Predicting the metagenome, metagenome variance and confidence intervals for the metagenome...")
    (predicted_metagenomes, predicted_metagenome_variances,
     predicted_metagenomes_lower_CI_95,
     predicted_metagenomes_upper_CI_95) = predict_metagenome_variances(
        otu_table, genome_table, variance_table, whole_round=round_flag)
else:
    # If we don't need confidence intervals, we can do a faster pure numpy
    # prediction.
    if opts.verbose:
        print("Predicting the metagenome...")
    predicted_metagenomes = predict_metagenomes(
        otu_table, genome_table, whole_round=round_flag)

if opts.normalize_by_otu:
    # Normalize (i.e. divide) the abundances by the sum of the OTUs per sample.
    if opts.verbose:
        print("Normalizing functional abundances by sum of OTUs per sample")
    # 1.0 (not 1) forces true division: under Python 2, integer per-sample
    # sums would otherwise truncate every scaling factor to 0.
    # NOTE(review): a sample whose OTU sum is zero is not handled here --
    # confirm upstream filtering.
    inverse_otu_sums = [1.0 / x for x in otu_table.sum(axis='sample')]
    scaling_factors = dict(zip(otu_table.ids(), inverse_otu_sums))
    predicted_metagenomes = scale_metagenomes(predicted_metagenomes,
                                              scaling_factors)

if opts.normalize_by_function:
    # Normalize (i.e. divide) the abundances by the sum of the functions per
    # sample, so functional abundances per sample sum to 1 (relative
    # abundance).
    if opts.verbose:
        print("Normalizing functional abundances by sum of functions per sample")
    predicted_metagenomes = predicted_metagenomes.norm(axis='sample',
                                                       inplace=False)
def main():
    """Predict a metagenome from an OTU table and a trait (count) table.

    Options come from ``parse_command_line_parameters(**script_info)``.
    When no count table is given, a precalculated KO or COG table under the
    PICRUSt project directory is selected from ``opts.type_of_prediction``.
    Unless ``opts.suppress_subset_loading`` is set, only the trait-table
    entries for the OTU table's observation ids are loaded. Optionally writes
    a weighted-NSTI accuracy report, then writes the predicted metagenome to
    ``opts.output_metagenome_table`` as tab-delimited text or BIOM.

    Raises:
        ValueError: if no count table is supplied and
            ``opts.type_of_prediction`` is neither 'KO' nor 'COG'.
    """
    # Local import: closing() lets the gzip handle be used in a ``with``
    # block regardless of whether GzipFile itself is a context manager.
    from contextlib import closing

    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Messages use the single-argument print(...) form so the emitted text is
    # the same under Python 2's print statement and Python 3's print function.
    # The double space reproduces the separator the comma form inserted.
    if opts.verbose:
        print("Loading OTU table:  %s" % opts.input_otu_table)
    # NOTE(review): assumes parse_biom_table consumes the handle eagerly, so
    # closing it right after parsing is safe -- confirm against biom-format.
    with open(opts.input_otu_table, 'U') as otu_fh:
        otu_table = parse_biom_table(otu_fh)
    if opts.verbose:
        print("Done loading OTU table containing %i samples and %i OTUs."
              % (len(otu_table.SampleIds), len(otu_table.ObservationIds)))

    if opts.input_count_table is None:
        if opts.type_of_prediction == 'KO':
            input_count_table = join(get_picrust_project_dir(), 'picrust',
                                     'data', 'ko_precalculated.biom.gz')
        elif opts.type_of_prediction == 'COG':
            input_count_table = join(get_picrust_project_dir(), 'picrust',
                                     'data', 'cog_precalculated.biom.gz')
        else:
            # Previously this fell through and failed later with an
            # unbound-local error on input_count_table.
            raise ValueError(
                "No count table given and no precalculated table is known "
                "for prediction type %r" % (opts.type_of_prediction,))
    else:
        input_count_table = opts.input_count_table

    if opts.verbose:
        print("Loading trait table:  %s" % input_count_table)

    ext = path.splitext(input_count_table)[1]
    if ext == '.gz':
        with closing(gzip.open(input_count_table, 'rb')) as table_fh:
            genome_table_str = table_fh.read()
    else:
        with open(input_count_table, 'U') as table_fh:
            genome_table_str = table_fh.read()

    # In the genome/trait table genomes are the samples and
    # genes are the observations.
    if not opts.suppress_subset_loading:
        # Use the OTU table information to load only the entries of the
        # count table corresponding to relevant OTUs.
        ids_to_load = otu_table.ObservationIds
        if opts.verbose:
            print("Loading traits for %i organisms from the trait table"
                  % len(ids_to_load))
        genome_table = load_subset_from_biom_str(genome_table_str,
                                                 ids_to_load, axis='samples')
    else:
        if opts.verbose:
            print("Loading *full* trait table because --suppress_subset_loading was passed. This may result in high memory usage.")
        genome_table = parse_biom_table(genome_table_str)

    if opts.verbose:
        print("Done loading trait table containing %i functions for %i organisms."
              % (len(genome_table.ObservationIds), len(genome_table.SampleIds)))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics: element 0 of the result holds the
        # samples, element 1 the matching NSTI values.
        weighted_nsti = calc_nsti(otu_table, genome_table, weighted=True)
        samples = weighted_nsti[0]
        nstis = list(weighted_nsti[1])

        lines = ["#Sample\tMetric\tValue\n"]
        for sample, nsti in zip(samples, nstis):
            lines.append("%s\tWeighted NSTI\t%s\n" % (sample, str(nsti)))
        # The report is written (and echoed) in sorted order, header included.
        report_lines = sorted(lines)

        if opts.verbose:
            for report_line in report_lines:
                print(report_line)
        if opts.verbose:
            print("Writing accuracy information to file: %s"
                  % opts.accuracy_metrics)
        with open(opts.accuracy_metrics, 'w') as metrics_fh:
            metrics_fh.writelines(report_lines)

    if opts.verbose:
        print("Predicting the metagenome...")
    predicted_metagenomes = predict_metagenomes(otu_table, genome_table)

    if opts.verbose:
        print("Writing results to output file:  %s"
              % opts.output_metagenome_table)
    # NOTE(review): repeats the earlier call; kept because the helper's
    # behavior is not visible from here.
    make_output_dir_for_file(opts.output_metagenome_table)

    # Render first, then open: a formatting failure no longer leaves a
    # truncated output file behind.
    if opts.format_tab_delimited:
        output_text = predicted_metagenomes.delimitedSelf(
            header_key="KEGG Pathways",
            header_value="KEGG Pathways",
            # The formatter joins each inner sequence with '; ' and the
            # resulting strings with '|'.
            metadata_formatter=lambda s: '|'.join(['; '.join(l) for l in s]))
    else:
        output_text = format_biom_table(predicted_metagenomes)
    with open(opts.output_metagenome_table, 'w') as out_fh:
        out_fh.write(output_text)