def setUp(self): """define some top-level data""" #self.col_header=['Sample1', 'Sample2'] #self.row_header=['OTU1','OTU2'] #self.otu_table=array([[0,0],[1,5]]) #self.lineages=[['Bacteria'],['Archaea']] #self.data={} #self.data['otu_counts']=self.col_header,self.row_header,self.otu_table,\ # self.lineages self.output_dir = '/tmp/' otu_table_vals = array([[0, 0], [1, 5]]) #{(0,0):0.0,(1,0):1.0, (1,1):5.0} self.otu_table = table_factory(otu_table_vals, ['Sample1', 'Sample2'], ['OTU1', 'OTU2'], [None, None], [{ "taxonomy": ["Bacteria"] }, { "taxonomy": ["Archaea"] }]) filt_otu_table_vals = array([[1, 5]]) #{(0,0):1.0, (0,1):5.0} self.filt_otu_table = table_factory(filt_otu_table_vals, ['Sample1', 'Sample2'], ['OTU2'], [None, None], [{ "taxonomy": ["Archaea"] }]) self.num_otu_hits = 5 self._folders_to_cleanup = []
def setUp(self): """define some top-level data""" #self.col_header=['Sample1', 'Sample2'] #self.row_header=['OTU1','OTU2'] #self.otu_table=array([[0,0],[1,5]]) #self.lineages=[['Bacteria'],['Archaea']] #self.data={} #self.data['otu_counts']=self.col_header,self.row_header,self.otu_table,\ # self.lineages self.output_dir='/tmp/' otu_table_vals = array([[0,0],[1,5]]) #{(0,0):0.0,(1,0):1.0, (1,1):5.0} self.otu_table = table_factory(otu_table_vals, ['Sample1', 'Sample2'], ['OTU1', 'OTU2'], [None, None], [{"taxonomy": ["Bacteria"]}, {"taxonomy": ["Archaea"]}]) filt_otu_table_vals = array([[1,5]]) #{(0,0):1.0, (0,1):5.0} self.filt_otu_table = table_factory(filt_otu_table_vals, ['Sample1', 'Sample2'], ['OTU2'], [None, None], [{"taxonomy": ["Archaea"]}]) self.num_otu_hits=5 self._folders_to_cleanup=[]
def test_get_log_transform(self):
    """get_log_transform log-transforms counts, leaving zeros at zero"""
    orig_data = array([[0, 1, 2], [1000, 0, 0]])
    orig_otu_table = table_factory(orig_data,
                                   ['Sample1', 'Sample2', 'Sample3'],
                                   ['OTU1', 'OTU2'],
                                   [None, None, None],
                                   [{"taxonomy": ["Bacteria"]},
                                    {"taxonomy": ["Archaea"]}])
    # NOTE(review): expected values look like ln(2*x) for nonzero x, i.e.
    # ln(x/eps) with eps = half the smallest nonzero count -- confirm
    # against get_log_transform before changing this fixture
    exp_data = array([[0, 0.69314718, 1.38629436], [7.60090246, 0, 0]])
    exp_otu_table = table_factory(exp_data,
                                  ['Sample1', 'Sample2', 'Sample3'],
                                  ['OTU1', 'OTU2'],
                                  [None, None, None],
                                  [{"taxonomy": ["Bacteria"]},
                                   {"taxonomy": ["Archaea"]}])
    log_otu_table = get_log_transform(orig_otu_table, eps=None)
    # comparing directly log_otu_table against exp_otu_table doesn't work,
    # needs to be modified in the otu table object
    self.assertFloatEqual(list(log_otu_table.iterSampleData()),
                          list(exp_otu_table.iterSampleData()))
def parse_biom_taxon_table(json_table, constructor=None, data_pump=None):
    """Parse a biom taxon table

    Constructor must have a _biom_type of "taxon table"

    json_table: dict decoded from a BIOM-format JSON document
    data_pump: optional data to use in place of json_table['data']
    """
    mat_type = json_table['matrix_type']
    table_type = 'taxon table'
    constructors = [SparseTaxonTable, DenseTaxonTable]
    # choose a concrete constructor compatible with the matrix type unless
    # the caller supplied one explicitly
    constructor = pick_constructor(mat_type, table_type, constructor,
                                   constructors)
    sample_ids = [col['id'] for col in json_table['columns']]
    sample_metadata = [col['metadata'] for col in json_table['columns']]
    obs_ids = [row['id'] for row in json_table['rows']]
    obs_metadata = [row['metadata'] for row in json_table['rows']]
    # look up the element type for the parsed values (e.g. int/float)
    dtype = MATRIX_ELEMENT_TYPE[json_table['matrix_element_type']]
    if data_pump is None:
        table_obj = table_factory(json_table['data'], sample_ids, obs_ids,
                                  sample_metadata, obs_metadata,
                                  constructor=constructor,
                                  shape=json_table['shape'],
                                  dtype=dtype)
    else:
        table_obj = table_factory(data_pump, sample_ids, obs_ids,
                                  sample_metadata, obs_metadata,
                                  constructor=constructor,
                                  shape=json_table['shape'],
                                  dtype=dtype)
    return table_obj
def test_verify_subset(self):
    """verify_subset: table sample ids must be a subset of metadata ids"""
    metadata = [('a', 'other stuff\tfoo'),
                ('b', 'asdasdasd'),
                ('c', '123123123')]
    # all table samples present in metadata -> True
    table = table_factory(array([[1, 2, 3], [4, 5, 6]]),
                          ['a', 'b', 'c'],
                          ['x', 'y'])
    self.assertTrue(verify_subset(table, metadata))
    # a proper subset of the metadata ids -> still True
    table = table_factory(array([[1, 2], [3, 4]]),
                          ['a', 'b'],
                          ['x', 'y'])
    self.assertTrue(verify_subset(table, metadata))
    # sample 'x' is not in metadata -> False
    table = table_factory(array([[1, 2, 3], [4, 5, 6]]),
                          ['a', 'b', 'x'],
                          ['x', 'y'])
    self.assertFalse(verify_subset(table, metadata))
def setUp(self):
    """Define some test data."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []
    # fall back to /tmp/ when no temp_dir is configured
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    if not exists(self.tmp_dir):
        makedirs(self.tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(self.tmp_dir)
    # 4 OTUs (a-d) x 3 samples (X-Z); data is given samples-by-OTUs,
    # hence the transpose
    self.otu_table1 = table_factory(data=array([[2, 0, 0, 1],
                                                [1, 1, 1, 1],
                                                [0, 0, 0, 0]]).T,
                                    sample_ids=list('XYZ'),
                                    observation_ids=list('abcd'),
                                    constructor=DenseOTUTable)
    # mkstemp returns an open descriptor; close it, we only need the path
    fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir,
                                     prefix='alpha_diversity_tests',
                                     suffix='.biom')
    close(fd)
    open(self.otu_table1_fp, 'w').write(
        format_biom_table(self.otu_table1))
    # same data, but one observation id ('d_') needs quoting in newick
    self.otu_table2 = table_factory(data=array([[2, 0, 0, 1],
                                                [1, 1, 1, 1],
                                                [0, 0, 0, 0]]).T,
                                    sample_ids=list('XYZ'),
                                    observation_ids=['a', 'b', 'c', 'd_'],
                                    constructor=DenseOTUTable)
    fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir,
                                     prefix='alpha_diversity_tests',
                                     suffix='.biom')
    close(fd)
    open(self.otu_table2_fp, 'w').write(
        format_biom_table(self.otu_table2))
    # single-sample variant of the same OTUs
    self.single_sample_otu_table = table_factory(
        data=array([[2, 0, 0, 1]]).T,
        sample_ids=list('X'),
        observation_ids=list('abcd'),
        constructor=DenseOTUTable)
    fd, self.single_sample_otu_table_fp = mkstemp(
        dir=self.tmp_dir,
        prefix='alpha_diversity_tests',
        suffix='.biom')
    close(fd)
    open(self.single_sample_otu_table_fp, 'w').write(
        format_biom_table(self.single_sample_otu_table))
    # trees whose tips match the observation ids of the two tables
    self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")
    self.files_to_remove = [self.otu_table1_fp,
                            self.otu_table2_fp,
                            self.single_sample_otu_table_fp]
def setUp(self):
    """Define some test data."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []
    # fall back to /tmp/ when no temp_dir is configured
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    if not exists(self.tmp_dir):
        makedirs(self.tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(self.tmp_dir)
    # 4 OTUs (a-d) x 3 samples (X-Z); data given samples-by-OTUs, so .T
    self.otu_table1 = table_factory(data=array([[2, 0, 0, 1],
                                                [1, 1, 1, 1],
                                                [0, 0, 0, 0]]).T,
                                    sample_ids=list('XYZ'),
                                    observation_ids=list('abcd'),
                                    constructor=DenseOTUTable)
    self.otu_table1_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                          prefix='alpha_diversity_tests',
                                          suffix='.biom',
                                          result_constructor=str)
    open(self.otu_table1_fp, 'w').write(
        format_biom_table(self.otu_table1))
    # same data, but observation id 'd_' requires quoting in newick
    self.otu_table2 = table_factory(data=array([[2, 0, 0, 1],
                                                [1, 1, 1, 1],
                                                [0, 0, 0, 0]]).T,
                                    sample_ids=list('XYZ'),
                                    observation_ids=['a', 'b', 'c', 'd_'],
                                    constructor=DenseOTUTable)
    self.otu_table2_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                          prefix='alpha_diversity_tests',
                                          suffix='.biom',
                                          result_constructor=str)
    open(self.otu_table2_fp, 'w').write(
        format_biom_table(self.otu_table2))
    # single-sample variant of the same OTUs
    self.single_sample_otu_table = table_factory(
        data=array([[2, 0, 0, 1]]).T,
        sample_ids=list('X'),
        observation_ids=list('abcd'),
        constructor=DenseOTUTable)
    self.single_sample_otu_table_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir,
        prefix='alpha_diversity_tests',
        suffix='.biom',
        result_constructor=str)
    open(self.single_sample_otu_table_fp, 'w').write(
        format_biom_table(self.single_sample_otu_table))
    # trees whose tips match the observation ids of the two tables
    self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")
    self.files_to_remove = [self.otu_table1_fp,
                            self.otu_table2_fp,
                            self.single_sample_otu_table_fp]
def setUp(self):
    """define some top-level data"""
    # 3 OTUs x 6 samples. Sparse equivalent kept as a comment below; the
    # original code evaluated this dict as a bare, discarded expression
    # (a no-op statement), which is removed here:
    # {(0, 2): 9.0, (0, 3): 5.0, (0, 4): 3.0, (0, 5): 1.0,
    #  (1, 0): 1.0, (1, 1): 5.0, (1, 2): 4.0, (1, 4): 3.0, (1, 5): 2.0,
    #  (2, 0): 2.0, (2, 1): 3.0, (2, 2): 1.0, (2, 3): 1.0, (2, 4): 2.0,
    #  (2, 5): 5.0}
    self.otu_table_values = array([[0, 0, 9, 5, 3, 1],
                                   [1, 5, 4, 0, 3, 2],
                                   [2, 3, 1, 1, 2, 5]])
    self.otu_table = table_factory(
        self.otu_table_values,
        ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'],
        ['OTU1', 'OTU2', 'OTU3'],
        [None, None, None, None, None, None],
        [{"taxonomy": ['Bacteria']},
         {"taxonomy": ['Archaea']},
         {"taxonomy": ['Streptococcus']}])
    # same counts, but with full multi-level lineages attached
    self.otu_table_f = table_factory(
        self.otu_table_values,
        ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'],
        ['OTU1', 'OTU2', 'OTU3'],
        [None, None, None, None, None, None],
        [{"taxonomy": ['1A', '1B', '1C', 'Bacteria']},
         {"taxonomy": ['2A', '2B', '2C', 'Archaea']},
         {"taxonomy": ['3A', '3B', '3C', 'Streptococcus']}])
    self.full_lineages = [['1A', '1B', '1C', 'Bacteria'],
                          ['2A', '2B', '2C', 'Archaea'],
                          ['3A', '3B', '3C', 'Streptococcus']]
    # mapping data: rows, header, comments
    self.metadata = [[['Sample1', 'NA', 'A'],
                      ['Sample2', 'NA', 'B'],
                      ['Sample3', 'NA', 'A'],
                      ['Sample4', 'NA', 'B'],
                      ['Sample5', 'NA', 'A'],
                      ['Sample6', 'NA', 'B']],
                     ['SampleID', 'CAT1', 'CAT2'], []]
    self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
    # temp file for heatmap output; close the fd, keep the path
    fh, self.tmp_heatmap_fpath = mkstemp(prefix='test_heatmap_',
                                         suffix='.pdf')
    close(fh)
def test_sample_mapping_to_biom_table(self):
    """sample_mapping_to_biom_table works"""
    lines = self.SampleMapping
    actual = sample_mapping_to_biom_table(lines)
    # expected: 2 OTUs x 3 samples
    exp = table_factory(array([[3., 0., 2.], [1., 2., 0.]]),
                        ['sample1', 'sample2', 'sample3'],
                        ['OTU1', 'OTU2'])
    # sort before comparing: sample order depends on input order
    self.assertEqual(actual.sortBySampleId(), exp.sortBySampleId())
    # non-MIENS-compliant sample ids are translated (note the '.' in the
    # expected ids)
    lines = self.SampleMappingNoMIENS
    actual = sample_mapping_to_biom_table(lines)
    exp = table_factory(array([[3., 0., 2.], [1., 2., 0.]]),
                        ['sample.1', 'sample.2', 'sample.3'],
                        ['OTU1', 'OTU2'])
    self.assertEqual(actual.sortBySampleId(), exp.sortBySampleId())
def test_sample_mapping_to_biom_table(self):
    """sample_mapping_to_biom_table works"""
    lines = self.SampleMapping
    actual = sample_mapping_to_biom_table(lines)
    # expected: 2 OTUs x 3 samples
    exp = table_factory(array([[3.,0.,2.],[1.,2.,0.]]),
                        ['sample1','sample2','sample3'],
                        ['OTU1','OTU2'])
    # sort before comparing: sample order depends on input order
    self.assertEqual(actual.sortBySampleId(), exp.sortBySampleId())
    # non-MIENS-compliant ids get translated (expected ids contain '.')
    lines = self.SampleMappingNoMIENS
    actual = sample_mapping_to_biom_table(lines)
    exp = table_factory(array([[3.,0.,2.],[1.,2.,0.]]),
                        ['sample.1','sample.2','sample.3'],
                        ['OTU1','OTU2'])
    self.assertEqual(actual.sortBySampleId(), exp.sortBySampleId())
def setUp(self):
    """Create OTU tables and temp files used by the rarefaction tests."""
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    # 4 taxa (bacd) x 3 samples (YXZ)
    self.otu_table_data = numpy.array([[2, 1, 0],
                                       [0, 5, 0],
                                       [0, 3, 0],
                                       [1, 2, 0]])
    self.sample_names = list('YXZ')
    self.taxon_names = list('bacd')
    self.otu_metadata = [{'domain': 'Archaea'},
                         {'domain': 'Bacteria'},
                         {'domain': 'Bacteria'},
                         {'domain': 'Bacteria'}]
    self.otu_table = table_factory(self.otu_table_data,
                                   self.sample_names,
                                   self.taxon_names)
    # same table, but with per-observation metadata attached
    self.otu_table_meta = table_factory(
        self.otu_table_data, self.sample_names, self.taxon_names,
        observation_metadata=self.otu_metadata)
    self.otu_table_str = format_biom_table(self.otu_table)
    self.otu_table_meta_str = format_biom_table(self.otu_table_meta)
    # mkstemp returns an open fd; close it right away, we only need paths
    _, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                   prefix='test_rarefaction',
                                   suffix='.biom')
    close(_)
    _, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                        prefix='test_rarefaction',
                                        suffix='.biom')
    close(_)
    self.rare_dir = mkdtemp(dir=self.tmp_dir,
                            prefix='test_rarefaction_dir',
                            suffix='')
    open(self.otu_table_fp, 'w').write(self.otu_table_str)
    open(self.otu_table_meta_fp, 'w').write(self.otu_table_meta_str)
    # registered for removal in tearDown
    self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def sample_mapping_to_biom_table(lines):
    """Converts the UniFrac sample mapping file to biom table object

    The sample mapping file is a required input for the UniFrac web
    interface. Corrects the sample ids to be MIENS compliant.

    lines: iterable of whitespace-delimited records of
        (observation_id, sample_id, count)

    Returns a biom table built from the sparse
    (observation_idx, sample_idx, count) triples.
    """
    trans_table = build_sample_ids_transtable()
    data = []
    sample_ids = []
    observation_ids = []
    # id -> index maps replace the original list.index() scans, which made
    # the loop accidentally O(n^2); order of first appearance is preserved
    sample_index = {}
    observation_index = {}
    for line in lines:
        fields = line.strip().split()
        observation_id = fields[0]
        # correct the sample id to be MIENS compliant
        sample_id = fields[1].translate(trans_table)
        count = float(fields[2])
        if sample_id in sample_index:
            sample_idx = sample_index[sample_id]
        else:
            sample_idx = sample_index[sample_id] = len(sample_ids)
            sample_ids.append(sample_id)
        if observation_id in observation_index:
            observation_idx = observation_index[observation_id]
        else:
            observation_idx = observation_index[observation_id] = \
                len(observation_ids)
            observation_ids.append(observation_id)
        data.append([observation_idx, sample_idx, count])
    return table_factory(data, sample_ids, observation_ids)
def table_from_template(new_data, sample_ids, observation_ids,
                        sample_metadata_source=None,
                        observation_metadata_source=None,
                        constructor=SparseGeneTable, verbose=False):
    """Build a new BIOM table from new_data, and transfer metadata from
    1-2 existing tables

    new_data: data for the new table
    sample_ids/observation_ids: ids for the new table
    sample_metadata_source: table whose sample metadata is copied over
    observation_metadata_source: table whose observation metadata is copied
    constructor: BIOM table class to build. NOTE: previously this argument
        was accepted but ignored -- the table was always built as
        SparseGeneTable; it is now honored (default unchanged).
    verbose: passed through to transfer_metadata
    """
    # Build the BIOM table with the requested constructor
    result_table = table_factory(new_data, sample_ids, observation_ids,
                                 constructor=constructor)
    # Transfer sample metadata from the OTU table
    # to the metagenome table (samples are the same)
    if sample_metadata_source:
        result_table = transfer_metadata(
            sample_metadata_source, result_table,
            donor_metadata_type='SampleMetadata',
            recipient_metadata_type='SampleMetadata',
            verbose=verbose)
    # Now transfer observation metadata (e.g. gene metadata)
    # from the genome table to the result table
    if observation_metadata_source:
        result_table = transfer_metadata(
            observation_metadata_source, result_table,
            donor_metadata_type='ObservationMetadata',
            recipient_metadata_type='ObservationMetadata',
            verbose=verbose)
    return result_table
def setUp(self):
    """Build the OTU table and mapping fixtures used by the tests."""
    # 4 OTUs x 4 samples. Sparse form kept as a comment below; the
    # original code evaluated this dict as a bare, discarded expression
    # (a no-op statement), which is removed here:
    # {(0, 0): 1.0, (0, 2): 2.0, (0, 3): 4.0,
    #  (1, 0): 1.0, (1, 1): 2.0, (1, 3): 1.0,
    #  (2, 1): 1.0, (2, 2): 1.0,
    #  (3, 0): 1.0, (3, 1): 2.0, (3, 2): 1.0}
    self.otu_table_vals = array([[1, 0, 2, 4],
                                 [1, 2, 0, 1],
                                 [0, 1, 1, 0],
                                 [1, 2, 1, 0]])
    self.otu_table = table_factory(
        self.otu_table_vals,
        ['s1', 's2', 's3', 's4'],
        ['0', '1', '2', '3'],
        None,
        [{"taxonomy": ["Root", "Bacteria", "Actinobacteria",
                       "Actinobacteria", "Coriobacteridae",
                       "Coriobacteriales", "Coriobacterineae",
                       "Coriobacteriaceae"]},
         {"taxonomy": ["Root", "Bacteria", "Firmicutes", "\"Clostridia\""]},
         {"taxonomy": ["Root", "Bacteria", "Firmicutes", "\"Clostridia\""]},
         {"taxonomy": ["Root", "Bacteria"]}])
    # Classic-format rendering of the table above, kept for reference:
    # self.otu_table="""#Full OTU Counts
    ##OTU ID\ts1\ts2\ts3\ts4\tConsensus Lineage
    #0\t1\t0\t2\t4\tRoot;Bacteria;Actinobacteria;Actinobacteria;Coriobacteridae;Coriobacteriales;Coriobacterineae;Coriobacteriaceae
    #1\t1\t2\t0\t1\tRoot;Bacteria;Firmicutes;"Clostridia"
    #2\t0\t1\t1\t0\tRoot;Bacteria;Firmicutes;"Clostridia"
    #3\t1\t2\t1\t0\tRoot;Bacteria""".split('\n')
    self.mapping = """#SampleID\tBarcodeSequence\tTreatment\tDescription
#Test mapping file
s1\tAAAA\tControl\tControl mouse, I.D. 354
s2\tGGGG\tControl\tControl mouse, I.D. 355
s3\tCCCC\tExp\tDisease mouse, I.D. 356
s4\tTTTT\tExp\tDisease mouse, I.D. 357""".split('\n')
def setUp(self): """Set up files/environment that will be used by the tests.""" # The prefix to use for temporary files. This prefix may be added to, # but all temp dirs and files created by the tests will have this # prefix at a minimum. self.prefix = 'most_wanted_otus_tests_' self.files_to_remove = [] self.dirs_to_remove = [] self.output_dir = mkdtemp(prefix='%soutput_dir_' % self.prefix) self.dirs_to_remove.append(self.output_dir) self.grouping_category = 'Environment' self.top_n = 100 self.blast_results_lines = blast_results.split('\n') self.blast_results_dupes_lines = blast_results_dupes.split('\n') self.rep_set_lines = rep_set.split('\n') self.top_n_mw = [('a', 'gi|7|emb|T51700.1|', 87.0), ('b', 'gi|8|emb|Z700.1|', 89.5)] self.mw_seqs = {'b':'AAGGTT', 'a':'AGT'} self.master_otu_table_ms = table_factory( array([[1.0, 2.0], [2.0, 5.0]]), ['Env1', 'Env2'], ['a', 'b'], sample_metadata=None, observation_metadata=[{'taxonomy':'foo;bar;baz'}, {'taxonomy':'foo;baz;bar'}], table_id=None, constructor=SparseOTUTable)
def parse_classic_table_to_rich_table(lines, sample_mapping, obs_mapping,
                                      process_func, **kwargs):
    """Parses an table (tab delimited) (observation x sample)

    sample_mapping : can be None or {'sample_id':something}
    obs_mapping : can be none or {'observation_id':something}
    process_func : applied to each value of the table's own observation
        metadata column (only used when the table has one)
    """
    sample_ids, obs_ids, data, t_md, t_md_name = parse_classic_table(
        lines, **kwargs)

    # if we have it, keep it
    if t_md is None:
        obs_metadata = None
    else:
        obs_metadata = [{t_md_name: process_func(v)} for v in t_md]

    if sample_mapping is None:
        sample_metadata = None
    else:
        sample_metadata = [sample_mapping[sample_id]
                           for sample_id in sample_ids]

    # will override any metadata from parsed table
    if obs_mapping is not None:
        obs_metadata = [obs_mapping[obs_id] for obs_id in obs_ids]

    # convert the dense parsed matrix to the sparse representation
    data = nparray_to_sparseobj(data)

    return table_factory(data, sample_ids, obs_ids, sample_metadata,
                         obs_metadata)
def combine_tables(tables):
    """Combines multiple biom tables into a single table, discarding any
    non-shared OTUs.
    """
    # flatten all sample ids across tables, preserving table order
    samples = [sample
               for sample_list in [table.SampleIds for table in tables]
               for sample in sample_list]
    # indices of every occurrence of a repeated sample id EXCEPT its first:
    # group (index, id) pairs by id, then drop the first index per group
    duplicate_sample_indices = [
        index
        for indices in [[index for index, value in indices][1:]
                        for sample, indices in
                        groupby(sorted(enumerate(samples),
                                       key=lambda x: x[1]),
                                lambda x: x[1])]
        for index in indices]
    # concatenate each OTU's values across all tables, in table order
    otu_data = dict()
    for table in tables:
        for vals, otu, md in table.iterObservations():
            if otu_data.get(otu) is not None:
                otu_data[otu] = append(otu_data[otu], vals)
            else:
                otu_data[otu] = vals
    # an OTU is shared iff it accumulated a value for every sample
    otus = [otu for otu in otu_data if len(otu_data[otu]) == len(samples)]
    if not otus:
        raise ValueError('No shared OTUs')
    # drop duplicate-sample columns from both the data and the sample list
    data = [array([v for i, v in enumerate(otu_data[otu])
                   if i not in duplicate_sample_indices])
            for otu in otus]
    samples = [v for i, v in enumerate(samples)
               if i not in duplicate_sample_indices]
    return table_factory(data, samples, otus, constructor=SparseOTUTable)
def setUp(self): """Set up files/environment that will be used by the tests.""" # The prefix to use for temporary files. This prefix may be added to, # but all temp dirs and files created by the tests will have this # prefix at a minimum. self.prefix = 'most_wanted_otus_tests_' self.files_to_remove = [] self.dirs_to_remove = [] self.output_dir = mkdtemp(prefix='%soutput_dir_' % self.prefix) self.dirs_to_remove.append(self.output_dir) self.grouping_category = 'Environment' self.top_n = 100 self.blast_results_lines = blast_results.split('\n') self.blast_results_dupes_lines = blast_results_dupes.split('\n') self.rep_set_lines = rep_set.split('\n') self.top_n_mw = [('a', 'gi|7|emb|T51700.1|', 87.0), ('b', 'gi|8|emb|Z700.1|', 89.5)] self.mw_seqs = {'b': 'AAGGTT', 'a': 'AGT'} self.master_otu_table_ms = table_factory(array([[1.0, 2.0], [2.0, 5.0]]), ['Env1', 'Env2'], ['a', 'b'], sample_metadata=None, observation_metadata=[{ 'taxonomy': 'foo;bar;baz' }, { 'taxonomy': 'foo;baz;bar' }], table_id=None, constructor=SparseOTUTable)
def make_new_otu_counts(otu_table, sample_to_subtract, samples_from_subject):
    """make the converted otu table

    sample_to_subtract: maps each sample id to the sample id whose counts
        should be subtracted from it (its timepoint-zero sample)
    samples_from_subject: maps each sample id to all sample ids belonging
        to the same subject

    Raises ValueError if a mapped sample id is missing from the otu table.
    """
    new_sample_ids = sample_to_subtract.keys()
    new_sample_ids.sort()
    new_otu_counts = zeros([len(otu_table.ObservationIds),
                            len(new_sample_ids)])
    for index1, otu in enumerate(otu_table.ObservationIds):
        for index2, sample in enumerate(new_sample_ids):
            tpz_sample = sample_to_subtract[sample]
            if tpz_sample in otu_table.SampleIds:
                tpz_sample_index = otu_table.SampleIds.index(tpz_sample)
            else:
                raise ValueError("There are samples in the category mapping file that are not in the otu table, such as sample: " + tpz_sample + ". Removing these samples from the category mapping file will allow you to proceed.")
            # get the new count as the relative abundance of the otu at
            # the later timepoint minus the relative abundance at timepoint
            # zero
            old_sample_index = otu_table.SampleIds.index(sample)
            new_count = otu_table[index1, old_sample_index] - \
                otu_table[index1, tpz_sample_index]
            # make sure that the count is not zero across all of the
            # subject's samples
            has_nonzeros = False
            subject_sample_ids = samples_from_subject[sample]
            for i in subject_sample_ids:
                sample_index = otu_table.SampleIds.index(i)
                if otu_table[index1, sample_index] > 0:
                    has_nonzeros = True
            if has_nonzeros:
                new_otu_counts[index1, index2] = new_count
            else:
                # sentinel flagging an OTU with no counts in any of the
                # subject's samples -- presumably filtered downstream
                new_otu_counts[index1, index2] = 999999999
    return table_factory(new_otu_counts, new_sample_ids,
                         otu_table.ObservationIds,
                         observation_metadata=otu_table.ObservationMetadata)
def merge_otu_tables(vcf_fps): """Takes a list of multiple vcf files and returns a single biom table of all files.""" master_table = None #open all of the files with correct extensions. Raise a value error if incorrect extension master_observation_ids = None for vcf_fp in vcf_fps: if vcf_fp.endswith('gz'): vcf_fp = gzip.open(vcf_fp) elif vcf_fp.endswith('vcf'): vcf_fp = open(vcf_fp, 'U') else: raise ValueError, "Invalid file format or extension, only '.vcf' or '.vcf.gz'\ are accepted" data, sample_ids, observation_ids, sample_md, observation_md =\ create_biom_table(vcf_fp) if master_observation_ids is None: master_observation_ids = observation_ids else: master_observation_ids = set(master_observation_ids) & set(observation_ids) biom_table = table_factory(data, sample_ids, observation_ids, sample_md, observation_md, constructor=SparseOTUTable) if master_table is None: master_table = biom_table else: master_table.merge(biom_table) # try: # master_table = master_table.merge(biom_table) # except AttributeError: # master_table = biom_table return master_table, observation_ids
def format_summarize_taxa(summary, header, delimiter=';',
                          file_format='classic'):
    """Formats a summarized taxonomy table for output

    Yields classic tab-delimited lines, or a single BIOM-format JSON
    string, depending on file_format.
    """
    if file_format == 'classic':
        yield '\t'.join(header) + "\n"
        for row in summary:
            taxon_levels, counts = row[0], row[1:]
            # collapse the taxon tuple into one delimited name,
            # e.g. foo;bar;foobar, then append the per-sample counts
            cells = [delimiter.join(taxon_levels)]
            cells.extend(str(c) for c in counts)
            yield '\t'.join(cells) + "\n"
    elif file_format == 'biom':
        # Skip 'Taxon' or 'SampleId' label in first column.
        sample_ids = header[1:]
        observation_ids = [delimiter.join(row[0]) for row in summary]
        data = [row[1:] for row in summary]
        table = table_factory(asarray(data), sample_ids, observation_ids,
                              constructor=SparseTaxonTable)
        yield format_biom_table(table)
    else:
        raise ValueError("Invalid file format '%s'. Must be either 'classic' "
                         "or 'biom'." % file_format)
def simsam_range(table,
                 tree,
                 simulated_sample_sizes,
                 dissimilarities,
                 mapping_f=None):
    """Applies sim_otu_table over a range of parameters

    table: the input table to simulate samples from
    tree: tree related OTUs in input table
    simulated_sample_sizes: a list of ints defining how many
     output samples should be create per input sample
    dissimilarities: a list of floats containing the
     dissimilarities to use in simulating tables
    mapping_f: file handle for metadata mapping file, if a mapping
     file should be created with the samples from each simulated
     table

    This function will yield tuples with the following form:
     (output table, output mapping lines, simulated_sample_size,
      dissimilarity)

    If the user does not provide mapping_f, the tuples will look like:
     (output table, None, simulated_sample_size, dissimilarity)
    """
    # identity check (was `!= None`): file handles should not be compared
    # with equality operators
    if mapping_f is not None:
        # if the user provided a mapping file, load it into
        # a list for repeated use, and define the function for
        # processing the mapping file
        mapping_lines = list(mapping_f)
        process_map = create_replicated_mapping_file
    else:
        # otherwise create a dummy function for processing the
        # mapping file so we don't have to check whether it
        # exists on every iteration
        mapping_lines = None

        def process_map(mapping_lines, simulated_sample_size, sample_ids):
            return None

    for simulated_sample_size in simulated_sample_sizes:
        # create the output mapping file data
        output_mapping_lines = \
            process_map(mapping_lines, simulated_sample_size,
                        table.SampleIds)
        for dissimilarity in dissimilarities:
            # create the simulated otu table
            output_sample_ids, output_otu_ids, output_data, \
                output_metadata = \
                sim_otu_table(table.SampleIds,
                              table.ObservationIds,
                              table.iterSamples(),
                              table.ObservationMetadata,
                              tree,
                              simulated_sample_size,
                              dissimilarity)
            output_table = table_factory(
                output_data,
                output_sample_ids,
                output_otu_ids,
                observation_metadata=output_metadata)
            yield (output_table,
                   output_mapping_lines,
                   simulated_sample_size,
                   dissimilarity)
def make_otu_table(otu_map_f, otu_to_taxonomy=None, delim='_', table_id=None, sample_metadata=None, constructor=SparseOTUTable): data, sample_ids, otu_ids = parse_otu_map(otu_map_f,delim) if otu_to_taxonomy != None: otu_metadata = [] for o in otu_ids: try: otu_metadata.append({'taxonomy':otu_to_taxonomy[o].split(';')}) except KeyError: otu_metadata.append({'taxonomy':["None"]}) else: otu_metadata = None if sample_metadata != None: raise NotImplementedError,\ "Passing of sample metadata to make_otu_table is not currently supported." try: otu_table = table_factory(data, sample_ids, otu_ids, sample_metadata=sample_metadata, observation_metadata=otu_metadata, table_id=table_id, constructor=constructor, dtype=int) except ValueError,e: raise ValueError,\ ("Couldn't create OTU table. Is your OTU map empty?" " Original error message: %s" % (str(e)))
def predict_metagenomes(otu_table, genome_table, verbose=False):
    """ predict metagenomes from otu table and genome table

    otu_table: biom table of OTU counts per sample
    genome_table: biom table of gene counts per OTU
    verbose: passed through to transfer_metadata
    """
    otu_data, genome_data, overlapping_otus = extract_otu_and_genome_data(
        otu_table, genome_table)
    # matrix multiplication to get the predicted metagenomes
    new_data = dot(array(otu_data).T, array(genome_data)).T
    # Round counts to nearest whole numbers
    new_data = around(new_data)
    # return the result as a sparse biom table - the sample ids are now the
    # sample ids from the otu table, and the observation ids are now the
    # functions (i.e., observations) from the genome table
    result_table = table_factory(new_data,
                                 otu_table.SampleIds,
                                 genome_table.ObservationIds,
                                 constructor=SparseGeneTable)
    # We need to preserve metadata about the samples from the OTU table,
    # and metadata about the gene functions from the genome table.
    # Transfer sample metadata from the OTU table
    # to the metagenome table (samples are the same)
    result_table = transfer_metadata(
        otu_table, result_table,
        donor_metadata_type='SampleMetadata',
        recipient_metadata_type='SampleMetadata',
        verbose=verbose)
    # Now transfer observation metadata (e.g. gene metadata)
    # from the genome table to the result table
    result_table = transfer_metadata(
        genome_table, result_table,
        donor_metadata_type='ObservationMetadata',
        recipient_metadata_type='ObservationMetadata',
        verbose=verbose)
    return result_table
def transpose_biom(table): #files must be in dense format if not table.__class__.__name__.startswith("Dense"): raise ValueError, "Only 'Dense' biom type tables can be compared. Please convert and try again." return table_factory(table._data.T, table.ObservationIds, table.SampleIds, constructor=DenseOTUTable)
def main():
    """Normalize an OTU table by per-OTU copy number (script entry point)."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    input_ext = path.splitext(opts.input_otu_fp)[1]
    # load the OTU table: classic tab-delimited (-f) or BIOM JSON
    if opts.input_format_classic:
        otu_table = parse_classic_table_to_rich_table(
            open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable)
    else:
        if input_ext != '.biom':
            sys.stderr.write("\nOTU table does not have '.biom' extension! If loading causes error consider using '-f' option to load tab-delimited OTU table!\n\n")
        otu_table = parse_biom_table(open(opts.input_otu_fp, 'U'))
    # the copy-number count table may be gzipped
    ext = path.splitext(opts.input_count_fp)[1]
    if (ext == '.gz'):
        count_table = parse_biom_table(gzip.open(opts.input_count_fp, 'rb'))
    else:
        count_table = parse_biom_table(open(opts.input_count_fp, 'U'))
    # Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iterObservations():
        # x is (values, observation_id, metadata)
        ids.append(str(x[1]))
    ob_id = count_table.ObservationIds[0]
    # keep only OTUs that have an entry in the count table
    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))
    # filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table = table_factory(filtered_values,
                                       otu_table.SampleIds,
                                       filtered_otus,
                                       constructor=DenseOTUTable)
    # attach each OTU's copy number as observation metadata
    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))
        except ValueError:
            raise ValueError,\
                "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."
        copy_numbers_filtered[x] = {opts.metadata_identifer: value}
    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)
    # divide each OTU's counts by its copy number
    normalized_table = filtered_otu_table.normObservationByMetadata(
        opts.metadata_identifer)
    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp, 'w').write(
        normalized_table.getBiomFormatJsonString('PICRUST'))
def setUp(self):
    """Define some test data."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []
    # fall back to /tmp/ when no temp_dir is configured
    self.tmp_dir = self.qiime_config["temp_dir"] or "/tmp/"
    if not exists(self.tmp_dir):
        makedirs(self.tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(self.tmp_dir)
    # 4 OTUs (a-d) x 3 samples (X-Z); data given samples-by-OTUs, so .T
    self.otu_table1 = table_factory(
        data=array([[2, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]]).T,
        sample_ids=list("XYZ"),
        observation_ids=list("abcd"),
        constructor=DenseOTUTable,
    )
    self.otu_table1_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests",
        suffix=".biom", result_constructor=str
    )
    open(self.otu_table1_fp, "w").write(format_biom_table(self.otu_table1))
    # same data, but observation id 'd_' requires quoting in newick
    self.otu_table2 = table_factory(
        data=array([[2, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]]).T,
        sample_ids=list("XYZ"),
        observation_ids=["a", "b", "c", "d_"],
        constructor=DenseOTUTable,
    )
    self.otu_table2_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests",
        suffix=".biom", result_constructor=str
    )
    open(self.otu_table2_fp, "w").write(format_biom_table(self.otu_table2))
    # single-sample variant of the same OTUs
    self.single_sample_otu_table = table_factory(
        data=array([[2, 0, 0, 1]]).T, sample_ids=list("X"),
        observation_ids=list("abcd"), constructor=DenseOTUTable
    )
    self.single_sample_otu_table_fp = get_tmp_filename(
        tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests",
        suffix=".biom", result_constructor=str
    )
    open(self.single_sample_otu_table_fp, "w").write(
        format_biom_table(self.single_sample_otu_table))
    # trees whose tips match the observation ids of the two tables
    self.tree1 = parse_newick("((a:2,b:3):2,(c:1,d:2):7);")
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")
    self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp,
                            self.single_sample_otu_table_fp]
def biom_table_from_predictions(predictions, trait_ids):
    """Build a dense biom table (organisms x traits) from predictions.

    predictions: dict mapping organism id -> sequence of trait values
    trait_ids: observation ids for the resulting table
    """
    organism_ids = predictions.keys()
    # transpose the values so rows become traits, columns organisms
    transposed = map(list, zip(*predictions.values()))
    trait_by_organism = array(transposed, dtype=int)
    return table_factory(trait_by_organism, organism_ids, trait_ids,
                         constructor=DenseOTUTable)
def setUp(self):
    """Create OTU tables and temp files used by the rarefaction tests."""
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    # 4 taxa (bacd) x 3 samples (YXZ)
    self.otu_table_data = numpy.array([[2,1,0],
                                       [0,5,0],
                                       [0,3,0],
                                       [1,2,0]])
    self.sample_names = list('YXZ')
    self.taxon_names = list('bacd')
    self.otu_metadata = [{'domain':'Archaea'},
                         {'domain':'Bacteria'},
                         {'domain':'Bacteria'},
                         {'domain':'Bacteria'}]
    self.otu_table = table_factory(self.otu_table_data,
                                   self.sample_names,
                                   self.taxon_names)
    # same table, with per-observation metadata attached
    self.otu_table_meta = table_factory(
        self.otu_table_data, self.sample_names, self.taxon_names,
        observation_metadata=self.otu_metadata)
    self.otu_table_str = format_biom_table(self.otu_table)
    self.otu_table_meta_str = format_biom_table(self.otu_table_meta)
    self.otu_table_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                         prefix='test_rarefaction',
                                         suffix='.biom')
    self.otu_table_meta_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                              prefix='test_rarefaction',
                                              suffix='.biom')
    # get_tmp_filename only generates a name; create the dir ourselves
    self.rare_dir = get_tmp_filename(tmp_dir=self.tmp_dir,
                                     prefix='test_rarefaction_dir',
                                     suffix='',
                                     result_constructor=str)
    os.mkdir(self.rare_dir)
    open(self.otu_table_fp,'w').write(self.otu_table_str)
    open(self.otu_table_meta_fp,'w').write(self.otu_table_meta_str)
    # registered for removal in tearDown
    self._paths_to_clean_up=[self.otu_table_fp,self.otu_table_meta_fp]
    self._dirs_to_clean_up=[self.rare_dir]
def test_sample_rare_unique(self):
    """sample_rare_unique yields (sample, table, rare, unique) tuples"""
    t = update_tree(None, tax_strings_by_sample)
    tax_by_sample = {'a': tax_strings_by_sample[0],
                     'b': tax_strings_by_sample[1],
                     'c': tax_strings_by_sample[2]}
    # with no table passed, the table slot of each tuple is None
    exp = [('a', None,
            [['k__1','p__x','c__'],['k__1','p__y','c__3']],
            [['k__1','p__x','c__1'],['k__1','p__x','c__2']]),
           ('b', None,
            [['k__1','p__x','c__'],['k__1','p__y','c__3']],
            []),
           ('c', None, [], [])]
    obs = sample_rare_unique(t, None, tax_by_sample, 0.7)
    self.assertEqual(sorted(obs), exp)
    # expected per-sample filtered tables ('table' below is a
    # module-level fixture)
    table_a = table_factory(array([[14,15,16]]),
                            ['a','b','c'],
                            ['k__1; p__y; c__'])
    table_b = table_factory(array([[1,2,3],
                                   [4,5,6],
                                   [14,15,16]]),
                            ['a','b','c'],
                            ['k__1; p__x; c__1',
                             'k__1; p__x; c__2',
                             'k__1; p__y; c__'])
    table_c = table_factory(array([[1,2,3],
                                   [4,5,6],
                                   [7,8,9],
                                   [10,11,12],
                                   [14,15,16]]),
                            ['a','b','c'],
                            ['k__1; p__x; c__1',
                             'k__1; p__x; c__2',
                             'k__1; p__x; c__',
                             'k__1; p__y; c__3',
                             'k__1; p__y; c__'])
    exp = [('a', table_a,
            [['k__1','p__x','c__'],['k__1','p__y','c__3']],
            [['k__1','p__x','c__1'],['k__1','p__x','c__2']]),
           ('b', table_b,
            [['k__1','p__x','c__'],['k__1','p__y','c__3']],
            []),
           ('c', table_c, [], [])]
    obs = sample_rare_unique(t, table, tax_by_sample, 0.7)
    # compare element-wise for clearer failure messages
    for o, e in zip(sorted(obs), exp):
        self.assertEqual(o[0], e[0])
        self.assertEqual(o[1], e[1])
        self.assertEqual(o[2], e[2])
        self.assertEqual(o[3], e[3])
def setUp(self):
    """define some top-level data"""
    self.otu_table_values = array([[0, 0, 9, 5, 3, 1],
                                   [1, 5, 4, 0, 3, 2],
                                   [2, 3, 1, 1, 2, 5]])
    # Sparse view of the values above.  Fix: in the original this dict was a
    # bare expression statement -- a no-op that allocated a dict on every
    # call -- so it is kept as a comment instead:
    # {(0, 2): 9.0, (0, 3): 5.0, (0, 4): 3.0, (0, 5): 1.0,
    #  (1, 0): 1.0, (1, 1): 5.0, (1, 2): 4.0, (1, 4): 3.0, (1, 5): 2.0,
    #  (2, 0): 2.0, (2, 1): 3.0, (2, 2): 1.0, (2, 3): 1.0, (2, 4): 2.0,
    #  (2, 5): 5.0}
    self.otu_table = table_factory(self.otu_table_values,
                                   ['Sample1', 'Sample2', 'Sample3',
                                    'Sample4', 'Sample5', 'Sample6'],
                                   ['OTU1', 'OTU2', 'OTU3'],
                                   [None, None, None, None, None, None],
                                   [{"taxonomy": ['Bacteria']},
                                    {"taxonomy": ['Archaea']},
                                    {"taxonomy": ['Streptococcus']}])
    # Same table but with full (multi-level) lineages.
    self.otu_table_f = table_factory(self.otu_table_values,
                                     ['Sample1', 'Sample2', 'Sample3',
                                      'Sample4', 'Sample5', 'Sample6'],
                                     ['OTU1', 'OTU2', 'OTU3'],
                                     [None, None, None, None, None, None],
                                     [{"taxonomy": ['1A', '1B', '1C', 'Bacteria']},
                                      {"taxonomy": ['2A', '2B', '2C', 'Archaea']},
                                      {"taxonomy": ['3A', '3B', '3C', 'Streptococcus']}])
    self.full_lineages = [['1A', '1B', '1C', 'Bacteria'],
                          ['2A', '2B', '2C', 'Archaea'],
                          ['3A', '3B', '3C', 'Streptococcus']]
    self.metadata = [[['Sample1', 'NA', 'A'],
                      ['Sample2', 'NA', 'B'],
                      ['Sample3', 'NA', 'A'],
                      ['Sample4', 'NA', 'B'],
                      ['Sample5', 'NA', 'A'],
                      ['Sample6', 'NA', 'B']],
                     ['SampleID', 'CAT1', 'CAT2'],
                     []]
    self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
    self.tmp_heatmap_fpath = get_tmp_filename(prefix='test_heatmap_',
                                              suffix='.pdf')
def parse_biom_table_json(json_table, data_pump=None):
    """Parse a biom otu table type"""
    sample_ids = [col['id'] for col in json_table['columns']]
    sample_metadata = [col['metadata'] for col in json_table['columns']]
    obs_ids = [row['id'] for row in json_table['rows']]
    obs_metadata = [row['metadata'] for row in json_table['rows']]
    dtype = MATRIX_ELEMENT_TYPE[json_table['matrix_element_type']]
    # Use the caller-supplied matrix when given, else the embedded one.
    if data_pump is None:
        data = json_table['data']
    else:
        data = data_pump
    return table_factory(data, sample_ids, obs_ids, sample_metadata,
                         obs_metadata, shape=json_table['shape'],
                         dtype=dtype)
def test_generate_heatmap_plots(self): """generate_heatmap_plots: create default output files""" # create directories and move js files to verify everything works # in the script file dir_path = join(self.output_dir, 'test') create_dir(dir_path) js_dir_path = join(dir_path, 'js') create_dir(js_dir_path) self._folders_to_cleanup.append(dir_path) qiime_dir = get_qiime_project_dir() js_path = join(qiime_dir, 'qiime/support_files/js') shutil.copyfile(join(js_path, 'overlib.js'), join(js_dir_path, 'overlib.js')) shutil.copyfile(join(js_path, 'otu_count_display.js'), join(js_dir_path, 'otu_count_display.js')) shutil.copyfile(join(js_path, 'jquery.js'), join(js_dir_path, 'jquery.js')) shutil.copyfile(join(js_path, 'jquery.tablednd_0_5.js'), join(js_dir_path, 'jquery.tablednd_0_5.js')) # generate otu_table object orig_data = array([[0, 1, 2], [1000, 0, 0]]) orig_otu_table = table_factory(orig_data, ['Sample1', 'Sample2', 'Sample3'], ['OTU1', 'OTU2'], [None, None, None], [{ "taxonomy": ["Bacteria"] }, { "taxonomy": ["Archaea"] }]) # put in an OTU sort order and sample order otu_sort = ['OTU2', 'OTU1'] sample_sort = ['Sample2', 'Sample1', 'Sample3'] num_otu_hits = 3 # generate test files generate_heatmap_plots(num_otu_hits, orig_otu_table, otu_sort, sample_sort, dir_path, js_dir_path, 'test', fractional_values=False) self.assertEqual( open(join(js_dir_path, 'test.js'), 'U').read(), exp_js_output_file)
def test_get_log_transform(self):
    """get_log_transform: natural-log transform of a sparse OTU table."""
    # Legacy, pre-biom version of this test kept for reference:
    #data = array([[0,1,2],[1000,0,0]])
    #logdata = get_log_transform(data,eps=None)
    # set zeros to 1/2s
    #exp = log(array([[.5,1,2],[1000,.5,.5]]))
    # translate to 0
    #exp -= exp.min()
    #self.assertFloatEqual(logdata, exp)
    orig_data = array([[0, 1, 2], [1000, 0, 0]])
    # sparse view: {(0,1):1.0, (0,2):2,(1,0):1000.0}
    orig_otu_table = table_factory(orig_data,
                                   ['Sample1', 'Sample2', 'Sample3'],
                                   ['OTU1', 'OTU2'],
                                   [None, None, None],
                                   [{"taxonomy": ["Bacteria"]},
                                    {"taxonomy": ["Archaea"]}])
    exp_data = array([[0, 0.69314718, 1.38629436], [7.60090246, 0, 0]])
    # sparse view: {(0,1):0.69314718, (0,2):1.38629436,(1,0):7.60090246}
    exp_otu_table = table_factory(exp_data,
                                  ['Sample1', 'Sample2', 'Sample3'],
                                  ['OTU1', 'OTU2'],
                                  [None, None, None],
                                  [{"taxonomy": ["Bacteria"]},
                                   {"taxonomy": ["Archaea"]}])
    log_otu_table = get_log_transform(orig_otu_table, eps=None)
    # comparing directly log_otu_table against exp_otu_table doesn't work,
    # needs to be modified in the otu table object
    # NOTE(review): reaches into the private _data dict of the sparse table
    # implementation -- confirm this holds for the biom version in use.
    self.assertFloatEqual(log_otu_table._data.items(),
                          exp_otu_table._data.items())
def main():
    """Convert a T-RFLP text file into a BIOM-format OTU table."""
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    if not isfile(opts.input_path):
        # Parenthesized raise instead of the py2-only comma form.
        raise IOError("Input path (%s) not valid. Does it exist?"
                      % opts.input_path)

    # Fix: close both file handles deterministically (the originals were
    # never closed).
    with open(opts.input_path, 'U') as in_f:
        samples, otus, data = parse_trflp(in_f)

    t = table_factory(data, samples, otus)
    with open(opts.output_path, 'w') as output_f:
        output_f.write(format_biom_table(t))
def setUp(self):
    """Build the two small OTU tables shared by these tests."""
    self.output_dir = '/tmp/'
    # Full table: two OTUs across two samples.
    full_counts = array([[0, 0], [1, 5]])
    self.otu_table = table_factory(full_counts,
                                   ['Sample1', 'Sample2'],
                                   ['OTU1', 'OTU2'],
                                   [None, None],
                                   [{"taxonomy": ["Bacteria"]},
                                    {"taxonomy": ["Archaea"]}])
    # Filtered table: only the OTU with nonzero counts remains.
    kept_counts = array([[1, 5]])
    self.filt_otu_table = table_factory(kept_counts,
                                        ['Sample1', 'Sample2'],
                                        ['OTU2'],
                                        [None, None],
                                        [{"taxonomy": ["Archaea"]}])
    self.num_otu_hits = 5
    self._folders_to_cleanup = []
def test_get_log_transform(self):
    """get_log_transform should ln-transform nonzero counts of the table."""
    def build_table(values):
        # Input and expected tables share the same sample/OTU layout.
        return table_factory(values,
                             ['Sample1', 'Sample2', 'Sample3'],
                             ['OTU1', 'OTU2'],
                             [None, None, None],
                             [{"taxonomy": ["Bacteria"]},
                              {"taxonomy": ["Archaea"]}])

    source_table = build_table(array([[0, 1, 2], [1000, 0, 0]]))
    expected_table = build_table(array([[0, 0.69314718, 1.38629436],
                                        [7.60090246, 0, 0]]))
    transformed = get_log_transform(source_table, eps=None)
    # comparing directly log_otu_table against exp_otu_table doesn't work,
    # needs to be modified in the otu table object
    assert_almost_equal(list(transformed.iterSampleData()),
                        list(expected_table.iterSampleData()))
def setUp(self):
    """Create temp OTU table files (with and without metadata) for rarefaction tests."""
    self.qiime_config = load_qiime_config()
    # Fall back to /tmp/ when the qiime config does not define a temp dir.
    self.tmp_dir = self.qiime_config["temp_dir"] or "/tmp/"
    self.otu_table_data = numpy.array([[2, 1, 0],
                                       [0, 5, 0],
                                       [0, 3, 0],
                                       [1, 2, 0]])
    self.sample_names = list("YXZ")
    self.taxon_names = list("bacd")
    self.otu_metadata = [
        {"domain": "Archaea"},
        {"domain": "Bacteria"},
        {"domain": "Bacteria"},
        {"domain": "Bacteria"},
    ]
    self.otu_table = table_factory(self.otu_table_data, self.sample_names,
                                   self.taxon_names)
    self.otu_table_meta = table_factory(
        self.otu_table_data, self.sample_names, self.taxon_names,
        observation_metadata=self.otu_metadata
    )
    self.otu_table_str = format_biom_table(self.otu_table)
    self.otu_table_meta_str = format_biom_table(self.otu_table_meta)
    self.otu_table_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                         prefix="test_rarefaction",
                                         suffix=".biom")
    self.otu_table_meta_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                              prefix="test_rarefaction",
                                              suffix=".biom")
    self.rare_dir = get_tmp_filename(
        tmp_dir=self.tmp_dir, prefix="test_rarefaction_dir", suffix="",
        result_constructor=str
    )
    os.mkdir(self.rare_dir)
    # Fix: close the file handles deterministically; the original used
    # open(...).write(...), which leaks the handle until GC runs.
    with open(self.otu_table_fp, "w") as f:
        f.write(self.otu_table_str)
    with open(self.otu_table_meta_fp, "w") as f:
        f.write(self.otu_table_meta_str)
    self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def test_get_log_transform(self):
    """get_log_transform should ln-transform nonzero counts of the table."""
    def _make(values):
        # Both tables use the same sample ids, OTU ids, and metadata.
        return table_factory(values,
                             ['Sample1', 'Sample2', 'Sample3'],
                             ['OTU1', 'OTU2'],
                             [None, None, None],
                             [{"taxonomy": ["Bacteria"]},
                              {"taxonomy": ["Archaea"]}])

    input_table = _make(array([[0, 1, 2], [1000, 0, 0]]))
    wanted_table = _make(array([[0, 0.69314718, 1.38629436],
                                [7.60090246, 0, 0]]))
    result_table = get_log_transform(input_table, eps=None)
    # comparing directly log_otu_table against exp_otu_table doesn't work,
    # needs to be modified in the otu table object
    self.assertFloatEqual(list(result_table.iterSampleData()),
                          list(wanted_table.iterSampleData()))
def parse_biom_table_json(json_table, data_pump=None):
    """Parse a biom otu table type"""
    columns = json_table["columns"]
    rows = json_table["rows"]
    # The matrix payload comes from the caller when data_pump is given,
    # otherwise from the JSON document itself.
    payload = json_table["data"] if data_pump is None else data_pump
    return table_factory(
        payload,
        [c["id"] for c in columns],
        [r["id"] for r in rows],
        [c["metadata"] for c in columns],
        [r["metadata"] for r in rows],
        shape=json_table["shape"],
        dtype=MATRIX_ELEMENT_TYPE[json_table["matrix_element_type"]],
    )
def setUp(self):
    """Build a 4x4 OTU table and a QIIME mapping file for these tests."""
    self.otu_table_vals = array([[1, 0, 2, 4],
                                 [1, 2, 0, 1],
                                 [0, 1, 1, 0],
                                 [1, 2, 1, 0]])
    # Sparse view of the values above.  Fix: in the original this dict was a
    # bare expression statement -- a no-op that allocated a dict on every
    # call -- so it is kept as a comment instead:
    # {(0, 0): 1.0, (0, 2): 2.0, (0, 3): 4.0, (1, 0): 1.0, (1, 1): 2.0,
    #  (1, 3): 1.0, (2, 1): 1.0, (2, 2): 1.0, (3, 0): 1.0, (3, 1): 2.0,
    #  (3, 2): 1.0}
    self.otu_table = table_factory(
        self.otu_table_vals,
        ['s1', 's2', 's3', 's4'],
        ['0', '1', '2', '3'],
        None,
        [{"taxonomy": ["Root", "Bacteria", "Actinobacteria",
                       "Actinobacteria", "Coriobacteridae",
                       "Coriobacteriales", "Coriobacterineae",
                       "Coriobacteriaceae"]},
         {"taxonomy": ["Root", "Bacteria", "Firmicutes", "\"Clostridia\""]},
         {"taxonomy": ["Root", "Bacteria", "Firmicutes", "\"Clostridia\""]},
         {"taxonomy": ["Root", "Bacteria"]}])
    # Legacy text-format version of the same table, kept for reference:
    # self.otu_table="""#Full OTU Counts
    # OTU ID\ts1\ts2\ts3\ts4\tConsensus Lineage
    # 0\t1\t0\t2\t4\tRoot;Bacteria;Actinobacteria;Actinobacteria;Coriobacteridae;Coriobacteriales;Coriobacterineae;Coriobacteriaceae
    # 1\t1\t2\t0\t1\tRoot;Bacteria;Firmicutes;"Clostridia"
    # 2\t0\t1\t1\t0\tRoot;Bacteria;Firmicutes;"Clostridia"
    # 3\t1\t2\t1\t0\tRoot;Bacteria""".split('\n')
    self.mapping = """#SampleID\tBarcodeSequence\tTreatment\tDescription
#Test mapping file
s1\tAAAA\tControl\tControl mouse, I.D. 354
s2\tGGGG\tControl\tControl mouse, I.D. 355
s3\tCCCC\tExp\tDisease mouse, I.D. 356
s4\tTTTT\tExp\tDisease mouse, I.D. 357""".split('\n')
def main():
    """Simulate an OTU table along a tree and write the result in BIOM format."""
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    # Fix: close all three file handles deterministically (none were closed
    # in the original).  parse_biom_table/DndParser consume the whole file,
    # so closing before use of the parsed objects is safe.
    with open(opts.otu_table, 'U') as otu_table_fh:
        otu_table = parse_biom_table(otu_table_fh)
    with open(opts.tree_file, 'U') as tree_fh:
        tree = DndParser(tree_fh)

    res_sam_names, res_otus, res_otu_mtx, res_otu_metadata = \
        sim_otu_table(otu_table.SampleIds,
                      otu_table.ObservationIds,
                      otu_table.iterSamples(),
                      otu_table.ObservationMetadata,
                      tree,
                      opts.num,
                      opts.dissim)

    rich_table = table_factory(res_otu_mtx, res_sam_names, res_otus,
                               observation_metadata=res_otu_metadata)
    # Open the output only once there is something to write.
    with open(opts.output_file, 'w') as out_fh:
        out_fh.write(format_biom_table(rich_table))
def merge_otu_tables(vcf_fps):
    """Takes a list of multiple vcf files and returns a single biom table of
    all files.

    Accepts plain '.vcf' files or gzipped '.vcf.gz' files; anything else
    raises ValueError.
    """
    master_table = None
    master_observation_ids = None
    for vcf_fp in vcf_fps:
        # Open according to extension; raise on anything unrecognized.
        if vcf_fp.endswith('gz'):
            vcf_fp = gzip.open(vcf_fp)
        elif vcf_fp.endswith('vcf'):
            vcf_fp = open(vcf_fp, 'U')
        else:
            raise ValueError("Invalid file format or extension, only '.vcf'"
                             " or '.vcf.gz' are accepted")
        data, sample_ids, observation_ids, sample_md, observation_md = \
            create_biom_table(vcf_fp)
        # Track the intersection of observation ids across all files.
        if master_observation_ids is None:
            master_observation_ids = observation_ids
        else:
            master_observation_ids = set(master_observation_ids) & \
                set(observation_ids)
        biom_table = table_factory(data, sample_ids, observation_ids,
                                   sample_md, observation_md,
                                   constructor=SparseOTUTable)
        if master_table is None:
            master_table = biom_table
        else:
            # Fix: Table.merge returns a new table; the original discarded
            # the result, so only the first file's table was ever returned.
            master_table = master_table.merge(biom_table)
    # NOTE(review): this returns the observation ids of the *last* file
    # parsed; master_observation_ids (the intersection) is computed but
    # never returned -- confirm which one callers expect.
    return master_table, observation_ids
def tobiom(input_fn, output_fn, tax_fn=None, sampledata_fn=None,
           otuids_fn=None):
    """Write an OTU table (with optional taxonomy, sample data, and original
    sequence ids) as a BIOM JSON file.

    Supports both biom v1 and v2 APIs depending on the module-level
    _biom_version flag.
    """
    otutable = micca.table.read(input_fn)
    data = otutable.to_numpy()
    observation_ids = otutable.index.tolist()
    sample_ids = otutable.columns.tolist()

    if tax_fn is None:
        observ_metadata = None
    else:
        tax_dict = micca.tax.read(tax_fn)
        # Fix: dict.has_key() is deprecated (removed in py3); dict.get with
        # a default is equivalent and py2/py3 compatible.
        observ_metadata = [{"taxonomy": tax_dict.get(oid, ["NA"])}
                           for oid in observation_ids]

    if sampledata_fn is None:
        sample_metadata = None
    else:
        sampledata = micca.table.read(sampledata_fn)
        # re-index with the sample IDs in the OTU table
        sampledata = sampledata.reindex(sample_ids)
        sampledata.fillna("NA", inplace=True)
        sample_metadata = [sampledata.loc[sid].to_dict()
                           for sid in sample_ids]

    # replace the OTU ids with the original sequence ids when found in otuids
    if otuids_fn is not None:
        with open(otuids_fn, "rU") as otuids_handle:
            otuids_reader = csv.reader(otuids_handle, delimiter="\t")
            otuids = dict([(row[0], row[1]) for row in otuids_reader])
        for i in range(len(observation_ids)):
            try:
                origid = otuids[observation_ids[i]]
            except KeyError:
                # No mapping for this id: keep the OTU id as-is.
                pass
            else:
                observation_ids[i] = origid

    generated_by = "micca v.{}".format(micca_version)

    if _biom_version == 2:
        table = Table(data=data,
                      sample_ids=sample_ids,
                      observation_ids=observation_ids,
                      sample_metadata=sample_metadata,
                      observation_metadata=observ_metadata,
                      type="OTU table")
        json_str = table.to_json(generated_by=generated_by)
    else:
        table = table_factory(data=data,
                              sample_ids=sample_ids,
                              observation_ids=observation_ids,
                              sample_metadata=sample_metadata,
                              observation_metadata=observ_metadata,
                              constructor=SparseOTUTable)
        json_str = table.getBiomFormatJsonString(generated_by=generated_by)

    with open(output_fn, 'wb') as output_handle:
        output_handle.write(json_str)
def convert_precalc_to_biom(precalc_in, ids_to_load=None, transpose=True,
                            md_prefix='metadata_'):
    """Loads PICRUSTs tab-delimited version of the precalc file and outputs
    a BIOM object"""
    # if given a string convert to a filehandle
    if isinstance(precalc_in, (str, unicode)):
        fh = StringIO.StringIO(precalc_in)
    else:
        fh = precalc_in

    # first line has to be header
    header_ids = fh.readline().strip().split('\t')

    # Columns whose header carries md_prefix hold per-trait metadata, not data.
    col_meta_locs = {}
    for idx, col_id in enumerate(header_ids):
        if col_id.startswith(md_prefix):
            col_meta_locs[col_id[len(md_prefix):]] = idx

    end_of_data = len(header_ids) - len(col_meta_locs)
    trait_ids = header_ids[1:end_of_data]

    col_meta = []
    row_meta = [{} for i in trait_ids]

    if ids_to_load:
        ids_to_load = set(ids_to_load)
        load_all_ids = False
    else:
        load_all_ids = True

    matching = []
    otu_ids = []
    for line in fh:
        fields = line.strip().split('\t')
        row_id = fields[0]
        if row_id.startswith(md_prefix):
            # handle metadata
            # determine type of metadata (this may not be perfect)
            metadata_type = determine_metadata_type(line)
            for idx, trait_name in enumerate(trait_ids):
                row_meta[idx][row_id[len(md_prefix):]] = \
                    parse_metadata_field(fields[idx + 1], metadata_type)
        elif load_all_ids or (row_id in ids_to_load):
            # Fix: ids_to_load is already a set here; the original rebuilt
            # set(ids_to_load) on every data row -- O(n) work per line.
            otu_ids.append(row_id)
            matching.append(map(float, fields[1:end_of_data]))

            # add metadata
            col_meta_dict = {}
            for meta_name in col_meta_locs:
                col_meta_dict[meta_name] = fields[col_meta_locs[meta_name]]
            col_meta.append(col_meta_dict)

            if not load_all_ids:
                ids_to_load.remove(row_id)

    if not otu_ids:
        raise ValueError("No OTUs match identifiers in precalculated file. "
                         "PICRUSt requires an OTU table reference/closed "
                         "picked against GreenGenes.\nExample of the first 5 "
                         "OTU ids from your table: {0}".format(
                             ', '.join(list(ids_to_load)[:5])))

    if ids_to_load:
        raise ValueError("One or more OTU ids were not found in the "
                         "precalculated file!\nAre you using the correct "
                         "--gg_version?\nExample of (the {0}) unknown OTU "
                         "ids: {1}".format(len(ids_to_load),
                                           ', '.join(list(ids_to_load)[:5])))

    # note that we transpose the data before making biom obj
    if transpose:
        return table_factory(asarray(matching).T, otu_ids, trait_ids,
                             col_meta, row_meta, constructor=DenseGeneTable)
    else:
        return table_factory(asarray(matching), trait_ids, otu_ids,
                             row_meta, col_meta, constructor=DenseGeneTable)
def setUp(self):
    """Build the mapping file, OTU table, and expected network edge/node
    files for the make_otu_network tests.

    NOTE(review): whitespace inside the long string literals below (mapping
    file and edge/node lines) may have been mangled (tabs vs. spaces) in
    this copy of the file -- confirm against the original fixtures.
    """
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    # QIIME mapping file with two metadata categories (Day, time).
    self.map_file = """#SampleID Day time Description #This is some comment about the study 1 090809 1200 some description of sample1 2 090809 1800 some description of sample2 3 090909 1200 some description of sample3 4 090909 1800 some description of sample4 5 091009 1200 some description of sample5"""
    self.cat_by_sample = {"1": [("Day", "090809"), ("time", "1200")],
                          "2": [("Day", "090809"), ("time", "1800")],
                          "3": [("Day", "090909"), ("time", "1200")],
                          "4": [("Day", "090909"), ("time", "1800")],
                          "5": [("Day", "091009"), ("time", "1200")]}
    self.sample_by_cat = {("Day", "090809"): ["1", "2"],
                          ("Day", "090909"): ["3", "4"],
                          ("Day", "091009"): ["5"],
                          ("time", "1200"): ["1", "3", "5"],
                          ("time", "1800"): ["2", "4"]}
    self.num_cats = 2
    self.meta_dict = {"1": ["090809 1200", 0],
                      "2": ["090809 1800", 0],
                      "3": ["090909 1200", 0],
                      "4": ["090909 1800", 0],
                      "5": ["091009 1200", 0]}
    self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
    self.node_labels = ["node_name", "node_disp_name", "ntype", "degree",
                        "weighted_degree", "consensus_lin", "Day", "time"]
    self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]
    # 10 OTUs x 5 samples count matrix.
    self.otu_table_vals = array([[0, 1, 0, 0, 6],
                                 [2, 0, 0, 0, 0],
                                 [0, 0, 3, 1, 0],
                                 [0, 0, 0, 0, 5],
                                 [0, 4, 2, 0, 0],
                                 [3, 6, 0, 0, 0],
                                 [0, 0, 4, 2, 0],
                                 [0, 0, 0, 0, 3],
                                 [2, 0, 0, 5, 0],
                                 [0, 2, 0, 4, 0]])
    otu_table_str = format_biom_table(
        table_factory(self.otu_table_vals,
                      ['1', '2', '3', '4', '5'],
                      ['otu_1', 'otu_2', 'otu_3', 'otu_4', 'otu_5',
                       'otu_6', 'otu_7', 'otu_8', 'otu_9', 'otu_10'],
                      [None, None, None, None, None],
                      [{"taxonomy": ["Bacteria", "Actinobacteria",
                                     "Coriobacteridae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Bacteroidaceae"]},
                       {"taxonomy": ["Bacteria", "Firmicutes",
                                     "Clostridia", "Clostridiales"]},
                       {"taxonomy": ["Bacteria", "Spirochaetes",
                                     "Spirochaetales", "Spirochaetaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Rikenellaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Odoribacteriaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae",
                                     "otu_425"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae",
                                     "otu_425"]},
                       {"taxonomy": ["Bacteria", "Firmicutes", "Mollicutes",
                                     "Clostridium_aff_innocuum_CM970"]}]))
    # Write the table to a temp .biom file for the code under test.
    _, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                   prefix='test_make_otu_network_otu_table',
                                   suffix='.biom')
    close(_)
    open(self.otu_table_fp, 'w').write(otu_table_str)
    # Legacy text-format version of the same table.
    self.otu_sample_file = """#Full OTU Counts #OTU ID 1 2 3 4 5 Consensus Lineage otu_1 0 1 0 0 6 Bacteria; Actinobacteria; Coriobacteridae otu_2 2 0 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae otu_3 0 0 3 1 0 Bacteria; Firmicutes; Clostridia; Clostridiales otu_4 0 0 0 0 5 Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae otu_5 0 4 2 0 0 Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae otu_6 3 6 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae otu_7 0 0 4 2 0 Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae otu_8 0 0 0 0 3 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425 otu_9 2 0 0 5 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425 otu_10 0 2 0 4 0 Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""
    # Which samples share at least one OTU with each sample.
    self.con_by_sample = {'1': set(['2', '4']),
                          '2': set(['5', '3', '1', '4']),
                          '3': set(['4', '2']),
                          '4': set(['3', '1', '2']),
                          '5': set(['2'])}
    # Expected edge lines: sample, otu, weight, lineage, Day, time.
    self.edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 otu_2 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 otu_4 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "5 otu_8 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 091009 1200",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"
    ]
    # Expected node lines: name, display name, type, degree, weighted degree,
    # lineage, Day, time.
    self.node_file_str = [
        "1 1 user_node 3 7.0 other 090809 1200",
        "2 2 user_node 4 13.0 other 090809 1800",
        "3 3 user_node 3 9.0 other 090909 1200",
        "4 4 user_node 4 12.0 other 090909 1800",
        "5 5 user_node 3 14.0 other 091009 1200",
        "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
        "otu_2 otu_node 1 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae otu otu",
        "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
        "otu_4 otu_node 1 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae otu otu",
        "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
        "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
        "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
        "otu_8 otu_node 1 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"
    ]
    # Reduced (collapsed) variants: single-sample OTUs become @<sample>.
    self.red_edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 @1 1.0 missed 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 @5 1.0 missed 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"
    ]
    self.red_node_file_str = [
        "1 1 user_node 3 7.0 other 090809 1200",
        "2 2 user_node 4 13.0 other 090809 1800",
        "3 3 user_node 3 9.0 other 090909 1200",
        "4 4 user_node 4 12.0 other 090909 1800",
        "5 5 user_node 3 14.0 other 091009 1200",
        "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
        "@1 otu_collapsed 1 1.0 other otu otu",
        "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
        "@5 otu_collapsed 2 2.0 other otu otu",
        "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
        "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
        "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
        "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"
    ]
    # Expected degree-count summaries.
    self.otu_dc = {1: 3, 2: 7}
    self.sample_dc = {3: 3, 4: 2}
    self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}
    self.num_con_cat = {"Day": 2, "time": 1}
    self.num_con = 6
    self.num_cat = {"Day": 2, "time": 4}
    self.num_cat_less = {"Day": 1, "time": 3}
    self._paths_to_clean_up = [self.otu_table_fp]
    self._dir_to_clean_up = ''
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) input_ext = path.splitext(opts.input_otu_fp)[1] if opts.input_format_classic: otu_table = parse_classic_table_to_rich_table( open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable) else: try: otu_table = parse_biom_table(open(opts.input_otu_fp, 'U')) except ValueError: raise ValueError( "Error loading OTU table! If not in BIOM format use '-f' option.\n" ) ids_to_load = otu_table.ObservationIds if (opts.input_count_fp is None): #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz) precalc_file_name = '_'.join( ['16S', opts.gg_version, 'precalculated.tab.gz']) input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', precalc_file_name) else: input_count_table = opts.input_count_fp if opts.verbose: print "Loading trait table: ", input_count_table ext = path.splitext(input_count_table)[1] if (ext == '.gz'): count_table_fh = gzip.open(input_count_table, 'rb') else: count_table_fh = open(input_count_table, 'U') if opts.load_precalc_file_in_biom: count_table = parse_biom_table(count_table_fh.read()) else: count_table = convert_precalc_to_biom(count_table_fh, ids_to_load) #Need to only keep data relevant to our otu list ids = [] for x in otu_table.iterObservations(): ids.append(str(x[1])) ob_id = count_table.ObservationIds[0] filtered_otus = [] filtered_values = [] for x in ids: if count_table.sampleExists(x): filtered_otus.append(x) filtered_values.append(otu_table.observationData(x)) #filtered_values = map(list,zip(*filtered_values)) filtered_otu_table = table_factory(filtered_values, otu_table.SampleIds, filtered_otus, constructor=DenseOTUTable) copy_numbers_filtered = {} for x in filtered_otus: value = count_table.getValueByIds(ob_id, x) try: #data can be floats so round them and make them integers value = int(round(float(value))) except ValueError: raise ValueError,\ "Invalid type passed as copy number for OTU ID %s. Must be int-able." 
% (value) if value < 1: raise ValueError, "Copy numbers must be greater than or equal to 1." copy_numbers_filtered[x] = {opts.metadata_identifer: value} filtered_otu_table.addObservationMetadata(copy_numbers_filtered) normalized_table = filtered_otu_table.normObservationByMetadata( opts.metadata_identifer) #move Observation Metadata from original to filtered OTU table normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'ObservationMetadata') normalized_otu_table = transfer_sample_metadata(otu_table, normalized_table, 'SampleMetadata') make_output_dir_for_file(opts.output_otu_fp) open(opts.output_otu_fp, 'w').write(format_biom_table(normalized_table))
def test_slice_mapping_file(self):
    """slice_mapping_file should keep only the table's samples as TSV rows."""
    mapping_header, mapping_rows = parse_mapping_file(StringIO(test_mapping))
    # Table containing only samples 'a' and 'c'.
    two_sample_table = table_factory(array([[1, 2], [4, 5]]),
                                     ['a', 'c'], ['x', 'y'])
    observed = slice_mapping_file(two_sample_table, mapping_rows)
    self.assertEqual(observed,
                     ["a\t1\t123123", "c\tpoop\tdoesn't matter"])
# Script fragment: serialize the simulated tables to BIOM JSON files.
# NOTE(review): the copula/ga/null/eco tables referenced below are built
# earlier in this script, outside this excerpt -- confirm their names.
print 'copula done'

# all tables
from biom.table import table_factory

tables = [copula_table2_gamma_1_0_100, copula_table1_lognorm_3_0, ga_table,
          null_table1, null_table2, eco_table1, eco_table2]
names = ['table_1.biom', 'table_2.biom', 'table_3.biom', 'table_4.biom',
         'table_5.biom', 'table_6.biom', 'table_7.biom']


def make_ids(data):
    # Sample ids come from columns, observation (OTU) ids from rows.
    sids = ['s%i' % i for i in range(data.shape[1])]
    oids = ['o%i' % i for i in range(data.shape[0])]
    return sids, oids

for table, name in zip(tables, names):
    sids, oids = make_ids(table)
    bt = table_factory(table, sids, oids)
    json_str = bt.getBiomFormatJsonString(generated_by='Sophie_Will')
    # NOTE(review): hard-coded absolute output path.
    o = open('/Users/will/Desktop/' + name, 'w')
    o.write(json_str)
    o.close()
# Script fragment: generate a grid of BIOM tables of varying size for timing
# benchmarks.  NOTE(review): fmax, fmin, smax, smin, and _generate_data are
# defined earlier in this script, outside this excerpt -- confirm.
nsteps = 21
# Feature-count and sample-count axes; index 0 replaced by the minimum so
# no table has zero rows/columns.
x = linspace(0, fmax, nsteps)
x[0] = fmin
y = linspace(0, smax, nsteps)
y[0] = smin

otu_ids = ['O_%s' % i for i in range(fmax)]
sample_ids = ['S_%s' % i for i in range(smax)]

# NOTE(review): hard-coded absolute output path.
out_dir = '/Users/wdwvt1/src/correlations/tables/timings/'

for num_features in x:
    for num_samples in y:
        data = _generate_data(num_features, num_samples)
        bt = table_factory(data, sample_ids[:int(num_samples)],
                           otu_ids[:int(num_features)])
        out_path = os.path.join(out_dir, 'table_f_%s_s_%s.biom'
                                % (num_features, num_samples))
        o = open(out_path, 'w')
        o.writelines(bt.getBiomFormatJsonString('will'))
        o.close()

# The triple-quoted string below is a no-op at runtime: it holds an
# IPython/shell snippet kept as notes for converting the tables to text.
'''
import glob
os.mkdir(os.join(out_dir, 'text_tables'))
tables = glob(out_dir+'*.biom')
for t in tables:
    out_fp = os.path.join(t.split('/')[:-1], 'text_tables/') + t.split('/')[-1]
    !biom convert -i $t -o $out_fp -b
'''