def write_biom(self, sample_names, read_taxonomies, biom_file_io):
    '''Write the OTU info to a biom IO output stream

    Parameters
    ----------
    sample_names: String
        names of each sample (sample_ids for biom)
    read_taxonomies:
        Array of hashes as per _iterate_otu_table_rows()
    biom_file_io: io
        open writeable stream to write biom contents to

    Returns True if successful, else False'''
    counts = []
    observ_metadata = []
    otu_ids = []
    for otu_id, tax, count in self._iterate_otu_table_rows(read_taxonomies):
        if len(count) != len(sample_names):
            raise Exception(
                "Programming error: mismatched sample names and counts")
        counts.append(count)
        observ_metadata.append({'taxonomy': tax})
        otu_ids.append(str(otu_id))
    if len(counts) == 0:
        logging.info(
            "Not writing BIOM file since no sequences were assigned taxonomy")
        return True
    table = Table(np.array(counts),
                  otu_ids, sample_names,
                  observ_metadata,
                  [{}] * len(sample_names),
                  table_id='GraftM Taxonomy Count Table')
    try:
        table.to_hdf5(biom_file_io, 'GraftM graft')
        return True
    except RuntimeError as e:
        logging.warning(
            "Error writing BIOM output, file not written. "
            "The specific error was: %s" % e)
        return False
def load_BIOM(table, informat='json', v=1):
    """
    Load a BIOM table from a file. The default format is 'json'.
    """
    from biom.table import Table
    import json

    informats = ['json', 'tsv']
    if informat not in informats:
        raise ValueError(
            "Please specify a valid BIOM input format. Currently we "
            "support: '%s'." % "', '".join(informats))
    if v:
        print("Specified BIOM input format '%s' - ok!" % informat)

    if informat == 'json':
        with open(table) as data_file:
            data = json.load(data_file)
        t = Table.from_json(data)
    elif informat == 'tsv':
        with open(table) as tsv:
            t = Table.from_tsv(tsv, obs_mapping=None, sample_mapping=None,
                               process_func=lambda x: x)
    return t
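# A minimal usage sketch for load_BIOM, assuming a JSON-format BIOM file
# named 'otu_table.biom' exists (the filename is hypothetical, not part of
# the original code):
t = load_BIOM('otu_table.biom', informat='json', v=1)
print(t.ids(axis='sample'))  # sample IDs of the loaded table
print(t.shape)               # (n_observations, n_samples)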
def setUp(self):
    """define some top-level data"""
    self.otu_table_values = array([[0, 0, 9, 5, 3, 1],
                                   [1, 5, 4, 0, 3, 2],
                                   [2, 3, 1, 1, 2, 5]])
    self.otu_table = Table(
        self.otu_table_values,
        ["OTU1", "OTU2", "OTU3"],
        ["Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"],
        [{"taxonomy": ["Bacteria"]},
         {"taxonomy": ["Archaea"]},
         {"taxonomy": ["Streptococcus"]}],
        [None, None, None, None, None, None],
    )
    self.otu_table_f = Table(
        self.otu_table_values,
        ["OTU1", "OTU2", "OTU3"],
        ["Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"],
        [{"taxonomy": ["1A", "1B", "1C", "Bacteria"]},
         {"taxonomy": ["2A", "2B", "2C", "Archaea"]},
         {"taxonomy": ["3A", "3B", "3C", "Streptococcus"]}],
        [None, None, None, None, None, None],
    )
    self.full_lineages = [["1A", "1B", "1C", "Bacteria"],
                          ["2A", "2B", "2C", "Archaea"],
                          ["3A", "3B", "3C", "Streptococcus"]]
    self.metadata = [[["Sample1", "NA", "A"],
                      ["Sample2", "NA", "B"],
                      ["Sample3", "NA", "A"],
                      ["Sample4", "NA", "B"],
                      ["Sample5", "NA", "A"],
                      ["Sample6", "NA", "B"]],
                     ["SampleID", "CAT1", "CAT2"],
                     []]
    self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
    fh, self.tmp_heatmap_fpath = mkstemp(prefix="test_heatmap_",
                                         suffix=".pdf")
    close(fh)
def test_tree_filter_table_none(self):
    rooted_nwk = io.StringIO("(O1:4.5,(O2:4,(a:1,b:1):2):0.5);")
    tree = skbio.TreeNode.read(rooted_nwk)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_table(table, tree)
    expected = table.filter(['O1', 'O2'], axis='observation')
    self.assertEqual(actual, expected)
def parse_biom_table(fp, input_is_dense=False):
    try:
        return Table.from_hdf5(fp)
    except (ValueError, RuntimeError):
        pass

    if hasattr(fp, 'read'):
        return Table.from_json(json.load(fp), input_is_dense=input_is_dense)
    elif isinstance(fp, list):
        return Table.from_json(json.loads(''.join(fp)),
                               input_is_dense=input_is_dense)
    else:
        return Table.from_json(json.loads(fp),
                               input_is_dense=input_is_dense)
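# A usage sketch for parse_biom_table, assuming 'table.biom' is an HDF5 BIOM
# file (the path is hypothetical); biom_open yields a handle that
# Table.from_hdf5 can read, and JSON input falls through to the later branches:
from biom.util import biom_open

with biom_open('table.biom') as f:
    t = parse_biom_table(f)
print(t.shape)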
def BIOM_return_clipped_taxonomy(taxlevel, BIOM):
    """
    Returns a BIOM table for which the taxonomy has been clipped at a
    certain level
    """
    from biom.table import Table
    import numpy as np

    levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus',
              'species', 'unassigned']
    to_drop = []

    if taxlevel not in levels:
        raise KeyError("The taxonomic level you are trying to search: "
                       "'%s', is not valid" % taxlevel)
    clip_level = levels.index(taxlevel) + 1

    # check if the first OTU has 'taxonomy' metadata attached; if yes,
    # assume all others have too and resume
    if 'taxonomy' not in BIOM.metadata(axis='observation')[0]:
        raise KeyError('The BIOM table you are trying to screen does not '
                       'have taxonomy metadata attached to it')
    print("Found taxonomy metadata with OTUs - ok!")

    sample_ids = BIOM.ids(axis='sample')
    observation_ids = BIOM.ids(axis='observation')
    sample_metadata = BIOM.metadata(axis='sample')
    observation_metadata = BIOM.metadata(axis='observation')

    data_to_biom = []
    for OTU in observation_ids:
        data_to_biom.append(BIOM.data(OTU, axis='observation'))
    data = np.asarray(data_to_biom)

    for i in range(len(observation_metadata)):
        if len(observation_metadata[i]['taxonomy']) > clip_level:
            observation_metadata[i]['taxonomy'] = \
                observation_metadata[i]['taxonomy'][:clip_level]
        if 'unknown' in observation_metadata[i]['taxonomy'][-1]:
            print("fishy: %s" % observation_metadata[i]['taxonomy'])
            to_drop.append(observation_ids[i])

    # construct adjusted table
    outtable = Table(data, observation_ids, sample_ids,
                     table_id='OTU table',
                     sample_metadata=sample_metadata,
                     observation_metadata=observation_metadata)
    if to_drop:
        outtable.filter(to_drop, invert=True, axis='observation',
                        inplace=True)
    return outtable
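# A sketch of BIOM_return_clipped_taxonomy on a hand-built two-OTU table;
# the IDs and lineages below are made up for illustration:
import numpy as np
from biom.table import Table

demo = Table(np.array([[5, 0], [3, 2]]),
             ['OTU1', 'OTU2'], ['S1', 'S2'],
             observation_metadata=[
                 {'taxonomy': ['Bacteria', 'Firmicutes', 'Bacilli']},
                 {'taxonomy': ['Bacteria', 'unknown', 'unknown']}])
clipped = BIOM_return_clipped_taxonomy('phylum', demo)
# lineages are truncated to two levels, e.g. ['Bacteria', 'Firmicutes'];
# OTU2 is dropped because its clipped lineage ends in 'unknown'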
def BIOM_tsv_to_R_transpose(in_tsv, out_csv):
    """
    Parse a biom table in tsv format and transpose it for input into R
    """
    from biom import Table

    with open(in_tsv) as tsv:
        intable = Table.from_tsv(tsv, obs_mapping=None, sample_mapping=None,
                                 process_func=lambda x: x)
    outtable = intable.transpose()
    with open("transposed.tsv", "w") as out:
        out.write(outtable.to_tsv(header_key=None, header_value=None))

    # refine
    intable = open('transposed.tsv', 'r')
    next(intable)  # skip the leading comment line
    out = ''
    for line in intable:
        if line.startswith('#'):
            if line.strip().endswith('taxonomy'):
                print("Removing taxonomy")
                line = ",".join(line.strip().split("\t")[:-1]).replace(
                    '#OTU ID', 'Sample').replace('\t', ',') + '\n'
            line = line.replace('#OTU ID', 'Sample').replace('\t', ',')
            out += line
        else:
            line = line.replace('\t', ',')
            out += line
    intable.close()

    with open(out_csv, 'w') as outtable:
        outtable.write(out)
def setUp(self):
    self.otu_table_vals = array([[1, 0, 2, 4],
                                 [1, 2, 0, 1],
                                 [0, 1, 1, 0],
                                 [1, 2, 1, 0]])

    self.otu_table = Table(self.otu_table_vals,
                           ['0', '1', '2', '3'],
                           ['s1', 's2', 's3', 's4'],
                           [{"taxonomy": ["Root", "Bacteria",
                                          "Actinobacteria", "Actinobacteria",
                                          "Coriobacteridae",
                                          "Coriobacteriales",
                                          "Coriobacterineae",
                                          "Coriobacteriaceae"]},
                            {"taxonomy": ["Root", "Bacteria", "Firmicutes",
                                          "\"Clostridia\""]},
                            {"taxonomy": ["Root", "Bacteria", "Firmicutes",
                                          "\"Clostridia\""]},
                            {"taxonomy": ["Root", "Bacteria"]}],
                           None)

    self.mapping = """#SampleID\tBarcodeSequence\tTreatment\tDescription
#Test mapping file
s1\tAAAA\tControl\tControl mouse, I.D. 354
s2\tGGGG\tControl\tControl mouse, I.D. 355
s3\tCCCC\tExp\tDisease mouse, I.D. 356
s4\tTTTT\tExp\tDisease mouse, I.D. 357""".split('\n')
def run(self, **kwargs):
    json_table_str = kwargs['json_table_str']
    hdf5_biom = kwargs['hdf5_table']
    axis = kwargs['axis']
    ids = kwargs['ids']

    if axis not in self.Axes:
        raise CommandError("Invalid axis '%s'. Must be either %s." % (
            axis, ' or '.join(map(lambda e: "'%s'" % e, self.Axes))))

    if hdf5_biom is None and json_table_str is None:
        raise CommandError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise CommandError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            yield direct_parse_key(json_table_str, "id")
            yield ","
            yield direct_parse_key(json_table_str, "format")
            yield ","
            yield direct_parse_key(json_table_str, "format_url")
            yield ","
            yield direct_parse_key(json_table_str, "type")
            yield ","
            yield direct_parse_key(json_table_str, "generated_by")
            yield ","
            yield direct_parse_key(json_table_str, "date")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_type")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_element_type")
            yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","
            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return {'subsetted_table': (table, format_)}
def filter_BIOM_by_per_sample_read_prop(BIOM, min_prop=0.01):
    """
    Filter OTU table by minimum read proportion per sample
    """
    import numpy as np
    from biom.table import Table

    print("\nFiltering at level: %s %%\n" % (min_prop * 100))

    sample_ids = BIOM.ids(axis='sample')
    observation_ids = BIOM.ids(axis='observation')
    sample_metadata = BIOM.metadata(axis='sample')
    observation_metadata = BIOM.metadata(axis='observation')
    sums = BIOM.sum(axis='sample')

    data_to_biom = []
    for OTU in observation_ids:
        orig = BIOM.data(OTU, axis='observation')
        for i in range(len(orig)):
            if int(orig[i]) != 0:
                if not int(orig[i]) >= sums[i] * min_prop:
                    orig[i] = 0.0
        data_to_biom.append(orig)
    data = np.asarray(data_to_biom)

    # construct adjusted table
    table = Table(data, observation_ids, sample_ids, table_id='OTU table',
                  sample_metadata=sample_metadata,
                  observation_metadata=observation_metadata)

    # filter OTUs whose counts now sum to zero
    to_exclude = []
    observation_sums = table.sum(axis='observation')
    for i in range(len(observation_sums)):
        if int(observation_sums[i]) == 0:
            to_exclude.append(observation_ids[i])

    print("Removing %i OTUs for lack of support\n" % len(to_exclude))
    table.filter(to_exclude, invert=True, axis='observation', inplace=True)

    return table
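# A usage sketch for filter_BIOM_by_per_sample_read_prop: counts below 1 %
# of their sample's total are zeroed, and OTUs left without any counts are
# removed. The toy table below is made up for illustration:
import numpy as np
from biom.table import Table

t = Table(np.array([[1, 990], [99, 10], [900, 0]]),
          ['OTU1', 'OTU2', 'OTU3'], ['S1', 'S2'])
filtered = filter_BIOM_by_per_sample_read_prop(t, min_prop=0.01)
# S1 sums to 1000, so OTU1's single read in S1 (0.1 %) is zeroed; OTU1
# keeps its 990 reads in S2 and so survives the empty-OTU filter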
def main(table_loc, otu_list, collapsed_name, output_file, classic=False):
    table = load_table(table_loc)
    with open(otu_list) as f:
        otus = f.read().strip().split()
    otus = set(otus) & set(table.ids(axis="observation"))
    table1 = table.filter(otus, axis="observation", inplace=False)
    table2 = table.filter(otus, axis="observation", invert=True,
                          inplace=False)
    sums1 = table1.sum(axis='sample')
    sums2 = table2.sum(axis='sample')
    new_table = Table(numpy.array([sums1, sums2]),
                      [collapsed_name, "not_" + collapsed_name],
                      table.ids(axis="sample"), type="otu baptable")
    if classic:
        # write a tab-delimited biom table
        with open(output_file, 'w') as out:
            out.write(new_table.to_tsv())
    else:
        # write a json biom table
        with open(output_file, 'w') as out:
            new_table.to_json("predict_reactions.py", out)
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    # Builds a BIOM-format OTU table from an OTU table in classic dense
    # format (sample IDs in the first row, OTU IDs in the first column).
    # For some reason, the 'biom convert' command fails to recognize some
    # OTU tables, and therefore the method classic2biom (above) fails.
    with open(OTU_table_classic, 'r') as fidin:
        otu_table_data = fidin.readlines()
    firstrow = otu_table_data[0].split('\t')
    sample_labels = firstrow[1:]
    sample_labels[-1] = sample_labels[-1].rstrip('\n')
    OTU_labels = [otu_table_data[i].split('\t')[0]
                  for i in range(1, len(otu_table_data))]
    nOTUs = len(OTU_labels)
    nSamples = len(sample_labels)
    # load the OTU table in row-major order
    OTU_table_data = np.zeros((nOTUs, nSamples))
    for i in range(1, nOTUs + 1):
        OTU_table_data[i - 1, :] = otu_table_data[i].split('\t')[1:]
    # write in BIOM format
    t = Table(OTU_table_data, OTU_labels, sample_labels,
              observation_metadata=None, sample_metadata=None,
              table_id=dataset_ID)
    with biom_open(OTU_table_biom, 'w') as f:
        t.to_hdf5(f, "Generated by processing layer", compress=False)
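# Usage sketch, assuming a tab-delimited classic OTU table on disk (the
# file names and dataset ID below are hypothetical):
#
#   build_OTU_table_biom('otu_table_classic.txt', 'otu_table.biom',
#                        'dataset_001')
#
# The classic table is expected to have sample IDs in the first row and
# OTU IDs in the first column, as described in the comment above.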
def convert_table_to_biom(table_f, sample_mapping, obs_mapping,
                          process_func, **kwargs):
    """Convert a contingency table to a biom table

    sample_mapping : dict of {'sample_id':metadata} or None
    obs_mapping : dict of {'obs_id':metadata} or None
    process_func : a function to transform observation metadata
    dtype : type of table data
    """
    otu_table = Table.from_tsv(table_f, obs_mapping, sample_mapping,
                               process_func, **kwargs)
    return otu_table.to_json(generatedby())
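# A sketch of convert_table_to_biom on an in-memory TSV table; the StringIO
# handle stands in for any file-like object, and the classic '#OTU ID'
# header is assumed by Table.from_tsv:
from io import StringIO

tsv = StringIO("#OTU ID\tS1\tS2\nOTU1\t3\t0\nOTU2\t1\t2\n")
json_str = convert_table_to_biom(tsv, None, None, lambda x: x)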
def _subset_table(hdf5_biom, json_table_str, axis, ids):
    if axis not in ['sample', 'observation']:
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    if hdf5_biom is None and json_table_str is None:
        raise ValueError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise ValueError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            yield direct_parse_key(json_table_str, "id")
            yield ","
            yield direct_parse_key(json_table_str, "format")
            yield ","
            yield direct_parse_key(json_table_str, "format_url")
            yield ","
            yield direct_parse_key(json_table_str, "type")
            yield ","
            yield direct_parse_key(json_table_str, "generated_by")
            yield ","
            yield direct_parse_key(json_table_str, "date")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_type")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_element_type")
            yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","
            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return table, format_
def test_aitchison(self):
    t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
              ['O1', 'O2'],
              ['S1', 'S2', 'S3'])
    actual = beta(table=t, metric='aitchison')
    expected = skbio.DistanceMatrix([[0.0000000, 0.4901290, 0.6935510],
                                     [0.4901290, 0.0000000, 0.2034219],
                                     [0.6935510, 0.2034219, 0.0000000]],
                                    ids=['S1', 'S2', 'S3'])
    self.assertEqual(actual.ids, expected.ids)
    for id1 in actual.ids:
        for id2 in actual.ids:
            npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
def simulate_correls(corr_stren=(.99, .99), std=(1, 1, 1, 2, 2),
                     means=(100, 100, 100, 100, 100), size=30, noncors=10,
                     noncors_mean=100, noncors_std=100):
    """
    Generates a correlation matrix with diagonal of stds based on input
    parameters and fills the rest of the matrix with uncorrelated values,
    all with the same mean and standard deviation. The output has a triangle
    of correlated observations and a correlated pair; all other observations
    are uncorrelated. Correlation is converted to covariance via
    cov(X,Y) = cor(X,Y) * sd(X) * sd(Y).

    Parameters
    ----------
    corr_stren: tuple of length 2, correlations in triangle and in pair
    std: tuple of length 5, standard deviations of each observation
    means: tuple of length 5, mean of each observation
    size: number of samples to generate from the multivariate normal
        distribution
    noncors: number of uncorrelated values
    noncors_mean: mean of uncorrelated values
    noncors_std: standard deviation of uncorrelated values

    Returns
    -------
    table: a biom table with (size) samples and (5+noncors) observations
    """
    # define the correlation matrix for the triangle and pair
    cor = [[std[0], corr_stren[0], corr_stren[0], 0., 0.],
           [corr_stren[0], std[1], corr_stren[0], 0., 0.],
           [corr_stren[0], corr_stren[0], std[2], 0., 0.],
           [0., 0., 0., std[3], corr_stren[1]],
           [0., 0., 0., corr_stren[1], std[4]]]
    cor = np.array(cor)
    # generate empty covariance matrix to be filled
    cov = np.zeros(np.array(cor.shape) + noncors)
    # fill in all but diagonal of covariance matrix, first 5
    for i in range(cor.shape[0]):
        for j in range(i + 1, cor.shape[0]):
            curr_cov = cor[i, j] * cor[i, i] * cor[j, j]
            cov[i, j] = curr_cov
            cov[j, i] = curr_cov
    # fill diagonal of covariance matrix, first 5
    for i in range(cor.shape[0]):
        cov[i, i] = np.square(cor[i, i])
    means = list(means)
    # fill diagonal of covariance, 6 to end, and populate mean list
    for i in range(cor.shape[0], cov.shape[0]):
        cov[i, i] = noncors_std
        means.append(noncors_mean)
    # fill the count table
    counts = multivariate_normal(means, cov, size).T
    counts = np.round(counts)
    observ_ids = ["Observ_" + str(i) for i in range(cov.shape[0])]
    sample_ids = ["Sample_" + str(i) for i in range(size)]
    table = Table(counts, observ_ids, sample_ids)
    return table
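# Usage sketch: simulate a table with 30 samples and 5 + 10 observations,
# then inspect its shape (observations x samples, in biom convention):
table = simulate_correls(size=30, noncors=10)
print(table.shape)  # (15, 30)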
def test_feature_metadata(self):
    # no filtering
    df = pd.DataFrame({'SequencedGenome': ['yes', 'yes']},
                      index=pd.Index(['O1', 'O2'], name='id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_features(table, metadata=metadata)
    expected = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                     ['O1', 'O2'],
                     ['S1', 'S2', 'S3'])
    self.assertEqual(actual, expected)

    # filter one
    df = pd.DataFrame({'SequencedGenome': ['yes']},
                      index=pd.Index(['O1'], name='id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_features(table, metadata=metadata)
    expected = Table(np.array([[1, 3]]), ['O1'], ['S2', 'S3'])
    self.assertEqual(actual, expected)

    # filter all
    df = pd.DataFrame({}, index=pd.Index(['foo'], name='id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_features(table, metadata=metadata)
    self.assertTrue(actual.is_empty())

    # exclude one
    df = pd.DataFrame({'SequencedGenome': ['yes']},
                      index=pd.Index(['O1'], name='id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_features(table, metadata=metadata, exclude_ids=True)
    expected = Table(np.array([[1, 1, 2]]), ['O2'], ['S1', 'S2', 'S3'])
    self.assertEqual(actual, expected)

    # exclude all
    df = pd.DataFrame({'SequencedGenome': ['yes', 'yes']},
                      index=pd.Index(['O1', 'O2'], name='id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_features(table, metadata=metadata, exclude_ids=True)
    self.assertTrue(actual.is_empty())
def test_where(self):
    # no filtering
    df = pd.DataFrame({'SequencedGenome': ['yes', 'no']},
                      index=pd.Index(['O1', 'O2'], name='feature-id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    where = "SequencedGenome='yes' OR SequencedGenome='no'"
    actual = filter_features(table, metadata=metadata, where=where)
    expected = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                     ['O1', 'O2'],
                     ['S1', 'S2', 'S3'])
    self.assertEqual(actual, expected)

    # filter one
    df = pd.DataFrame({'SequencedGenome': ['yes', 'no']},
                      index=pd.Index(['O1', 'O2'], name='feature-id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    where = "SequencedGenome='yes'"
    actual = filter_features(table, metadata=metadata, where=where)
    expected = Table(np.array([[1, 3]]), ['O1'], ['S2', 'S3'])
    self.assertEqual(actual, expected)

    # filter all
    df = pd.DataFrame({'SequencedGenome': ['yes', 'no']},
                      index=pd.Index(['O1', 'O2'], name='feature-id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    where = "SequencedGenome='yes' AND SequencedGenome='no'"
    actual = filter_features(table, metadata=metadata, where=where)
    expected = Table(np.array([]), [], [])
    self.assertEqual(actual, expected)

    # filter one -> exclude one
    df = pd.DataFrame({'SequencedGenome': ['yes', 'no']},
                      index=pd.Index(['O1', 'O2'], name='feature-id'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    where = "SequencedGenome='yes'"
    actual = filter_features(table, exclude_ids=True,
                             metadata=metadata, where=where)
    expected = Table(np.array([[1, 1, 2]]), ['O2'], ['S1', 'S2', 'S3'])
    self.assertEqual(actual, expected)
def test_invalid_args(self):
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])

    with self.assertRaisesRegex(ValueError, "No filtering"):
        filter_samples(table)

    with self.assertRaisesRegex(ValueError, "'where' is specified."):
        filter_samples(table, where="Subject='subject-1'")

    with self.assertRaisesRegex(ValueError, "'exclude_ids' is True."):
        filter_samples(table, exclude_ids=True)
def test_non_phylogenetic(self):
    t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
              ['O1', 'O2'],
              ['S1', 'S2', 'S3'])
    actual = beta_diversity('braycurtis', t)
    # expected computed with scipy.spatial.distance.braycurtis
    expected = skbio.DistanceMatrix([[0.0000000, 0.3333333, 0.6666667],
                                     [0.3333333, 0.0000000, 0.4285714],
                                     [0.6666667, 0.4285714, 0.0000000]],
                                    ids=['S1', 'S2', 'S3'])
    self.assertEqual(actual.ids, expected.ids)
    for id1 in actual.ids:
        for id2 in actual.ids:
            npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
def fastspar_correlation(table: Table, verbose: bool = False,
                         nprocs=1) -> pd.DataFrame:
    # TODO: multiprocess support
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        table.to_dataframe().to_dense().to_csv(
            path.join(temp, 'otu_table.tsv'), sep='\t',
            index_label='#OTU ID')
        if verbose:
            stdout = None
        else:
            stdout = subprocess.DEVNULL
        subprocess.run(['fastspar',
                        '-c', path.join(temp, 'otu_table.tsv'),
                        '-r', path.join(temp, 'correl_table.tsv'),
                        '-a', path.join(temp, 'covar_table.tsv'),
                        '-t', str(nprocs)],
                       stdout=stdout)
        cor = pd.read_table(path.join(temp, 'correl_table.tsv'),
                            index_col=0)
    return df_to_correls(cor)
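# Usage sketch, assuming the external 'fastspar' binary is on PATH and
# 'table' is a biom Table of counts (e.g. from simulate_correls above):
#
#   correls = fastspar_correlation(table, verbose=True, nprocs=4)
#   print(correls.head())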
def generate_biom_file(res_df, o, tg_rank, sampleid):
    """ output result in biom format """
    import sys
    import numpy as np
    import biom
    from biom.table import Table

    # note: simple string comparison of version numbers
    if biom.__version__ < '2.1.7':
        sys.exit("[ERROR] Biom library requires v2.1.7 or above.\n")

    target_idx = (res_df['LEVEL'] == tg_rank)
    target_df = res_df.loc[target_idx, ['ABUNDANCE', 'TAXID']]
    target_df['LINEAGE'] = target_df['TAXID'].apply(
        lambda x: gt.taxid2lineage(x, True, True)).str.split('|')

    sample_ids = [sampleid]
    data = np.array(target_df['ABUNDANCE']).reshape(len(target_df), 1)
    observ_ids = target_df['TAXID']
    observ_metadata = [{'taxonomy': x}
                       for x in target_df['LINEAGE'].tolist()]
    biom_table = Table(data, observ_ids, sample_ids, observ_metadata,
                       table_id='GOTTCHA2')
    biom_table.to_json('GOTTCHA2', direct_io=o)

    return True
def test_rarefy_to_files2(self):
    """rarefy_to_files should write valid files with some metadata on otus
    """
    maker = RarefactionMaker(self.otu_table_meta_fp, 0, 1, 1, 1)
    maker.rarefy_to_files(self.rare_dir, include_full=True,
                          include_lineages=False)

    fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
    with biom_open(fname, 'U') as biom_file:
        otu_table = Table.from_hdf5(biom_file)
    self.assertItemsEqual(otu_table.ids(), self.otu_table.ids()[:2])
def make_modules_on_correlations(correlation_table: pd.DataFrame,
                                 feature_table: Table,
                                 min_r: float = .35
                                 ) -> (Table, nx.Graph, pd.Series):
    modules = ma.make_modules_naive(correlation_table, min_r=min_r)
    modules_rev = {asv: module for module, asvs in modules.items()
                   for asv in asvs}
    for asv in feature_table.ids(axis='observation'):
        if asv not in modules_rev:
            modules_rev[asv] = None
    module_membership = pd.Series(modules_rev)
    coll_table = ma.collapse_modules(feature_table, modules)
    metadata = get_metadata_from_table(feature_table)
    metadata = ma.add_modules_to_metadata(modules, metadata)
    correlation_table_filtered = filter_correls(correlation_table,
                                                conet=True, min_r=min_r)
    net = correls_to_net(correlation_table_filtered, metadata=metadata)
    return coll_table, net, module_membership
def test_beta_jensenshannon(self):
    t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
              ['O1', 'O2'],
              ['S1', 'S2', 'S3'])
    actual = beta(table=t, metric='jensenshannon')
    # expected computed with scipy.spatial.distance.jensenshannon
    expected = skbio.DistanceMatrix([[0.0000000, 0.4645014, 0.52379239],
                                     [0.4645014, 0.0000000, 0.07112939],
                                     [0.52379239, 0.07112939, 0.0000000]],
                                    ids=['S1', 'S2', 'S3'])
    self.assertEqual(actual.ids, expected.ids)
    for id1 in actual.ids:
        for id2 in actual.ids:
            npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
def test_beta_canberra_adkins(self):
    t = Table(np.array([[0, 0], [0, 1], [1, 2]]),
              ['O1', 'O2', 'O3'],
              ['S1', 'S2'])
    d = (1. / 2.) * sum([abs(0. - 1.) / (0. + 1.),
                         abs(1. - 2.) / (1. + 2.)])
    expected = skbio.DistanceMatrix(np.array([[0.0, d], [d, 0.0]]),
                                    ids=['S1', 'S2'])
    actual = beta(table=t, metric='canberra_adkins')
    self.assertEqual(actual.ids, expected.ids)
    for id1 in actual.ids:
        for id2 in actual.ids:
            npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
def setUp(self):
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    tablefp = Table({}, [], [])
    self.emptyfeatures = tablefp
    goodtable = os.path.join(THIS_DIR, 'data/features_formated.biom')
    self.features = load_table(goodtable)
    goodtable = os.path.join(THIS_DIR, 'data/features2_formated.biom')
    ms2_match = os.path.join(THIS_DIR, 'data/ms2_match.txt')
    self.ms2_match = pd.read_csv(ms2_match, sep='\t', index_col=0)
    self.features2 = load_table(goodtable)
    self.goodcsi = qiime2.Artifact.load(
        os.path.join(THIS_DIR, 'data/csiFolder.qza'))
    self.goodcsi2 = qiime2.Artifact.load(
        os.path.join(THIS_DIR, 'data/csiFolder2.qza'))
def calculate_correlations(table: Table, corr_method: str = 'spearman',
                           p_adjustment_method: str = 'fdr_bh'
                           ) -> pd.DataFrame:
    # TODO: multiprocess this
    corr_method_fun = correl_methods[corr_method]
    correls = pd.DataFrame(index=['r', 'p'])
    for (val_i, id_i, _), (val_j, id_j, _) in \
            table.iter_pairwise(axis='observation'):
        r, p = corr_method_fun(val_i, val_j)
        correls[id_i, id_j] = r, p
    correls = correls.transpose()
    # turn the tuple index into an actual multiindex
    correls.index = pd.MultiIndex.from_tuples(correls.index)
    if p_adjustment_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p,
                                         method=p_adjustment_method)
    correls = correls.sort_values('p')
    return correls
def test_combine_id_and_frequency_filters(self):
    # no filtering
    df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                       'SampleType': ['gut', 'tongue', 'gut']},
                      index=pd.Index(['S1', 'S2', 'S3'], name='#SampleID'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    where = "Subject='subject-1' OR Subject='subject-2'"
    actual = filter_samples(table, metadata=metadata, where=where,
                            min_frequency=1)
    expected = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                     ['O1', 'O2'],
                     ['S1', 'S2', 'S3'])
    self.assertEqual(actual, expected)

    # id and frequency filters active
    df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                       'SampleType': ['gut', 'tongue', 'gut']},
                      index=pd.Index(['S1', 'S2', 'S3'], name='#SampleID'))
    metadata = qiime2.Metadata(df)
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    where = "Subject='subject-1'"
    actual = filter_samples(table, metadata=metadata, where=where,
                            min_frequency=2)
    expected = Table(np.array([[1], [1]]),
                     ['O1', 'O2'],
                     ['S2'])
    self.assertEqual(actual, expected)
def setUp(self):
    rooted_nwk = io.StringIO("((A:0.1, B:0.2)C:0.3, D:0.4, E:0.5)root;")
    self.tree = skbio.TreeNode.read(rooted_nwk)
    self.metadata = Metadata(
        pd.DataFrame(
            data=np.array([['Bacteria', '1'], ['Archea', '1']],
                          dtype=object),
            index=pd.Index(['A', 'D'], name='Feature ID'),
            columns=['kingdom', 'keep'],
        ))
    self.table = Table(data=np.array([[0, 1, 2], [2, 2, 2]]),
                       observation_ids=['A', 'D'],
                       sample_ids=['S1', 'S2', 'S3'])
    self.filtered_tree = self.tree.copy().shear(['A', 'D'])
    self.filtered_tree.prune()
def test_phylogenetic(self):
    t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
              ['O1', 'O2'],
              ['S1', 'S2', 'S3'])
    tree = skbio.TreeNode.read(
        io.StringIO('((O1:0.25, O2:0.50):0.25, O3:0.75)root;'))
    actual = beta_diversity('unweighted_unifrac', t, phylogeny=tree)
    # expected computed with skbio.diversity.beta_diversity
    expected = skbio.DistanceMatrix(
        [[0.00, 0.25, 0.25],
         [0.25, 0.00, 0.00],
         [0.25, 0.00, 0.00]],
        ids=['S1', 'S2', 'S3'])
    self.assertEqual(actual.ids, expected.ids)
    for id1 in actual.ids:
        for id2 in actual.ids:
            npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
def setUp(self):
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    tablefp = Table({}, [], [])
    self.emptyfeatures = tablefp
    goodtable = os.path.join(THIS_DIR, 'data/features_formated.biom')
    self.features = load_table(goodtable)
    goodtable = os.path.join(THIS_DIR, 'data/features2_formated.biom')
    self.features2 = load_table(goodtable)
    self.goodcsi = qiime2.Artifact.load(
        os.path.join(THIS_DIR, 'data/csiFolder.qza'))
    goodcsi = self.goodcsi.view(CSIDirFmt)
    self.collated = collate_fingerprint(goodcsi)
    self.goodcsi2 = qiime2.Artifact.load(
        os.path.join(THIS_DIR, 'data/csiFolder2.qza'))
    goodcsi = self.goodcsi2.view(CSIDirFmt)
    self.collated2 = collate_fingerprint(goodcsi)
def test_rarefy_to_files(self):
    """rarefy_to_files should write valid files
    """
    maker = RarefactionMaker(self.otu_table_fp, 0, 1, 1, 1)
    maker.rarefy_to_files(self.rare_dir, include_full=True,
                          include_lineages=False)

    fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
    with biom_open(fname, 'U') as biom_file:
        otu_table = Table.from_hdf5(biom_file)
    self.assertItemsEqual(otu_table.sample_ids,
                          self.otu_table.sample_ids[:2])
def test_filter_empty_features(self):
    # no filtering
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_samples(table, max_frequency=42,
                            filter_empty_features=False)
    expected = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                     ['O1', 'O2'],
                     ['S1', 'S2', 'S3'])
    self.assertEqual(actual, expected)

    # filter one
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_samples(table, max_frequency=4,
                            filter_empty_features=False)
    expected = Table(np.array([[0, 1], [1, 1]]),
                     ['O1', 'O2'],
                     ['S1', 'S2'])
    self.assertEqual(actual, expected)

    # filter two
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_samples(table, max_frequency=1,
                            filter_empty_features=False)
    expected = Table(np.array([[0], [1]]),
                     ['O1', 'O2'],
                     ['S1'])
    self.assertEqual(actual, expected)

    # filter all
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_samples(table, max_frequency=0,
                            filter_empty_features=False)
    expected = Table(np.array([[], []]),
                     ['O1', 'O2'],
                     [])
    self.assertEqual(actual, expected)
def collapse_modules(table, modules):
    """collapse created modules in a biom table; members of multiple modules
    will be added to the smallest module"""
    table = table.copy()
    module_array = np.zeros((len(modules), table.shape[1]))

    seen = set()
    for module_, otus in modules.items():
        module_number = int(module_.split('_')[-1])
        seen = seen | set(otus)
        # sum everything in the module
        module_array[module_number] = np.sum(
            [table.data(feature, axis="observation") for feature in otus],
            axis=0)
    table.filter(seen, axis='observation', invert=True)

    # make the new table
    new_table_matrix = np.concatenate((table.matrix_data.toarray(),
                                       module_array))
    new_table_obs = list(table.ids(axis='observation')) + \
        list(modules.keys())
    return Table(new_table_matrix, new_table_obs, table.ids())
def calculate_correlations(table: Table, corr_method=spearmanr,
                           p_adjustment_method: str = 'fdr_bh'
                           ) -> pd.DataFrame:
    # TODO: multiprocess this
    index = list()
    data = list()
    for (val_i, id_i, _), (val_j, id_j, _) in \
            table.iter_pairwise(axis='observation'):
        r, p = corr_method(val_i, val_j)
        index.append((id_i, id_j))
        data.append((r, p))
    correls = pd.DataFrame(data, index=index, columns=['r', 'p'])
    # turn the tuple index into an actual multiindex
    correls.index = pd.MultiIndex.from_tuples(correls.index)
    if p_adjustment_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p,
                                         method=p_adjustment_method)
    return correls
def setUp(self):
    self.qiime_config = load_qiime_config()
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

    self.otu_table_data = np.array([[2, 1, 0],
                                    [0, 5, 0],
                                    [0, 3, 0],
                                    [1, 2, 0]])
    self.sample_names = list('YXZ')
    self.taxon_names = list('bacd')
    self.otu_metadata = [{'domain': 'Archaea'},
                         {'domain': 'Bacteria'},
                         {'domain': 'Bacteria'},
                         {'domain': 'Bacteria'}]

    self.otu_table = Table(self.otu_table_data,
                           self.taxon_names,
                           self.sample_names,
                           observation_metadata=[{}, {}, {}, {}],
                           sample_metadata=[{}, {}, {}])
    self.otu_table_meta = Table(self.otu_table_data,
                                self.taxon_names, self.sample_names,
                                observation_metadata=self.otu_metadata)

    fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                    prefix='test_rarefaction',
                                    suffix='.biom')
    close(fd)
    fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                         prefix='test_rarefaction',
                                         suffix='.biom')
    close(fd)

    self.rare_dir = mkdtemp(dir=self.tmp_dir,
                            prefix='test_rarefaction_dir', suffix='')

    write_biom_table(self.otu_table, self.otu_table_fp)
    write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

    self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def test_make_otu_table_taxonomy(self):
    """make_otu_table should work with taxonomy"""
    otu_map_lines = ['0\tABC_0\tDEF_1',
                     '1\tABC_1',
                     'x\tGHI_2\tGHI_3\tGHI_77',
                     'z\tDEF_3\tXYZ_1']
    taxonomy = {'0': ['Bacteria', 'Firmicutes'],
                'x': ['Bacteria', 'Bacteroidetes']}
    obs = make_otu_table(otu_map_lines, taxonomy)
    data = [[1, 1, 0, 0],
            [1, 0, 0, 0],
            [0, 0, 3, 0],
            [0, 1, 0, 1]]
    obs_md = [{'taxonomy': ['Bacteria', 'Firmicutes']},
              {'taxonomy': ['None']},
              {'taxonomy': ['Bacteria', 'Bacteroidetes']},
              {'taxonomy': ['None']}]
    exp = Table(data,
                ['0', '1', 'x', 'z'],
                ['ABC', 'DEF', 'GHI', 'XYZ'],
                observation_metadata=obs_md,
                input_is_dense=True)
    self.assertEqual(obs, exp)
def test_parallel_beta(self):
    t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
              ['O1', 'O2'],
              ['S1', 'S2', 'S3'])
    parallel = beta(table=t, metric='braycurtis', n_jobs=-1)
    single_thread = beta(table=t, metric='braycurtis', n_jobs=1)
    # expected computed with scipy.spatial.distance.braycurtis
    expected = skbio.DistanceMatrix([[0.0000000, 0.3333333, 0.6666667],
                                     [0.3333333, 0.0000000, 0.4285714],
                                     [0.6666667, 0.4285714, 0.0000000]],
                                    ids=['S1', 'S2', 'S3'])

    self.assertEqual(parallel.ids, expected.ids)
    self.assertEqual(single_thread.ids, expected.ids)
    for id1 in parallel.ids:
        for id2 in parallel.ids:
            npt.assert_almost_equal(parallel[id1, id2],
                                    expected[id1, id2])
    for id1 in single_thread.ids:
        for id2 in single_thread.ids:
            npt.assert_almost_equal(single_thread[id1, id2],
                                    expected[id1, id2])
def test_write_biom_table(self):
    """Test functionality of write_biom_table().
    """
    table_exp = Table(
        np.array([[1., 1., 1., 0., 0.],
                  [1., 0., 0., 0., 0.],
                  [0., 0., 1., 0., 1.],
                  [0., 0., 0., 1., 0.],
                  [0., 0., 0., 1., 0.],
                  [0., 0., 1., 0., 0.]]),
        ["k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__Propionibacterium",
         "k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus",
         "k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia",
         "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Mobiluncus",
         "k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__Stenotrophomonas",
         "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium"],
        ["s1", "s2", "s3", "s4", "s5"])
    self.biom_output_fp = join(self.working_dir, "test_output_biom")
    write_biom_table(table_exp, self.biom_output_fp)
    table_obs = load_table(self.biom_output_fp)
    self.assertEqual(table_obs, table_exp)
def test_max_frequency(self):
    # no filtering
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_samples(table, max_frequency=42)
    expected = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                     ['O1', 'O2'],
                     ['S1', 'S2', 'S3'])
    self.assertEqual(actual, expected)

    # filter one
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_samples(table, max_frequency=4)
    expected = Table(np.array([[0, 1], [1, 1]]),
                     ['O1', 'O2'],
                     ['S1', 'S2'])
    self.assertEqual(actual, expected)

    # filter two
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_samples(table, max_frequency=1)
    expected = Table(np.array([[1]]), ['O2'], ['S1'])
    self.assertEqual(actual, expected)

    # filter all
    table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                  ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
    actual = filter_samples(table, max_frequency=0)
    expected = Table(np.array([]), [], [])
    self.assertEqual(actual, expected)
def collapse_modules(table, modules, prefix="module"):
    """collapse created modules in a biom table; members of multiple modules
    will be added to the smallest module"""
    table = table.copy()
    module_array = np.zeros((len(modules), table.shape[1]))

    seen = set()
    for i, module_ in enumerate(modules):
        seen = seen | module_
        # sum everything in the module
        module_array[i] = np.sum(
            [table.data(feature, axis="observation")
             for feature in module_],
            axis=0)
    table.filter(seen, axis='observation', invert=True)

    # make the new table
    new_table_matrix = np.concatenate((table.matrix_data.toarray(),
                                       module_array))
    new_table_obs = list(table.ids(axis='observation')) + \
        ['_'.join([prefix, str(i)]) for i in range(len(modules))]
    return Table(new_table_matrix, new_table_obs, table.ids())
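# A sketch of this collapse_modules variant: modules are given as a list of
# sets of observation IDs, and each becomes one '<prefix>_<i>' row. The toy
# table below is made up for illustration:
import numpy as np
from biom.table import Table

t = Table(np.array([[1, 2], [3, 4], [5, 6]]),
          ['O1', 'O2', 'O3'], ['S1', 'S2'])
collapsed = collapse_modules(t, [{'O1', 'O2'}], prefix="module")
# 'module_0' now holds the summed counts of O1 and O2; O3 is untouched
print(collapsed.ids(axis='observation'))  # ['O3', 'module_0']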
def biom_table2():
    arr = np.array([[250,   0, 100, 446,   75],
                    [  0,   0,   1,   1,    2],
                    [  2,   2,   2,   2,    2],
                    [100, 100, 500,   1, 1000],
                    [500,   5,   0,  50,  100]])
    obs_ids = ["otu_%s" % i for i in range(5)]
    samp_ids = ["samp_%s" % i for i in range(5)]
    obs_meta = [
        {'taxonomy': 'k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus; s__'},
        {'taxonomy': 'k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Paenibacillaceae; g__Paenibacillus; s__'},
        {'taxonomy': 'k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Methylophilales; f__Methylophilaceae; g__; s__'},
        {'taxonomy': 'k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__[Ruminococcus]; s__'},
        {'taxonomy': 'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Microbacteriaceae; g__; s__'},
    ]
    return Table(arr, obs_ids, samp_ids, observation_metadata=obs_meta)
def calculate_correlations(table: Table, corr_method=spearmanr,
                           p_adjust_method: str = 'fdr_bh',
                           nprocs=1) -> pd.DataFrame:
    if nprocs > multiprocessing.cpu_count():
        warnings.warn("nprocs greater than CPU count, using all available "
                      "CPUs")
        nprocs = multiprocessing.cpu_count()

    pool = multiprocessing.Pool(nprocs)
    cor = partial(calculate_correlation, corr_method=corr_method)
    results = pool.map(cor, pairwise_iter_wo_metadata(
        table.iter_pairwise(axis='observation')))
    index = [i[0] for i in results]
    data = [i[1] for i in results]
    pool.close()
    pool.join()

    correls = pd.DataFrame(data, index=index, columns=['r', 'p'])
    # turn the tuple index into an actual multiindex, now guaranteeing that
    # the correls index is sorted
    correls.index = pd.MultiIndex.from_tuples(
        [sorted(i) for i in correls.index])

    if p_adjust_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p,
                                         method=p_adjust_method)
    return correls
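# Usage sketch for the multiprocessing variant; 'table' is any biom Table
# of observations x samples (e.g. from simulate_correls above), and
# scipy.stats.pearsonr could be passed instead of the default spearmanr:
#
#   correls = calculate_correlations(table, corr_method=spearmanr, nprocs=2)
#   print(correls.sort_values('p').head())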
def generate_biom_table(seqs_fp, uc_fp, delim='_'):
    """Generate BIOM table and representative FASTA set

    Parameters
    ----------
    seqs_fp: string
        file path to deblurred sequences
    uc_fp: string
        file path to dereplicated sequences map (.uc format)
    delim: string, optional
        delimiter for splitting sample and sequence IDs in sequence label
        default: '_'

    Returns
    -------
    deblur_clusters: dictionary
        dictionary of clusters including dereplicated sequence labels
    Table: biom.table
        an instance of a BIOM table
    """
    # parse clusters in dereplicated sequences map (.uc format)
    with open(uc_fp, 'U') as uc_f:
        derep_clusters, failures, seeds = clusters_from_uc_file(uc_f)
    # parse clusters in deblur file, set observation ID to be the sequence
    deblur_clusters = parse_deblur_output(seqs_fp, derep_clusters)
    # create sparse dictionary of observation and sample ID counts
    data, otu_ids, sample_ids = generate_biom_data(deblur_clusters, delim)
    # build BIOM table
    return deblur_clusters, Table(data, otu_ids, sample_ids,
                                  observation_metadata=None,
                                  sample_metadata=None, table_id=None,
                                  generated_by="deblur",
                                  create_date=datetime.now().isoformat())
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    lower_percentage = opts.lower_percentage
    upper_percentage = opts.upper_percentage
    otu_table_fp = opts.otu_table_fp
    otu_table = load_table(otu_table_fp)
    delimiter = opts.delimiter
    mapping_fp = opts.mapping
    md_as_string = opts.md_as_string
    md_identifier = opts.md_identifier
    levels = opts.level.split(',')
    suppress_classic_table_output = opts.suppress_classic_table_output
    suppress_biom_table_output = opts.suppress_biom_table_output

    if upper_percentage is not None and lower_percentage is not None:
        raise ValueError(
            "upper_percentage and lower_percentage are mutually exclusive")
    if upper_percentage is not None and lower_percentage is not None and \
            mapping_fp:
        raise ValueError("upper_percentage and lower_percentage cannot be "
                         "used with a mapping file")
    if upper_percentage is not None and \
            (upper_percentage < 0 or upper_percentage > 1.0):
        raise ValueError('upper_percentage should be between 0.0 and 1.0')
    if lower_percentage is not None and \
            (lower_percentage < 0 or lower_percentage > 1.0):
        raise ValueError('lower_percentage should be between 0.0 and 1.0')

    if mapping_fp:
        mapping_file = open(mapping_fp, 'U')
        mapping, header, comments = parse_mapping_file(mapping_file)
        # use the input mapping file for producing the output filenames
        map_dir_path, map_fname = split(mapping_fp)
        map_basename, map_fname_ext = splitext(map_fname)
    else:
        if suppress_classic_table_output and suppress_biom_table_output:
            option_parser.error("Both classic and BIOM output formats were "
                                "suppressed.")

    if not opts.absolute_abundance:
        otu_table = otu_table.norm(axis='sample', inplace=False)

    # introduced output directory to allow for multiple outputs
    if opts.output_dir:
        create_dir(opts.output_dir, False)
        output_dir_path = opts.output_dir
    else:
        output_dir_path = './'

    # use the input OTU table to produce the output filenames
    dir_path, fname = split(otu_table_fp)
    basename, fname_ext = splitext(fname)

    # iterate over the levels and generate a summarized taxonomy for each
    for level in levels:
        if mapping_fp:
            # define output filename
            output_fname = join(output_dir_path,
                                map_basename + '_L%s.txt' % level)
            summary, tax_order = add_summary_mapping(otu_table, mapping,
                                                     int(level),
                                                     md_as_string,
                                                     md_identifier)
            write_add_taxa_summary_mapping(summary, tax_order, mapping,
                                           header, output_fname, delimiter)
        else:
            # define the output filename; the extension will be added
            # depending on the output format
            output_fname = join(output_dir_path, basename + '_L%s' % level)
            summary, header = make_summary(otu_table, int(level),
                                           upper_percentage,
                                           lower_percentage, md_as_string,
                                           md_identifier)
            sample_ids = header[1:]
            observation_ids = []
            data = []
            for row in summary:
                # join taxonomic levels to create an observation ID
                observation_ids.append(delimiter.join(row[0]))
                data.append(row[1:])
            table = Table(np.asarray(data), observation_ids, sample_ids)
            if opts.transposed_output:
                table = table.transpose()
            if not suppress_classic_table_output:
                with open(output_fname + '.txt', 'w') as outfile:
                    outfile.write(table.to_tsv())
            if not suppress_biom_table_output:
                write_biom_table(table, output_fname + '.biom')
def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
    r"""Parses the biom table stored in the filepath `fp`

    Parameters
    ----------
    fp : file like
        File alike object storing the BIOM table
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at fp

    Raises
    ------
    ValueError
        If `samples` and `observations` are provided.

    Notes
    -----
    Subsetting from the BIOM table is only supported in one axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    """
    if axis not in ['observation', 'sample']:
        raise UnknownAxisError(axis)

    try:
        return Table.from_hdf5(fp, ids=ids, axis=axis)
    except ValueError:
        pass
    except RuntimeError:
        pass

    if hasattr(fp, 'read'):
        old_pos = fp.tell()
        # Read in characters until first non-whitespace
        # If it is a {, then this is (most likely) JSON
        c = fp.read(1)
        while c.isspace():
            c = fp.read(1)
        if c == '{':
            fp.seek(old_pos)
            t = Table.from_json(json.load(fp,
                                          object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        else:
            fp.seek(old_pos)
            t = Table.from_tsv(fp, None, None, lambda x: x)
    elif isinstance(fp, list):
        try:
            t = Table.from_json(json.loads(''.join(fp),
                                           object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        except ValueError:
            t = Table.from_tsv(fp, None, None, lambda x: x)
    else:
        t = Table.from_json(json.loads(fp,
                                       object_pairs_hook=OrderedDict),
                            input_is_dense=input_is_dense)

    def subset_ids(data, id_, md):
        return id_ in ids

    def gt_zero(vals, id_, md):
        return np.any(vals)

    if ids is not None:
        t.filter(subset_ids, axis=axis)
        axis = 'observation' if axis == 'sample' else 'sample'
        t.filter(gt_zero, axis=axis)

    return t
class TopLevelTests(TestCase):
    """Tests of top-level functions"""

    def setUp(self):
        self.otu_table_vals = array([[1, 0, 2, 4],
                                     [1, 2, 0, 1],
                                     [0, 1, 1, 0],
                                     [1, 2, 1, 0]])

        self.otu_table = Table(self.otu_table_vals,
                               ['0', '1', '2', '3'],
                               ['s1', 's2', 's3', 's4'],
                               [{"taxonomy": ["Root", "Bacteria",
                                              "Actinobacteria",
                                              "Actinobacteria",
                                              "Coriobacteridae",
                                              "Coriobacteriales",
                                              "Coriobacterineae",
                                              "Coriobacteriaceae"]},
                                {"taxonomy": ["Root", "Bacteria",
                                              "Firmicutes",
                                              "\"Clostridia\""]},
                                {"taxonomy": ["Root", "Bacteria",
                                              "Firmicutes",
                                              "\"Clostridia\""]},
                                {"taxonomy": ["Root", "Bacteria"]}],
                               None)

        self.mapping = """#SampleID\tBarcodeSequence\tTreatment\tDescription
#Test mapping file
s1\tAAAA\tControl\tControl mouse, I.D. 354
s2\tGGGG\tControl\tControl mouse, I.D. 355
s3\tCCCC\tExp\tDisease mouse, I.D. 356
s4\tTTTT\tExp\tDisease mouse, I.D. 357""".split('\n')

    def test_sum_counts_by_consensus(self):
        """should sum otu counts by consensus"""
        obs_result, obs_mapping = sum_counts_by_consensus(self.otu_table, 3)
        exp_result = {('Root', 'Bacteria', 'Actinobacteria'):
                      array([1, 0, 2, 4]),
                      ('Root', 'Bacteria', 'Firmicutes'):
                      array([1, 3, 1, 1]),
                      ('Root', 'Bacteria', 'Other'):
                      array([1, 2, 1, 0])}
        exp_mapping = {'s1': 0, 's2': 1, 's3': 2, 's4': 3}
        self.assertItemsEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)

        obs_result, obs_mapping = sum_counts_by_consensus(self.otu_table, 2)
        exp_result = {('Root', 'Bacteria'): array([3, 5, 4, 5])}
        exp_mapping = {'s1': 0, 's2': 1, 's3': 2, 's4': 3}
        self.assertItemsEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)

        obs_result, obs_mapping = sum_counts_by_consensus(self.otu_table, 4)
        exp_result = {('Root', 'Bacteria', 'Actinobacteria',
                       'Actinobacteria'): array([1, 0, 2, 4]),
                      ('Root', 'Bacteria', 'Firmicutes', '"Clostridia"'):
                      array([1, 3, 1, 1]),
                      ('Root', 'Bacteria', 'Other', 'Other'):
                      array([1, 2, 1, 0])}
        exp_mapping = {'s1': 0, 's2': 1, 's3': 2, 's4': 3}
        self.assertItemsEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)

    def test_make_new_summary_file(self):
        """make_summary works"""
        lower_percentage, upper_percentage = None, None
        summary, header = make_summary(self.otu_table, 3,
                                       upper_percentage, lower_percentage)
        self.assertEqual(header, ['Taxon', 's1', 's2', 's3', 's4'])
        self.assertEqual(
            summary,
            [[('Root', 'Bacteria', 'Actinobacteria'), 1, 0, 2, 4],
             [('Root', 'Bacteria', 'Firmicutes'), 1, 3, 1, 1],
             [('Root', 'Bacteria', 'Other'), 1, 2, 1, 0]])

        # test that it works with relative abundances
        otu_table = self.otu_table.norm(axis='sample', inplace=False)
        summary, header = make_summary(otu_table, 3,
                                       upper_percentage, lower_percentage)
        self.assertEqual(header, ['Taxon', 's1', 's2', 's3', 's4'])
        self.assertEqual(summary[0][0],
                         ('Root', 'Bacteria', 'Actinobacteria'))
        assert_almost_equal(summary[0][1:], [1.0 / 3, 0.0, 0.5, 0.8])
        self.assertEqual(summary[1][0], ('Root', 'Bacteria', 'Firmicutes'))
        assert_almost_equal(summary[1][1:], [1.0 / 3, 0.6, 0.25, 0.2])
        self.assertEqual(summary[2][0], ('Root', 'Bacteria', 'Other'))
        assert_almost_equal(summary[2][1:], [1.0 / 3, 0.4, 0.25, 0.0])

        # testing lower trimming
        lower_percentage, upper_percentage = 0.3, None
        summary, header = make_summary(otu_table, 3,
                                       upper_percentage, lower_percentage)
        self.assertEqual(summary[0][0], ('Root', 'Bacteria', 'Other'))
        assert_almost_equal(summary[0][1:], [1.0 / 3, 0.4, 0.25, 0.0])

        # testing upper trimming
        lower_percentage, upper_percentage = None, 0.4
        summary, header = make_summary(otu_table, 3,
                                       upper_percentage, lower_percentage)
        self.assertEqual(summary[0][0],
                         ('Root', 'Bacteria', 'Actinobacteria'))
        assert_almost_equal(summary[0][1:], [1.0 / 3, 0.0, 0.5, 0.8])

    def test_add_summary_category_mapping(self):
        """add_summary_mapping works"""
        mapping, header, comments = parse_mapping_file(self.mapping)
        summary, taxon_order = add_summary_mapping(self.otu_table,
                                                   mapping, 3)
        self.assertEqual(taxon_order,
                         [('Root', 'Bacteria', 'Actinobacteria'),
                          ('Root', 'Bacteria', 'Firmicutes'),
                          ('Root', 'Bacteria', 'Other')])
        self.assertEqual(summary, {'s1': [1, 1, 1],
                                   's2': [0, 3, 2],
                                   's3': [2, 1, 1],
                                   's4': [4, 1, 0]})
# print "index: %i" %index ind_taxonomy.append('%s%s' %(syn[levels[index]], taxon[0]['ScientificName'])) # print ind_taxonomy Taxonomy[taxon[0]['ScientificName']]['taxonomy'] = ind_taxonomy # print "Taxonomy: %s" %Taxonomy for taxon in observ_ids: # print taxon # print Taxonomy[taxon] observation_metadata.append(Taxonomy[taxon]) #print "observation metadata:\n%s" %observation_metadata #print len(observation_metadata) table = Table(data, observ_ids, sample_id, observation_metadata, sample_metadata, table_id='Example Table') print table out=open(args.prefix+".biom","w") table.to_json('pplacer converted by jplace_to_biom.py v.'+VERSION, direct_io=out) out.close() out=open(args.prefix+".tsv","w") out.write(table.to_tsv(header_key='taxonomy', header_value='taxomomy')) #to_json('generaged by test', direct_io=out) out.close() print "\n##### DONE! #####\n"
def gibbs(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2, beta,
          source_rarefaction_depth, sink_rarefaction_depth, restarts,
          draws_per_restart, burnin, delay, cluster_start_delay,
          source_sink_column, source_column_value, sink_column_value,
          source_category_column):
    '''Gibb's sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the mapping file and biom table and remove samples which are not
    # shared.
    o = open(mapping_fp, 'U')
    sample_metadata_lines = o.readlines()
    o.close()

    sample_metadata, biom_table = \
        _cli_sync_biom_and_sample_metadata(
            parse_mapping_file(sample_metadata_lines),
            load_table(table_fp))

    # If biom table has fractional counts, it can produce problems in
    # indexing later on.
    biom_table.transform(lambda data, id, metadata: np.ceil(data))

    # If biom table has sample metadata, there will be pickling errors when
    # submitting multiple jobs. We remove the metadata by making a copy of
    # the table without metadata.
    biom_table = Table(biom_table._data.toarray(),
                       biom_table.ids(axis='observation'),
                       biom_table.ids(axis='sample'))

    # Parse the mapping file and options to get the samples requested for
    # sources and sinks.
    source_samples, sink_samples = sinks_and_sources(
        sample_metadata, column_header=source_sink_column,
        source_value=source_column_value, sink_value=sink_column_value)

    # If we have no source samples, neither normal operation nor loo will
    # work. Will also likely get strange errors.
    if len(source_samples) == 0:
        raise ValueError('Mapping file or biom table passed contain no '
                         '`source` samples.')

    # Prepare the 'sources' matrix by collapsing the `source_samples` by
    # their metadata values.
    sources_envs, sources_data = collapse_sources(source_samples,
                                                  sample_metadata,
                                                  source_category_column,
                                                  biom_table, sort=True)

    # Rarefy data if requested.
    sources_data, biom_table = \
        subsample_sources_sinks(sources_data, sink_samples, biom_table,
                                source_rarefaction_depth,
                                sink_rarefaction_depth)

    # Build a function that requires only a single parameter -- sample --
    # to enable parallel processing if requested.
    if loo:
        f = partial(_cli_loo_runner, source_category=source_category_column,
                    alpha1=alpha1, alpha2=alpha2, beta=beta,
                    restarts=restarts, draws_per_restart=draws_per_restart,
                    burnin=burnin, delay=delay,
                    sample_metadata=sample_metadata,
                    sources_data=sources_data, sources_envs=sources_envs,
                    biom_table=biom_table, output_dir=output_dir)
        sample_iter = source_samples
    else:
        f = partial(_cli_sink_source_prediction_runner, alpha1=alpha1,
                    alpha2=alpha2, beta=beta, restarts=restarts,
                    draws_per_restart=draws_per_restart, burnin=burnin,
                    delay=delay, sources_data=sources_data,
                    biom_table=biom_table, output_dir=output_dir)
        sample_iter = sink_samples

    if jobs > 1:
        # Launch the ipcluster and wait for it to come up.
        subprocess.Popen('ipcluster start -n %s --quiet' % jobs, shell=True)
        time.sleep(cluster_start_delay)
        c = Client()
        c[:].map(f, sample_iter, block=True)
        # Shut the cluster down. Answer taken from SO:
        # http://stackoverflow.com/questions/30930157/stopping-ipcluster-engines-ipython-parallel
        c.shutdown(hub=True)
    else:
        for sample in sample_iter:
            f(sample)

    # Format results for output.
    samples = []
    samples_data = []
    for sample_fp in glob.glob(os.path.join(output_dir, '*')):
        samples.append(sample_fp.strip().split('/')[-1].split('.txt')[0])
        samples_data.append(np.loadtxt(sample_fp, delimiter='\t'))

    mp, mps = _cli_collate_results(samples, samples_data, sources_envs)

    o = open(os.path.join(output_dir, 'mixing_proportions.txt'), 'w')
    o.writelines(mp)
    o.close()
    o = open(os.path.join(output_dir, 'mixing_proportions_stds.txt'), 'w')
    o.writelines(mps)
    o.close()
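# --- Hedged sketch (not from the original) --------------------------------
# The key pattern in gibbs() above is binding every fixed argument with
# functools.partial so the worker takes a single `sample` argument and can
# be handed to any map-style executor. This sketch uses multiprocessing.Pool
# instead of the ipcluster/ipyparallel setup above, purely to stay
# self-contained; `process_sample` and its arguments are illustrative
# stand-ins, not the real runners.
from functools import partial
from multiprocessing import Pool

def process_sample(sample, alpha1, alpha2, output_dir):
    # Stand-in for the real per-sample Gibbs run.
    return (sample, alpha1 + alpha2, output_dir)

run_one = partial(process_sample, alpha1=0.001, alpha2=0.1,
                  output_dir='results')

if __name__ == '__main__':
    pool = Pool(2)
    results = pool.map(run_one, ['sink_a', 'sink_b'])
    pool.close()
    pool.join()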
class TopLevelTests(TestCase):

    """Tests of top-level functions"""

    def setUp(self):
        """define some top-level data"""
        self.otu_table_values = array([[0, 0, 9, 5, 3, 1],
                                       [1, 5, 4, 0, 3, 2],
                                       [2, 3, 1, 1, 2, 5]])
        # Sparse {(row, col): value} view of the dense values above, kept as
        # a comment for reference; in the original it was a bare dict
        # literal with no effect.
        # {(0, 2): 9.0, (0, 3): 5.0, (0, 4): 3.0, (0, 5): 1.0,
        #  (1, 0): 1.0, (1, 1): 5.0, (1, 2): 4.0, (1, 4): 3.0, (1, 5): 2.0,
        #  (2, 0): 2.0, (2, 1): 3.0, (2, 2): 1.0, (2, 3): 1.0, (2, 4): 2.0,
        #  (2, 5): 5.0}
        self.otu_table = Table(
            self.otu_table_values,
            ["OTU1", "OTU2", "OTU3"],
            ["Sample1", "Sample2", "Sample3", "Sample4", "Sample5",
             "Sample6"],
            [{"taxonomy": ["Bacteria"]},
             {"taxonomy": ["Archaea"]},
             {"taxonomy": ["Streptococcus"]}],
            [None, None, None, None, None, None],
        )
        self.otu_table_f = Table(
            self.otu_table_values,
            ["OTU1", "OTU2", "OTU3"],
            ["Sample1", "Sample2", "Sample3", "Sample4", "Sample5",
             "Sample6"],
            [
                {"taxonomy": ["1A", "1B", "1C", "Bacteria"]},
                {"taxonomy": ["2A", "2B", "2C", "Archaea"]},
                {"taxonomy": ["3A", "3B", "3C", "Streptococcus"]},
            ],
            [None, None, None, None, None, None],
        )
        self.full_lineages = [
            ["1A", "1B", "1C", "Bacteria"],
            ["2A", "2B", "2C", "Archaea"],
            ["3A", "3B", "3C", "Streptococcus"],
        ]
        self.metadata = [
            [
                ["Sample1", "NA", "A"],
                ["Sample2", "NA", "B"],
                ["Sample3", "NA", "A"],
                ["Sample4", "NA", "B"],
                ["Sample5", "NA", "A"],
                ["Sample6", "NA", "B"],
            ],
            ["SampleID", "CAT1", "CAT2"],
            [],
        ]
        self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
        fh, self.tmp_heatmap_fpath = mkstemp(prefix="test_heatmap_",
                                             suffix=".pdf")
        close(fh)

    def test_extract_metadata_column(self):
        """Extracts correct column from mapping file"""
        obs = extract_metadata_column(self.otu_table.sample_ids,
                                      self.metadata, category="CAT2")
        exp = ["A", "B", "A", "B", "A", "B"]
        self.assertEqual(obs, exp)

    def test_get_order_from_categories(self):
        """Sample indices should be clustered within each category"""
        category_labels = ["A", "B", "A", "B", "A", "B"]
        obs = get_order_from_categories(self.otu_table, category_labels)
        group_string = "".join([category_labels[i] for i in obs])
        self.assertTrue("AAABBB" == group_string or
                        group_string == "BBBAAA")

    def test_get_order_from_tree(self):
        obs = get_order_from_tree(self.otu_table.observation_ids,
                                  self.tree_text)
        exp = [2, 0, 1]
        assert_almost_equal(obs, exp)

    def test_make_otu_labels(self):
        lineages = []
        for val, id, meta in self.otu_table.iter(axis="observation"):
            lineages.append([v for v in meta["taxonomy"]])
        obs = make_otu_labels(self.otu_table.observation_ids,
                              lineages, n_levels=1)
        exp = ["Bacteria (OTU1)", "Archaea (OTU2)", "Streptococcus (OTU3)"]
        self.assertEqual(obs, exp)

        full_lineages = []
        for val, id, meta in self.otu_table_f.iter(axis="observation"):
            full_lineages.append([v for v in meta["taxonomy"]])
        obs = make_otu_labels(self.otu_table_f.observation_ids,
                              full_lineages, n_levels=3)
        exp = ["1B;1C;Bacteria (OTU1)", "2B;2C;Archaea (OTU2)",
               "3B;3C;Streptococcus (OTU3)"]
        self.assertEqual(obs, exp)

    def test_names_to_indices(self):
        new_order = ["Sample4", "Sample2", "Sample3", "Sample6", "Sample5",
                     "Sample1"]
        obs = names_to_indices(self.otu_table.sample_ids, new_order)
        exp = [3, 1, 2, 5, 4, 0]
        assert_almost_equal(obs, exp)

    def test_get_log_transform(self):
        obs = get_log_transform(self.otu_table)
        data = [val for val in self.otu_table.iter_data(axis="observation")]
        xform = asarray(data, dtype=float64)
        for (i, val) in enumerate(obs.iter_data(axis="observation")):
            non_zeros = argwhere(xform[i] != 0)
            xform[i, non_zeros] = log10(xform[i, non_zeros])
            assert_almost_equal(val, xform[i])

    def test_get_clusters(self):
        data = asarray([val for val in
                        self.otu_table.iter_data(axis="observation")])
        obs = get_clusters(data, axis="row")
        self.assertTrue([0, 1, 2] == obs or obs == [1, 2, 0])
        obs = get_clusters(data, axis="column")
        exp = [2, 3, 1, 4, 0, 5]
        self.assertEqual(obs, exp)

    def test_plot_heatmap(self):
        plot_heatmap(self.otu_table, self.otu_table.observation_ids,
                     self.otu_table.sample_ids,
                     filename=self.tmp_heatmap_fpath)
        self.assertEqual(exists(self.tmp_heatmap_fpath), True)
        remove_files(set([self.tmp_heatmap_fpath]))
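# --- Hedged sketch (not from the original) --------------------------------
# A self-contained illustration of the transform that test_get_log_transform
# verifies: log10 is applied to nonzero counts only, and zeros are left
# alone. This mirrors what get_log_transform is expected to return; it is
# not the tested implementation itself. Assumes numpy and biom-format.
import numpy as np
from biom.table import Table

def log10_nonzero(data, id_, md):
    # `data` is the dense vector for one observation; leave zeros in place.
    out = data.astype(float)
    nonzero = out > 0
    out[nonzero] = np.log10(out[nonzero])
    return out

t = Table(np.array([[0, 9, 5], [1, 4, 0]]), ['O1', 'O2'],
          ['S1', 'S2', 'S3'])
logged = t.transform(log10_nonzero, axis='observation', inplace=False)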
def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
    r"""Parses the biom table stored in the filepath `fp`

    Parameters
    ----------
    fp : file like
        File-like object storing the BIOM table
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at fp

    Raises
    ------
    UnknownAxisError
        If `axis` is not 'sample' or 'observation'.

    Notes
    -----
    Subsetting the BIOM table is only supported on a single axis.

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    """
    if axis not in ['observation', 'sample']:
        raise UnknownAxisError(axis)

    # Try HDF5 first; fall through to the JSON/TSV handling below if `fp`
    # is not an HDF5 handle.
    try:
        return Table.from_hdf5(fp, ids=ids, axis=axis)
    except Exception:
        pass

    if hasattr(fp, 'read'):
        old_pos = fp.tell()
        try:
            t = Table.from_json(json.load(fp),
                                input_is_dense=input_is_dense)
        except ValueError:
            fp.seek(old_pos)
            t = Table.from_tsv(fp, None, None, lambda x: x)
    elif isinstance(fp, list):
        try:
            t = Table.from_json(json.loads(''.join(fp)),
                                input_is_dense=input_is_dense)
        except ValueError:
            t = Table.from_tsv(fp, None, None, lambda x: x)
    else:
        t = Table.from_json(json.loads(fp),
                            input_is_dense=input_is_dense)

    if ids is not None:
        # Keep only the requested ids on the given axis...
        f = lambda data, id_, md: id_ in ids
        t.filter(f, axis=axis)

        # ...then drop vectors on the opposite axis that are now all zeros.
        axis = 'observation' if axis == 'sample' else 'sample'
        f = lambda vals, id_, md: np.any(vals)
        t.filter(f, axis=axis)

    return t
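# --- Hedged usage sketch (not from the original) ---------------------------
# Round-trips a tiny table through its JSON form to show the subsetting
# behaviour documented above: after keeping only 'O1', the final filter also
# drops 'S1', whose counts for the remaining observations are all zero.
# Assumes numpy, the biom-format package, and the parse_biom_table defined
# above being in scope.
import numpy as np
from biom.table import Table

t = Table(np.array([[0, 1], [2, 0]]), ['O1', 'O2'], ['S1', 'S2'])
json_str = t.to_json('usage sketch')

subset = parse_biom_table(json_str, ids=['O1'], axis='observation')
assert list(subset.ids()) == ['S2']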
class FunctionTests(TestCase):

    def setUp(self):
        self.tmp_dir = get_qiime_temp_dir()

        self.otu_table_data = np.array([[2, 1, 0],
                                        [0, 5, 0],
                                        [0, 3, 0],
                                        [1, 2, 0]])
        self.sample_names = list('YXZ')
        self.taxon_names = list('bacd')
        self.otu_metadata = [{'domain': 'Archaea'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'}]

        self.otu_table = Table(self.otu_table_data,
                               self.taxon_names,
                               self.sample_names)

        self.otu_table_meta = Table(self.otu_table_data,
                                    self.taxon_names, self.sample_names,
                                    observation_metadata=self.otu_metadata)

        fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                        prefix='test_rarefaction',
                                        suffix='.biom')
        close(fd)
        fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                             prefix='test_rarefaction',
                                             suffix='.biom')
        close(fd)

        self.rare_dir = mkdtemp(dir=self.tmp_dir,
                                prefix='test_rarefaction_dir', suffix='')

        write_biom_table(self.otu_table, self.otu_table_fp)
        write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

        self._paths_to_clean_up = [self.otu_table_fp,
                                   self.otu_table_meta_fp]
        self._dirs_to_clean_up = [self.rare_dir]

    def tearDown(self):
        """ cleanup temporary files """
        for p in self._paths_to_clean_up:
            remove(p)
        for d in self._dirs_to_clean_up:
            if os.path.exists(d):
                rmtree(d)

    def test_rarefy_to_list(self):
        """rarefy_to_list should rarefy correctly, same names"""
        maker = RarefactionMaker(self.otu_table_fp, 0, 1, 1, 1)
        res = maker.rarefy_to_list(include_full=True)
        self.assertItemsEqual(res[-1][2].ids(), self.otu_table.ids())
        self.assertItemsEqual(
            res[-1][2].ids(axis='observation'),
            self.otu_table.ids(axis='observation'))
        self.assertEqual(res[-1][2], self.otu_table)

        sample_value_sum = []
        for val in res[1][2].iter_data(axis='sample'):
            sample_value_sum.append(val.sum())
        npt.assert_almost_equal(sample_value_sum, [1.0, 1.0])

    def test_rarefy_to_files(self):
        """rarefy_to_files should write valid files"""
        maker = RarefactionMaker(self.otu_table_fp, 1, 2, 1, 1)
        maker.rarefy_to_files(
            self.rare_dir, include_full=True, include_lineages=False)

        fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
        otu_table = load_table(fname)

        self.assertItemsEqual(
            otu_table.ids(),
            self.otu_table.ids()[:2])  # third sample had 0 seqs, so it's gone

    def test_rarefy_to_files2(self):
        """rarefy_to_files should write valid files with some metadata on
        otus"""
        maker = RarefactionMaker(self.otu_table_meta_fp, 1, 2, 1, 1)
        maker.rarefy_to_files(
            self.rare_dir, include_full=True, include_lineages=False)

        fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
        otu_table = load_table(fname)

        self.assertItemsEqual(
            otu_table.ids(),
            self.otu_table.ids()[:2])  # third sample had 0 seqs, so it's gone

    def test_get_empty_rare(self):
        """get_rare_data should be empty when depth > # seqs in any sample"""
        self.assertRaises(TableException, get_rare_data, self.otu_table,
                          50, include_small_samples=False)

    def test_get_overfull_rare(self):
        """get_rare_data should return the input unchanged when the
        rarefaction depth exceeds every sample and include_small_samples is
        True"""
        rare_otu_table = get_rare_data(self.otu_table,
                                       50, include_small_samples=True)
        self.assertEqual(len(rare_otu_table.ids()), 3)
        # 4 observations times 3 samples = size 12 before
        self.assertEqual(len(rare_otu_table.ids(axis='observation')), 4)
        for sam in self.otu_table.ids():
            for otu in self.otu_table.ids(axis='observation'):
                rare_val = rare_otu_table.get_value_by_ids(otu, sam)
                self.assertEqual(rare_val,
                                 self.otu_table.get_value_by_ids(otu, sam))

    def test_get_11depth_rare(self):
        """get_rare_data should get only sample X"""
        rare_otu_table = get_rare_data(self.otu_table,
                                       11, include_small_samples=False)
        self.assertEqual(rare_otu_table.ids(), ('X',))

        # check the remaining sample's values observation by observation
        rare_values = [val[0]
                       for (val, otu_id, meta)
                       in rare_otu_table.iter(axis='observation')]
        self.assertEqual(rare_values, [1.0, 5.0, 3.0, 2.0])
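# --- Hedged sketch (not from the original) --------------------------------
# The rarefaction behaviour the tests above exercise, illustrated with
# biom's own Table.subsample: each surviving sample is drawn down to the
# requested depth, and (in the biom-format versions I have seen) samples
# whose total count is below the depth are dropped first -- the "third
# sample had 0 seqs" case. Assumes numpy and the biom-format package; not
# the RarefactionMaker/get_rare_data implementations under test.
import numpy as np
from biom.table import Table

t = Table(np.array([[2, 1, 0], [0, 5, 0]]), ['OTU1', 'OTU2'],
          ['Y', 'X', 'Z'])
rare = t.subsample(1)

# 'Z' had no counts and is gone; 'Y' and 'X' now each sum to exactly 1.
assert sorted(rare.ids()) == ['X', 'Y']
assert all(v.sum() == 1 for v in rare.iter_data(axis='sample'))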
class TopLevelTests(TestCase):

    """Tests of top-level functions"""

    def setUp(self):
        """define some top-level data"""
        self.otu_table_values = array([[0, 0, 9, 5, 3, 1],
                                       [1, 5, 4, 0, 3, 2],
                                       [2, 3, 1, 1, 2, 5]])
        # Sparse {(row, col): value} view of the dense values above, kept as
        # a comment for reference; in the original it was a bare dict
        # literal with no effect.
        # {(0, 2): 9.0, (0, 3): 5.0, (0, 4): 3.0, (0, 5): 1.0,
        #  (1, 0): 1.0, (1, 1): 5.0, (1, 2): 4.0, (1, 4): 3.0, (1, 5): 2.0,
        #  (2, 0): 2.0, (2, 1): 3.0, (2, 2): 1.0, (2, 3): 1.0, (2, 4): 2.0,
        #  (2, 5): 5.0}
        self.otu_table = Table(self.otu_table_values,
                               ['OTU1', 'OTU2', 'OTU3'],
                               ['Sample1', 'Sample2', 'Sample3',
                                'Sample4', 'Sample5', 'Sample6'],
                               [{"taxonomy": ['Bacteria']},
                                {"taxonomy": ['Archaea']},
                                {"taxonomy": ['Streptococcus']}],
                               [None, None, None, None, None, None])
        self.otu_table_f = Table(
            self.otu_table_values,
            ['OTU1', 'OTU2', 'OTU3'],
            ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5',
             'Sample6'],
            [{"taxonomy": ['1A', '1B', '1C', 'Bacteria']},
             {"taxonomy": ['2A', '2B', '2C', 'Archaea']},
             {"taxonomy": ['3A', '3B', '3C', 'Streptococcus']}],
            [None, None, None, None, None, None])
        self.full_lineages = [['1A', '1B', '1C', 'Bacteria'],
                              ['2A', '2B', '2C', 'Archaea'],
                              ['3A', '3B', '3C', 'Streptococcus']]
        self.metadata = [[['Sample1', 'NA', 'A'],
                          ['Sample2', 'NA', 'B'],
                          ['Sample3', 'NA', 'A'],
                          ['Sample4', 'NA', 'B'],
                          ['Sample5', 'NA', 'A'],
                          ['Sample6', 'NA', 'B']],
                         ['SampleID', 'CAT1', 'CAT2'],
                         []]
        self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
        fh, self.tmp_heatmap_fpath = mkstemp(prefix='test_heatmap_',
                                             suffix='.pdf')
        close(fh)

    def test_extract_metadata_column(self):
        """Extracts correct column from mapping file"""
        obs = extract_metadata_column(self.otu_table.ids(),
                                      self.metadata, category='CAT2')
        exp = ['A', 'B', 'A', 'B', 'A', 'B']
        self.assertEqual(obs, exp)

    def test_get_order_from_categories(self):
        """Sample indices should be clustered within each category"""
        category_labels = ['A', 'B', 'A', 'B', 'A', 'B']
        obs = get_order_from_categories(self.otu_table, category_labels)
        group_string = "".join([category_labels[i] for i in obs])
        self.assertTrue("AAABBB" == group_string or
                        group_string == "BBBAAA")

    def test_get_order_from_tree(self):
        obs = get_order_from_tree(
            self.otu_table.ids(axis='observation'), self.tree_text)
        exp = [2, 0, 1]
        assert_almost_equal(obs, exp)

    def test_make_otu_labels(self):
        lineages = []
        for val, id, meta in self.otu_table.iter(axis='observation'):
            lineages.append([v for v in meta['taxonomy']])
        obs = make_otu_labels(self.otu_table.ids(axis='observation'),
                              lineages, n_levels=1)
        exp = ['Bacteria (OTU1)', 'Archaea (OTU2)', 'Streptococcus (OTU3)']
        self.assertEqual(obs, exp)

        full_lineages = []
        for val, id, meta in self.otu_table_f.iter(axis='observation'):
            full_lineages.append([v for v in meta['taxonomy']])
        obs = make_otu_labels(self.otu_table_f.ids(axis='observation'),
                              full_lineages, n_levels=3)
        exp = ['1B;1C;Bacteria (OTU1)', '2B;2C;Archaea (OTU2)',
               '3B;3C;Streptococcus (OTU3)']
        self.assertEqual(obs, exp)

    def test_names_to_indices(self):
        new_order = ['Sample4', 'Sample2', 'Sample3', 'Sample6', 'Sample5',
                     'Sample1']
        obs = names_to_indices(self.otu_table.ids(), new_order)
        exp = [3, 1, 2, 5, 4, 0]
        assert_almost_equal(obs, exp)

    def test_get_log_transform(self):
        obs = get_log_transform(self.otu_table)
        data = [val for val in self.otu_table.iter_data(axis='observation')]
        xform = asarray(data, dtype=float64)
        for (i, val) in enumerate(obs.iter_data(axis='observation')):
            non_zeros = argwhere(xform[i] != 0)
            xform[i, non_zeros] = log10(xform[i, non_zeros])
            assert_almost_equal(val, xform[i])

    def test_get_clusters(self):
        data = asarray([val for val in
                        self.otu_table.iter_data(axis='observation')])
        obs = get_clusters(data, axis='row')
        self.assertTrue([0, 1, 2] == obs or obs == [1, 2, 0])
        obs = get_clusters(data, axis='column')
        exp = [2, 3, 1, 4, 0, 5]
        self.assertEqual(obs, exp)

    def test_plot_heatmap(self):
        plot_heatmap(
            self.otu_table,
            self.otu_table.ids(axis='observation'),
            self.otu_table.ids(),
            filename=self.tmp_heatmap_fpath)
        self.assertEqual(exists(self.tmp_heatmap_fpath), True)
        remove_files(set([self.tmp_heatmap_fpath]))
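# --- Hedged sketch (not from the original) --------------------------------
# An illustration of the row/column clustering that test_get_clusters
# checks, using scipy's hierarchical clustering to produce a leaf ordering.
# This shows the idea only; it is not the tested get_clusters function, and
# the linkage method ('average') is an assumption.
import numpy as np
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist

data = np.array([[0, 0, 9, 5, 3, 1],
                 [1, 5, 4, 0, 3, 2],
                 [2, 3, 1, 1, 2, 5]], dtype=float)

# Cluster rows on their pairwise distances, columns on the transpose.
row_order = leaves_list(linkage(pdist(data), method='average'))
col_order = leaves_list(linkage(pdist(data.T), method='average'))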