def test_frac_table(self):
    """frac_table should turn counts into per-sample fractions."""
    table = prep_table({
        'S1': {'G1': 4, 'G2': 5, 'G3': 1},
        'S2': {'G1': 2, 'G2': 0, 'G3': 8},
        'S3': {'G1': 9, 'G2': 5, 'G3': 6},
    })
    exp = prep_table({
        'S1': {'G1': 0.4, 'G2': 0.5, 'G3': 0.1},
        'S2': {'G1': 0.2, 'G2': 0.0, 'G3': 0.8},
        'S3': {'G1': 0.45, 'G2': 0.25, 'G3': 0.3},
    })

    # plain (non-BIOM) representation: compare the first four components
    obs = frac_table(table)
    for component in (0, 1, 2, 3):
        self.assertListEqual(obs[component], exp[component])

    # BIOM representation built from the same components
    biom_input = Table(*map(np.array, table))
    obs = frac_table(biom_input)
    exp = Table(*map(np.array, exp))
    verdict = obs.descriptive_equality(exp)
    self.assertEqual(verdict, 'Tables appear equal')

    # a sample whose counts are all zero must stay all zero
    table = prep_table({
        'S1': {'G1': 0, 'G2': 2},
        'S2': {'G1': 0, 'G2': 0},
    })
    exp = prep_table({
        'S1': {'G1': 0, 'G2': 1},
        'S2': {'G1': 0, 'G2': 0},
    })
    obs = frac_table(table)
    self.assertListEqual(obs[0], exp[0])
def test_biom_match(self):
    """match() should keep only the samples shared by table and metadata."""
    table = Table(
        np.array([[0, 0, 1, 1],
                  [2, 3, 4, 4],
                  [5, 5, 3, 3]]).T,
        ['a', 'b', 'c', 'd'],
        ['s2', 's3', 's4'])
    # NOTE(review): `columns=` names labels that are not keys of the data
    # dict, which presumably yields a frame without any data columns —
    # confirm whether `index=` (and no transpose) was intended here and in
    # `exp_md` below.
    md = pd.DataFrame({'x1': [1, 3, 2], 'x2': [1, 1, 0]},
                      columns=['s1', 's2', 's3']).T
    exp_table = Table(
        np.array([[0, 0, 1, 1],
                  [2, 3, 4, 4]]).T,
        ['a', 'b', 'c', 'd'],
        ['s2', 's3'])
    exp_md = pd.DataFrame({'x1': [3, 2], 'x2': [1, 0]},
                          columns=['s2', 's3']).T

    res_table, res_md = match(table, md)

    exp_df = pd.DataFrame(exp_table.to_dataframe())
    res_df = pd.DataFrame(res_table.to_dataframe())
    # Sort the labels so the comparison does not depend on output order.
    # `reindex(..., axis=...)` replaces `reindex_axis`, which no longer
    # exists in current pandas.
    exp_df = exp_df.reindex(sorted(exp_df.columns), axis=1)
    res_df = res_df.reindex(sorted(res_df.columns), axis=1)
    pdt.assert_frame_equal(exp_df, res_df)

    exp_md = exp_md.reindex(sorted(exp_md.index), axis=0)
    res_md = res_md.reindex(sorted(res_md.index), axis=0)
    pdt.assert_frame_equal(res_md, exp_md)
def deposit_biofilms(output_dir, abs_table1, abs_table2, rel_table1,
                     rel_table2, edges, metadata, sample_id):
    """ Writes down tables and edges into files.

    Each abundance table is read through ``.values``, ``.columns`` and
    ``.index`` and transposed before being stored, so the table columns
    become BIOM observation ids and the table index becomes the sample ids.

    Parameters
    ----------
    output_dir : str
        output directory
    abs_table1 : pd.DataFrame
        Absolute abundances, written as the microbe table.
    abs_table2 : pd.DataFrame
        Absolute abundances, written as the metabolite table.
    rel_table1 : pd.DataFrame
        Relative abundances, written as the microbe table.
    rel_table2 : pd.DataFrame
        Relative abundances, written as the metabolite table.
    edges : list
        Edge list for ground truthing.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    sample_id : str
        sample id
    """
    def output_path(stem, ext):
        # Every output shares the "<dir>/<stem>.<sample_id>.<ext>" layout.
        return "%s/%s.%s.%s" % (output_dir, stem, sample_id, ext)

    # (table, file stem, generated_by tag), in the order they are written.
    biom_outputs = (
        (rel_table1, "table.rel.microbes", 'moi1'),
        (rel_table2, "table.rel.metabolites", 'moi2'),
        (abs_table1, "table.abs.microbes", 'moi1'),
        (abs_table2, "table.abs.metabolites", 'moi2'),
    )
    for frame, stem, tag in biom_outputs:
        table = Table(frame.values.T, frame.columns, frame.index)
        with biom_open(output_path(stem, "biom"), 'w') as f:
            table.to_hdf5(f, generated_by=tag)

    pd.DataFrame(edges).to_csv(output_path("edges", "txt"), sep='\t')
    metadata.to_csv(output_path("metadata", "txt"), sep='\t')
def test_add_pseudocount2(self):
    """A pseudocount of 2 should raise every entry of the table by 2."""
    counts = np.array([[0, 1, 3],
                       [1, 1, 2]])
    t = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])

    obs = add_pseudocount(t, 2)

    shifted = np.array([[2, 3, 5],
                        [3, 3, 4]])
    exp = Table(shifted, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    self.assertEqual(obs, exp)
def main(args):
    """Simulate multinomial counts and write train/test/valid splits.

    Creates ``args.output_dir`` (it must not exist yet), seeds numpy with
    ``args.seed``, and stores three BIOM tables plus the simulated tree,
    eigenvalues, eigenvectors and ``W`` matrix in that directory. Samples
    are split by position: the first 8 tenths train, the next tenth test,
    and everything remaining validation.
    """
    os.mkdir(args.output_dir)
    np.random.seed(args.seed)
    sims = multinomial_bioms(
        k=args.latent_dim, D=args.input_dim,
        N=args.samples, M=args.depth)

    Y = sims['Y']
    tenth = Y.shape[0] // 10
    train_end = tenth * 8
    test_end = tenth * 9
    samp_ids = [str(i) for i in range(Y.shape[0])]
    obs_ids = [str(j) for j in range(Y.shape[1])]

    # Build every split before touching the disk.
    train = Table(Y[:train_end].T, obs_ids, samp_ids[:train_end])
    test = Table(Y[train_end:test_end].T, obs_ids,
                 samp_ids[train_end:test_end])
    valid = Table(Y[test_end:].T, obs_ids, samp_ids[test_end:])

    output_dir = args.output_dir
    with biom_open(f'{output_dir}/train.biom', 'w') as handle:
        train.to_hdf5(handle, 'train')
    with biom_open(f'{output_dir}/test.biom', 'w') as handle:
        test.to_hdf5(handle, 'test')
    with biom_open(f'{output_dir}/valid.biom', 'w') as handle:
        valid.to_hdf5(handle, 'valid')

    sims['tree'].write(f'{output_dir}/basis.nwk')
    np.savetxt(f'{output_dir}/eigvals.txt', sims['eigs'])
    np.savetxt(f'{output_dir}/eigvecs.txt', sims['eigvectors'])
    np.savetxt(f'{output_dir}/W.txt', sims['W'])
def setUp(self):
    """Simulate batch-structured counts and write fixtures to the cwd.

    Writes train/test/valid BIOM tables, a metadata file with the batch
    category of every sample, the batch priors, and the basis tree.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    self.k = 10
    self.D = 50
    self.N = 500
    self.M = 100000
    self.C = 3
    self.sims = multinomial_batch_bioms(
        k=self.k, D=self.D, N=self.N, M=self.M, C=self.C)

    Y = self.sims['Y']
    tenth = Y.shape[0] // 10
    train_end = tenth * 8
    test_end = tenth * 9
    samp_ids = [str(i) for i in range(Y.shape[0])]
    obs_ids = [str(j) for j in range(Y.shape[1])]

    # Positional split: first 8 tenths, next tenth, then the remainder.
    train = Table(Y[:train_end].T, obs_ids, samp_ids[:train_end])
    test = Table(Y[train_end:test_end].T, obs_ids,
                 samp_ids[train_end:test_end])
    valid = Table(Y[test_end:].T, obs_ids, samp_ids[test_end:])

    with biom_open('train.biom', 'w') as handle:
        train.to_hdf5(handle, 'train')
    with biom_open('test.biom', 'w') as handle:
        test.to_hdf5(handle, 'test')
    with biom_open('valid.biom', 'w') as handle:
        valid.to_hdf5(handle, 'valid')

    md = pd.DataFrame({'batch_category': self.sims['batch_idx']},
                      index=samp_ids)
    md.index.name = 'sampleid'
    md.to_csv('metadata.txt', sep='\t')

    pd.Series(self.sims['alphaILR']).to_csv('batch_priors.txt', sep='\t')
    self.sims['tree'].write('basis.nwk')
def deposit_blocktable(output_dir, abs_table, rel_table, metadata, truth,
                       sample_id):
    """ Writes down the abundance tables, metadata and truth into files.

    Both tables are transposed before being stored, so the table columns
    become BIOM observation ids and the table index becomes the sample ids.

    Parameters
    ----------
    output_dir : str
        output directory
    abs_table : pd.DataFrame
        Absolute abundances, written to ``abs_table.<sample_id>.biom``.
    rel_table : pd.DataFrame
        Relative abundances, written to ``rel_table.<sample_id>.biom``.
    metadata : pd.DataFrame
        Sample metadata, written tab-separated.
    truth : pd.DataFrame or pd.Series
        Ground truth, written tab-separated (only ``to_csv`` is used).
    sample_id : str
        Identifier embedded in every output file name.
    """
    # The two table names used to be swapped, so the absolute table landed
    # in the "rel_table" file and vice versa.
    output_abstable = "%s/abs_table.%s.biom" % (output_dir, sample_id)
    output_reltable = "%s/rel_table.%s.biom" % (output_dir, sample_id)
    output_metadata = "%s/metadata.%s.txt" % (output_dir, sample_id)
    output_truth = "%s/truth.%s.txt" % (output_dir, sample_id)

    abs_t = Table(abs_table.T.values,
                  abs_table.columns.values,
                  abs_table.index.values)
    with biom_open(output_abstable, 'w') as f:
        abs_t.to_hdf5(f, generated_by='moi')

    rel_t = Table(rel_table.T.values,
                  rel_table.columns.values,
                  rel_table.index.values)
    with biom_open(output_reltable, 'w') as f:
        rel_t.to_hdf5(f, generated_by='moi')

    metadata.to_csv(output_metadata, sep='\t')
    truth.to_csv(output_truth, sep='\t')
def deposit_biofilm(table1, table2, metadata, U, V, edges, it, rep,
                    output_dir):
    """ Writes down tables, metadata, factors, ranks and edges into files.

    Features (columns) whose total count is zero are dropped from both
    tables, and the matching rows/columns are dropped from the rank matrix
    ``U @ V`` and from ``edges`` so that all outputs stay aligned.

    Parameters
    ----------
    table1 : pd.DataFrame
        Samples x features counts, written as the microbe table.
    table2 : pd.DataFrame
        Samples x features counts, written as the metabolite table.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Left factor; ``U @ V`` needs one row per ``table1`` column.
    V : np.array
        Right factor; ``U @ V`` needs one column per ``table2`` column.
    edges : np.array
        Written to the edges file after its columns are filtered with the
        ``table1`` feature mask (assumes a 2-D array with one column per
        ``table1`` feature — TODO confirm against the caller).
    it : int
        iteration number
    rep : int
        repetition number; picks a letter of the alphabet, so it has to be
        a valid index into a 26-character string
    output_dir : str
        output directory
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    # Every output shares the "<it>_<letter>" tag.
    tag = "%d_%s" % (it, choice[rep])
    output_microbes = "%s/table_microbes.%s.biom" % (output_dir, tag)
    output_metabolites = "%s/table_metabolites.%s.biom" % (output_dir, tag)
    output_md = "%s/metadata.%s.txt" % (output_dir, tag)
    output_U = "%s/U.%s.txt" % (output_dir, tag)
    output_V = "%s/V.%s.txt" % (output_dir, tag)
    output_B = "%s/edges.%s.txt" % (output_dir, tag)
    output_ranks = "%s/ranks.%s.txt" % (output_dir, tag)

    # Keep only the features that were observed at least once.
    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = (U @ V)
    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks,
                         index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    # This used to read an unassigned local `B`, which raised before any
    # of the text files below were written; the `edges` argument is the
    # matrix that is meant to be filtered and saved.
    B = np.asarray(edges)[:, idx1]

    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
    np.savetxt(output_B, B)
def test_divide_table(self):
    """divide_table should divide each feature's counts by its size."""
    obs = prep_table({
        'S1': {'G1': 20, 'G2': 36, 'G3': 4},
        'S2': {'G1': 15, 'G2': 24, 'G3': 8},
        'S3': {'G1': 10, 'G2': 18, 'G3': 0}})
    # Build the BIOM copy now: `obs` is modified by the call below.
    ob2 = Table(*map(np.array, obs))
    sizes = {'G1': 5, 'G2': 6, 'G3': 2}
    exp = prep_table({
        'S1': {'G1': 4, 'G2': 6, 'G3': 2},
        'S2': {'G1': 3, 'G2': 4, 'G3': 4},
        'S3': {'G1': 2, 'G2': 3, 'G3': 0}})

    # regular
    divide_table(obs, sizes)
    for i in range(4):
        self.assertListEqual(obs[i], exp[i])

    # BIOM
    divide_table(ob2, sizes)
    ex2 = Table(*map(np.array, exp))
    self.assertEqual(ob2.descriptive_equality(ex2), 'Tables appear equal')

    # missing size
    del sizes['G3']
    with self.assertRaises(KeyError):
        divide_table(obs, sizes)
def test_verify_subset(self):
    """verify_subset is true only when every sample id is in the metadata."""
    metadata = [
        ('a', 'other stuff\tfoo'),
        ('b', 'asdasdasd'),
        ('c', '123123123'),
    ]

    # all three sample ids are described
    same_ids = Table(array([[1, 2, 3], [4, 5, 6]]),
                     ['x', 'y'], ['a', 'b', 'c'])
    self.assertTrue(verify_subset(same_ids, metadata))

    # a strict subset of the described ids
    fewer_ids = Table(array([[1, 2], [3, 4]]), ['x', 'y'], ['a', 'b'])
    self.assertTrue(verify_subset(fewer_ids, metadata))

    # sample 'x' has no metadata entry
    unknown_id = Table(array([[1, 2, 3], [4, 5, 6]]),
                       ['x', 'y'], ['a', 'b', 'x'])
    self.assertFalse(verify_subset(unknown_id, metadata))
def test_biom_match_tips_intersect_tips(self):
    """match_tips should drop observations that are missing from the tree."""
    # the tree has fewer tips (a, b, c) than the table has observations
    samples = ['s1', 's2', 's3', 's4']
    full_counts = np.array([[0, 0, 1, 1],
                            [2, 3, 4, 4],
                            [5, 5, 3, 3],
                            [0, 0, 0, 1]])
    table = Table(full_counts.T, ['a', 'b', 'c', 'd'], samples)
    tree = TreeNode.read([u"((a,b)f,c)r;"])

    kept_counts = np.array([[0, 0, 1],
                            [2, 3, 4],
                            [5, 5, 3],
                            [0, 0, 0]])
    exp_table = Table(kept_counts.T, ['a', 'b', 'c'], samples)
    exp_tree = tree

    res_table, res_tree = match_tips(table, tree)

    self.assertEqual(exp_table, res_table)
    self.assertEqual(str(exp_tree), str(res_tree))
def setUp(self):
    """Create two 4 x 3 tables that share ids and observation metadata."""
    obs_ids = ['O1', 'O2', 'O3', 'O4']
    samp_ids = ['S1', 'S2', 'S3']
    obs_md = [
        {'taxonomy': ['foo', 'bar']},
        {'taxonomy': ['foo', 'not bar']},
        {'taxonomy': ['foo', 'bar']},
        {'taxonomy': ['foo', 'not bar']},
    ]

    counts1 = array([[0, 1, 2],
                     [0, 0, 1],
                     [1, 1, 0],
                     [3, 0, 1]])
    counts2 = array([[0, 1, 2],
                     [1, 0, 1],
                     [0, 1, 0],
                     [3, 0, 1]])
    self.t1 = Table(counts1, obs_ids, samp_ids, obs_md)
    self.t2 = Table(counts2, obs_ids, samp_ids, obs_md)
def test_biom_match_tips_intersect_tree_immutable(self):
    """match_tips must leave both of its inputs untouched."""
    obs_ids = ['a', 'b', 'd']
    samp_ids = ['s1', 's2', 's3', 's4']
    table = Table(
        np.array([[0, 0, 1],
                  [2, 3, 4],
                  [5, 5, 3],
                  [0, 0, 1]]).T,
        obs_ids, samp_ids)
    # an independent copy of the same data to compare against afterwards
    exp_table = Table(
        np.array([[0, 0, 1],
                  [2, 3, 4],
                  [5, 5, 3],
                  [0, 0, 1]]).T,
        obs_ids, samp_ids)
    tree = TreeNode.read([u"(((a,b)f, c),d)r;"])

    # the return value is deliberately ignored; only side effects matter
    match_tips(table, tree)

    self.assertEqual(exp_table, table)
    self.assertEqual(str(tree), u"(((a,b)f,c),d)r;\n")
def setUp(self):
    """Build the tables and metadata frames shared by the tests.

    Creates ``self.table`` (7 observations x 6 samples), ``self.big_table``
    (same observations plus an extra sample ``s9``), ``self.metadata`` for
    samples ``s1``..``s8``, ``self.trimmed_metadata`` restricted to
    ``s1``..``s6``, and ``self.metadata_dup``, which repeats ``s2`` and
    ``s4`` to produce duplicated index labels.
    """
    X = np.array([[10, 1, 4, 1, 4, 0],
                  [0, 0, 2, 0, 2, 8],
                  [0, 1, 2, 1, 2, 4],
                  [0, 1, 0, 1, 0, 0],
                  [2, 0, 0, 0, 0, 0],
                  [1, 0, 0, 0, 0, 0],
                  [7, 1, 0, 1, 0, 0]])
    oids = ['o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7']
    sids = ['s1', 's2', 's3', 's4', 's5', 's6']

    bigX = np.array([[10, 1, 4, 1, 4, 1, 0],
                     [0, 0, 2, 0, 2, 1, 8],
                     [0, 1, 2, 1, 2, 1, 4],
                     [0, 1, 0, 1, 0, 1, 0],
                     [2, 0, 0, 0, 0, 1, 0],
                     [1, 0, 0, 0, 0, 1, 0],
                     [4, 0, 0, 0, 0, 1, 0]])
    self.big_table = Table(
        bigX,
        oids,
        sids + ['s9'],
    )

    # Stacking numeric and string rows yields a string array, so every
    # column starts out as text; only 'continuous' is converted back.
    self.metadata = pd.DataFrame(
        np.vstack(
            (np.ones(8),
             np.array(['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a']),
             np.arange(8).astype(np.float64),
             np.array(['Test', 'Test', 'Train', 'Train',
                       'Train', 'Train', 'Test', 'Train']))).T,
        columns=['intercept', 'categorical', 'continuous', 'train'],
        index=['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8'])
    self.metadata['continuous'] = self.metadata['continuous'].astype(
        np.float64)
    self.trimmed_metadata = self.metadata.loc[[
        's1', 's2', 's3', 's4', 's5', 's6'
    ]]

    df = pd.DataFrame([{
        'intercept': 1,
        'categorical': 'b',
        'continuous': 1.,
        'train': 'Train'
    }, {
        'intercept': 1,
        'categorical': 'b',
        'continuous': 1.,
        'train': 'Train'
    }], index=['s2', 's4'])
    df = df.reindex(
        columns=['intercept', 'categorical', 'continuous', 'train'])
    # `DataFrame.append` no longer exists in current pandas; `pd.concat`
    # stacks the rows the same way and keeps the duplicated labels.
    self.metadata_dup = pd.concat([self.metadata, df])
    self.table = Table(X, oids, sids)
def test_validate_prefix(self):
    """validate() should prefix bare sample ids with the study id."""
    httpretty.register_uri(
        httpretty.POST,
        "https://test_server.com/qiita_db/jobs/job-id/step/")
    httpretty.register_uri(
        httpretty.GET,
        "https://test_server.com/qiita_db/prep_template/1/data",
        body='{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
             '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')

    fd, biom_fp = mkstemp(suffix=".biom")
    close(fd)
    # Register the file right away so it is cleaned up even if writing
    # the table fails.
    self._clean_up_files.append(biom_fp)
    data = np.asarray([[0, 0, 1], [1, 3, 42]])
    table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    with biom_open(biom_fp, 'w') as f:
        table.to_hdf5(f, "Test")

    self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp

    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, 'job-id', self.parameters, self.out_dir)

    exp_biom_fp = join(self.out_dir, basename(biom_fp))
    self._clean_up_files.append(exp_biom_fp)
    self.assertTrue(obs_success)
    self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
    self.assertEqual(obs_error, "")

    obs_t = load_table(exp_biom_fp)
    # Order-insensitive comparison. `assertItemsEqual` only exists in the
    # Python 2 unittest, so compare the sorted ids instead.
    self.assertEqual(sorted(obs_t.ids()), ["1.S1", "1.S2", "1.S3"])
def _create_job_and_biom(self, sample_ids, template=None, analysis=None):
    """Write a random BIOM table and register a running validate job.

    Returns the path of the BIOM file, the id of the created job and the
    parameter dictionary the job was created with.
    """
    # Create the BIOM table that needs to be validated: two observations
    # with random counts below 100 for every requested sample id.
    fd, biom_fp = mkstemp(suffix=".biom")
    close(fd)
    counts = np.random.randint(100, size=(2, len(sample_ids)))
    table = Table(counts, ['O1', 'O2'], sample_ids)
    with biom_open(biom_fp, 'w') as handle:
        table.to_hdf5(handle, "Test")
    self._clean_up_files.append(biom_fp)

    # Create a new job
    parameters = {
        'template': template,
        'files': dumps({'biom': [biom_fp]}),
        'artifact_type': 'BIOM',
        'analysis': analysis,
    }
    payload = {
        'command': dumps(['BIOM type', '2.1.4', 'Validate']),
        'parameters': dumps(parameters),
        'status': 'running',
    }
    response = self.qclient.post('/apitest/processing_job/', data=payload)
    return biom_fp, response['job'], parameters
def test_get_stats():
    """Check get_stats against hand-computed entropy and divergence.

    The expected values are recomputed here from the raw array: ``h`` is
    the entropy of the pooled distribution and ``jsd`` is ``h`` minus the
    weighted average of the per-sample entropies. The check is repeated
    after ``table.norm()``, where every sample is weighted equally (1/3).

    NOTE: statement order matters — ``data`` is modified in place through
    views below, and later expectations build on those modifications.
    """
    def entropy(p):
        # Drop zero entries first so that log() is only taken of non-zero
        # values.
        p = p[p != 0]
        return -(p * log(p)).sum()

    data = array([[0, 0, 1], [1, 3, 42]], dtype=float)
    table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    h, jsd = get_stats(table)
    # Pooled distribution over observations: row sums scaled to sum to 1.
    p = data.sum(axis=1)
    p /= p.sum()
    assert_almost_equal(h, entropy(p))
    avg_h = 0
    # Each sample is weighted by its share of the total count.
    weights = [p.sum() / data.sum() for p in data.T]
    for w, p in zip(weights, data.T):
        # `p` is a view of one column of `data`, so this in-place division
        # rescales that column of `data` itself to sum to 1.
        p /= p.sum()
        avg_h += w * entropy(p)
    assert_almost_equal(jsd, h - avg_h)
    # NOTE(review): presumably normalizes the table per sample in place —
    # confirm against the biom `Table.norm` defaults.
    table.norm()
    h, jsd = get_stats(table)
    # Columns of `data` already sum to 1 after the loop above; dividing by
    # the column sums keeps the expectation explicit.
    data /= data.sum(axis=0)
    # With normalized samples every sample contributes equally (1/3).
    p = data.sum(axis=1) / 3.
    assert_almost_equal(h, entropy(p))
    avg_h = sum(entropy(p) / 3. for p in data.T)
    assert_almost_equal(jsd, h - avg_h)
def test_beta_rarefaction_too_many_samples_dropped(self):
    """Too few samples for the Mantel comparison must raise ValueError."""
    # mantel needs 3x3 or larger distance matrix
    counts = np.array([[0, 1, 3],
                       [1, 1, 2]])
    table = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])

    self.assertRaisesRegex(
        ValueError, '3x3 in size',
        beta_rarefaction,
        self.output_dir, table, 'braycurtis', 'upgma', self.md, 2)
def setUp(self):
    """Build a 7 x 6 table and metadata for samples s1..s8."""
    oids = ['o%d' % i for i in range(1, 8)]
    sids = ['s%d' % i for i in range(1, 7)]
    counts = np.array([
        [10, 1, 4, 1, 4, 0],
        [0, 0, 2, 0, 2, 8],
        [0, 1, 2, 1, 2, 4],
        [0, 1, 0, 1, 0, 0],
        [2, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [7, 1, 0, 1, 0, 0],
    ])

    # One row per metadata column; stacking numbers with strings gives a
    # string array, so all columns start out as text.
    intercept = np.ones(8)
    categorical = np.array(['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a'])
    continuous = np.arange(8).astype(np.float64)
    train = np.array(['Test', 'Test', 'Train', 'Train',
                      'Train', 'Train', 'Test', 'Train'])
    stacked = np.vstack((intercept, categorical, continuous, train))

    self.metadata = pd.DataFrame(
        stacked.T,
        columns=['intercept', 'categorical', 'continuous', 'train'],
        index=['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8'])
    # restore the numeric dtype of the one column that is used as a number
    as_float = self.metadata['continuous'].astype(np.float64)
    self.metadata['continuous'] = as_float

    keep = ['s1', 's2', 's3', 's4', 's5', 's6']
    self.trimmed_metadata = self.metadata.loc[keep]
    self.table = Table(counts, oids, sids)
def test_band_table(self):
    """band_table(5, 6) should reproduce the stored reference simulation."""
    res_table, res_md, res_beta, res_theta, res_gamma = band_table(5, 6)
    mat = np.array(
        [[161.0, 88.0, 26.0, 4.0, 0.0],
         [185.0, 144.0, 40.0, 4.0, 4.0],
         [28.0, 39.0, 156.0, 45.0, 12.0],
         [7.0, 64.0, 50.0, 81.0, 56.0],
         [0.0, 29.0, 83.0, 217.0, 194.0],
         [0.0, 0.0, 19.0, 54.0, 127.0]]
    )

    samp_ids = ['S0', 'S1', 'S2', 'S3', 'S4']
    feat_ids = ['F0', 'F1', 'F2', 'F3', 'F4', 'F5']
    exp_table = Table(mat, feat_ids, samp_ids)
    exp_md = pd.DataFrame({'G': [2., 4., 6., 8., 10.]},
                          index=samp_ids)
    exp_beta = np.array(
        [[-0.28284271, -0.48989795, -0.69282032, -0.89442719,
          -1.09544512]]
    )
    exp_theta = np.array(
        [2.23148138, 3.64417845, 3.9674706, 3.32461839, 2.31151262]
    )
    exp_gamma = np.array(
        [0.79195959, 1.89427207, 3.41791359, 5.36656315, 7.74114548]
    )

    self.assertEqual(exp_table, res_table)
    pdt.assert_frame_equal(exp_md, res_md)
    npt.assert_allclose(exp_beta, res_beta)
    npt.assert_allclose(exp_theta, res_theta)
    # gamma was computed on both sides but never compared before
    npt.assert_allclose(exp_gamma, res_gamma)
def setUp(self):
    """Mock authentication, then create a BIOM file and default params."""
    # Register the URIs for the QiitaClient
    token_body = ('{"access_token": "token", "token_type": "Bearer", '
                  '"expires_in": "3600"}')
    httpretty.register_uri(
        httpretty.POST,
        "https://test_server.com/qiita_db/authenticate/",
        body=token_body)
    self.qclient = QiitaClient(
        'https://test_server.com', 'client_id', 'client_secret')

    # Create a biom table whose sample ids already carry the "1." prefix
    fd, self.biom_fp = mkstemp(suffix=".biom")
    close(fd)
    counts = np.asarray([[0, 0, 1], [1, 3, 42]])
    table = Table(counts, ['O1', 'O2'], ['1.S1', '1.S2', '1.S3'])
    with biom_open(self.biom_fp, 'w') as handle:
        table.to_hdf5(handle, "Test")

    self.out_dir = mkdtemp()
    self.parameters = {
        'template': 1,
        'files': '{"BIOM": ["%s"]}' % self.biom_fp,
        'artifact_type': 'BIOM',
    }
    # everything listed here is expected to be removed after the test
    self._clean_up_files = [self.biom_fp, self.out_dir]
def setUp(self):
    """Provide four 8-mer sequences and a 5 x 3 table keyed by 6-mers."""
    self.seqs = tuple(
        skbio.Sequence(bases)
        for bases in ('AACCGGTT', 'AACCGAGG', 'AACCTTTT', 'AACCGCTC'))

    counts = np.array([[0, 1, 1],
                       [0, 2, 1],
                       [1, 0, 1],
                       [0, 0, 1],
                       [9, 1, 1]])
    obs_ids = ['AACCGG', 'AACCGA', 'AACCTT', 'AACCGC', 'AAAAAA']
    self.table = Table(counts, obs_ids, ['s1', 's2', 's3'])
def test_slice_mapping_file(self):
    """Only mapping rows for samples present in the table are returned."""
    # the header returned by the parser is not needed here
    _, metadata = parse_mapping_file(StringIO(test_mapping))
    table = Table(array([[1, 2], [4, 5]]), ['x', 'y'], ['a', 'c'])

    obs = slice_mapping_file(table, metadata)

    exp = [
        "a\t1\t123123",
        "c\tpoop\tdoesn't matter",
    ]
    self.assertEqual(obs, exp)
def hashing(unhashed_otu_table_list, unhashed_rep_seqs_list,
            sample_metadata_list):
    """Merge hashed OTU tables, representative sequences and metadata.

    Writes ``otu_table.biom``, ``rep_seqs.fasta`` and
    ``sample_metadata.tsv`` into the current working directory.

    Parameters
    ----------
    unhashed_otu_table_list : list
        Inputs handed one by one to ``hash_otu_table``; the results are
        joined column-wise (outer join, missing values filled with 0).
    unhashed_rep_seqs_list : list
        Inputs handed one by one to ``hash_rep_seqs`` together with a
        shared id set (presumably filled in by that helper — verify).
    sample_metadata_list : list
        Tab-separated sample metadata files, concatenated row-wise.

    Raises
    ------
    AssertionError
        If the ids of the merged OTU table and of the representative
        sequences differ, or if the OTU table holds duplicated ids.
    """
    # Create OTU table
    otu_df_list = [
        hash_otu_table(unhashed_otu_table)
        for unhashed_otu_table in unhashed_otu_table_list
    ]
    otu_df = pd.concat(otu_df_list, join="outer", axis=1)
    otu_df.fillna(0.0, inplace=True)
    otu_table = Table(otu_df.values, list(otu_df.index),
                      list(otu_df.columns))

    # Create rep seqs
    rep_seq_ids = set()
    seqs = []
    for unhashed_rep_seqs in unhashed_rep_seqs_list:
        seqs.extend(hash_rep_seqs(unhashed_rep_seqs, rep_seq_ids))

    otu_table_ids = set(otu_df.index)
    assert otu_table_ids == rep_seq_ids, \
        "OTU table ids and representative sequence ids differ"
    # equal sets but a longer index means the index has duplicates
    assert len(otu_df.index) == len(rep_seq_ids), \
        "OTU table contains duplicated ids"

    # Merge sample metadata. The separator must be a real tab character:
    # the two-character backslash-t string is rejected by `to_csv`, which
    # only accepts one-character delimiters.
    sample_metadata = pd.concat(
        [pd.read_csv(s, sep="\t") for s in sample_metadata_list])

    # Write files
    sample_metadata.to_csv("sample_metadata.tsv", sep="\t", index=False)
    with biom_open("otu_table.biom", "w") as fid:
        otu_table.to_hdf5(fid,
                          "Constructed by micone in dada2/deblur pipeline")
    with open("rep_seqs.fasta", "w") as fid:
        fasta_writer = FastaIO.FastaWriter(fid, wrap=None)
        fasta_writer.write_file(seqs)
def import_shogun_biom(f, annotation_table=None, annotation_type=None,
                       names_to_taxonomy=False):
    """Load a tab-separated table as a BIOM table, optionally annotated.

    The first column of ``f`` supplies the observation ids and the header
    supplies the sample ids; both are converted to strings. With
    ``names_to_taxonomy`` each observation id is split on ``;`` and stored
    as its ``taxonomy`` metadata. With ``annotation_table`` the parser
    selected by ``annotation_type`` ('module', 'pathway' or 'enzyme')
    provides further observation metadata; any other ``annotation_type``
    raises ``KeyError``.
    """
    import_funcs = {
        'module': shogun_parse_module_table,
        'pathway': shogun_parse_pathway_table,
        'enzyme': shogun_parse_enzyme_table,
    }

    frame = pd.read_csv(f, sep='\t', index_col=0)
    obs_ids = [str(label) for label in frame.index]
    samp_ids = [str(label) for label in frame.columns]
    bt = Table(frame.values, observation_ids=obs_ids, sample_ids=samp_ids)

    if names_to_taxonomy:
        taxonomy = {}
        for obs_id in bt.ids(axis='observation'):
            taxonomy[obs_id] = {'taxonomy': obs_id.split(';')}
        bt.add_metadata(taxonomy, axis='observation')

    if annotation_table is not None:
        parse = import_funcs[annotation_type]
        bt.add_metadata(parse(annotation_table), axis='observation')

    return bt
def setUp(self):
    """Create a 3 x 2 table with taxonomy and a 4-sample metadata frame."""
    shared_ranks = ['foo', 'bar', 'baz', 'bazilus', 'foospiracea']
    om = [
        {'taxonomy': shared_ranks + ['spam', 'egg']},
        {'taxonomy': shared_ranks + ['spam', 'gleep']},
        {'taxonomy': shared_ranks + ['bloop', 'Boatface']},
    ]
    self.test_table = Table(
        data=np.array([[1, 2], [3, 4], [5, 6]]),
        observation_ids=['o1', 'o2', 'o3'],
        sample_ids=['s1', 's2'],
        observation_metadata=om)

    rows = [
        [True, '++', '00001'],
        [False, ':L', '02341'],
        [True, '8L', '00221'],
        [True, '}8L', '00501'],
    ]
    # NOTE(review): there is no comma between 'SampleID' and 'IS_GOOD', so
    # the two literals concatenate into one column label. The rows only
    # hold three values, so that is what makes the shapes agree — confirm
    # whether the first column was meant to be called just 'IS_GOOD'.
    self.md = pd.DataFrame(
        data=rows,
        columns=['SampleID' 'IS_GOOD', 'LOL', 'PLEL'],
        index=['s0', 's1', 's2', 's3'])
def noisify(table_file, metadata_file, sigma, output_file):
    """Perturb a BIOM table with compositional noise and resample it.

    The table is loaded as samples x observations, every sample is
    perturbed with one row of noise, and counts are redrawn with the
    depths found in the ``library_size`` metadata column. The metadata
    file is overwritten with two added columns (``observed`` and
    ``unobserved``: how many observations have a non-zero / zero total),
    and the observations with a non-zero total are written to
    ``output_file``.

    NOTE(review): ``sigma`` is not used — the noise covariance is always
    the identity matrix. Confirm whether it was meant to scale ``cov``.
    """
    metadata = pd.read_table(metadata_file, index_col=0)
    biom_table = load_table(table_file)
    dense = np.array(biom_table.matrix_data.todense()).T
    table = pd.DataFrame(dense,
                         index=biom_table.ids(axis='sample'),
                         columns=biom_table.ids(axis='observation'))

    # the noise has one dimension fewer than there are observations
    cov = np.eye(table.shape[1] - 1)
    m_noise = compositional_noise(cov, nsamp=table.shape[0])

    values = table.values
    perturbed = np.vstack([
        perturb(row, m_noise[i, :]) for i, row in enumerate(values)
    ])

    # note that this assumes that the column is named `library_size`
    table_ = pd.DataFrame(
        multinomial_sample(perturbed, depths=metadata['library_size']))
    table_.index = table.index
    table_.columns = list(table.columns)

    totals = table_.sum(axis=0)
    metadata['observed'] = np.sum(totals > 0)
    metadata['unobserved'] = np.sum(totals == 0)
    metadata.to_csv(metadata_file, sep='\t')

    # drop zeros -- they are not informative
    table_ = table_.loc[:, totals > 0]
    t = Table(table_.T.values, table_.columns.values, table_.index.values)
    with biom_open(output_file, 'w') as handle:
        t.to_hdf5(handle, generated_by='moi')
def deposit(table, groups, truth, output_table, output_groups,
            output_truth):
    """Write a table, its groups and the ground truth to three files.

    ``table`` is transposed before being stored as BIOM, so its columns
    become observation ids and its index the sample ids. ``groups`` is
    written tab-separated and ``truth`` as one comma-joined line.
    """
    biom_table = Table(table.T.values,
                       table.columns.values,
                       table.index.values)
    with biom_open(output_table, 'w') as handle:
        biom_table.to_hdf5(handle, generated_by='moi')

    groups.to_csv(output_groups, sep='\t')

    truth_line = ','.join(truth)
    with open(output_truth, 'w') as handle:
        handle.write(truth_line)
def test_faith_pd_invalid_input(self):
    """faith_pd must raise IOError when either input file is missing."""
    # These checks are based on the skbio tests. Checks for duplicate ids
    # and negative counts are not included but should be incorporated.
    newick = StringIO(
        '((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)'
        'root;')
    tree = TreeNode.read(newick)

    otu_ids = ['OTU%d' % number for number in range(1, 5)]
    u_counts = [1, 1, 0, 0]
    # a single sample 'u' holding one count per OTU
    bt = Table(np.array([u_counts]).T, otu_ids, ['u'])

    tmp_dir = gettempdir()
    ta = os.path.join(tmp_dir, 'table.biom')
    tr = os.path.join(gettempdir(), 'tree.biom')
    self.files_to_delete.extend([ta, tr])

    with biom_open(ta, 'w') as fhdf5:
        bt.to_hdf5(fhdf5, 'Table for unit testing')
    tree.write(tr)

    # missing table, existing tree
    with self.assertRaises(IOError):
        faith_pd('dne.biom', tr)
    # existing table, missing tree
    with self.assertRaises(IOError):
        faith_pd(ta, 'dne.tre')
def hash_otu_table(unhashed_otu_table, seqid_hash_dict, output_file):
    """Rename the observations of an OTU table and save it as BIOM.

    Every observation id of the table loaded from ``unhashed_otu_table``
    is replaced by its entry in ``seqid_hash_dict`` (a missing id raises
    ``KeyError``); the renamed table is written to ``output_file``.
    """
    frame = load_table(unhashed_otu_table).to_dataframe(dense=True)
    frame.index = [seqid_hash_dict[old_id] for old_id in frame.index]

    hashed_table = Table(frame.values,
                         list(frame.index),
                         list(frame.columns))
    with biom_open(output_file, "w") as handle:
        hashed_table.to_hdf5(handle, "Constructed using qiime1 clustering")