def test_validate_prefix(self):
    """Validate a BIOM whose sample ids lack the prep-template prefix.

    The table is written with sample ids ``S1``..``S3`` while the faked
    prep-template endpoint reports ``1.S1``..``1.S3`` (each with an
    ``orig_name``).  The validated copy in ``self.out_dir`` is expected to
    carry the prefixed ids.
    """
    # Fake the two HTTP endpoints that are registered for this job.
    httpretty.register_uri(
        httpretty.POST,
        "https://test_server.com/qiita_db/jobs/job-id/step/")
    prep_body = ('{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
                 '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')
    httpretty.register_uri(
        httpretty.GET,
        "https://test_server.com/qiita_db/prep_template/1/data",
        body=prep_body)

    # Write a small table whose sample ids are the un-prefixed names.
    handle, biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.asarray([[0, 0, 1], [1, 3, 42]])
    unprefixed = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    with biom_open(biom_fp, 'w') as out:
        unprefixed.to_hdf5(out, "Test")
    self._clean_up_files.append(biom_fp)

    self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, 'job-id', self.parameters, self.out_dir)

    exp_biom_fp = join(self.out_dir, basename(biom_fp))
    self._clean_up_files.append(exp_biom_fp)

    self.assertTrue(obs_success)
    self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
    self.assertEqual(obs_error, "")

    validated = load_table(exp_biom_fp)
    self.assertItemsEqual(validated.ids(), ["1.S1", "1.S2", "1.S3"])
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a phylogenetic beta-diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is cast to ``int`` and transposed
        before being handed to scikit-bio.
    phylogeny : skbio.TreeNode
        Tree passed to scikit-bio as ``tree``.
    metric : str
        Must be one of ``phylogenetic_metrics()``.
    n_jobs : int
        Forwarded to the pairwise function.  Must be 1 for
        ``'weighted_unifrac'``.

    Raises
    ------
    ValueError
        Unknown metric, empty table, or ``n_jobs != 1`` combined with
        ``'weighted_unifrac'``.
    skbio.tree.MissingNodeError
        Re-raised with ``otu_ids``/``tree`` replaced by
        ``feature_ids``/``phylogeny`` in the message.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if n_jobs != 1 and metric == 'weighted_unifrac':
        raise ValueError("Weighted UniFrac is not parallelizable")

    call_args = dict(
        metric=metric,
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )

    try:
        distances = skbio.diversity.beta_diversity(**call_args)
    except skbio.tree.MissingNodeError as err:
        # Reword scikit-bio's vocabulary into this function's own
        # parameter names before re-raising.
        reworded = str(err).replace('otu_ids', 'feature_ids')
        reworded = reworded.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(reworded)

    return distances
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta-diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is transposed before use.
    metric : str
        Must be one of ``non_phylogenetic_metrics()``.  ``'aitchison'`` is
        handled locally as the Euclidean distance between ``clr``
        transforms.
    pseudocount : int
        Added to every count, only when ``metric == 'aitchison'``.
    n_jobs : int
        Forwarded to the pairwise function.

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    dense_counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    if metric == 'aitchison':
        # In-place addition on the dense copy made above.
        dense_counts += pseudocount
        metric = aitchison

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=dense_counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
def main(table_in, table_out, pathways, to_classic):
    """Write a copy of a BIOM table restricted to the KOs of some pathways.

    Parameters
    ----------
    table_in : str
        Path handed to ``load_table``.
    table_out : str
        Path the filtered table is written to (opened in text write mode).
    pathways : iterable of str
        Pathway names.  Each is stripped and its last five characters are
        used as the key into the mapping returned by ``get_pathway2kos()``.
    to_classic : bool
        If truthy the table is written as tab-delimited text, otherwise as
        JSON.

    Raises
    ------
    EmptySetERROR
        If none of the table's observation ids belong to the requested
        pathways.
    """
    # setup
    table = load_table(table_in)
    pathway_dict = get_pathway2kos()

    # get set of kos from pathways
    pathways_kos = set()
    for pathway in pathways:
        pathways_kos |= pathway_dict[pathway.strip()[-5:]]

    # get selected kos
    kos_to_keep = set(table.ids('observation')) & pathways_kos
    if not kos_to_keep:
        raise EmptySetERROR('Intersection created empty set')

    obs_ids = np.array(list(kos_to_keep))
    sample_ids = table.ids('sample')
    data = np.empty([len(obs_ids), len(sample_ids)])
    for i, obs in enumerate(obs_ids):
        data[i] = table.data(obs, 'observation')

    # output
    new_table = Table(data, obs_ids, sample_ids, type="OTU table")
    # The context manager guarantees the output is flushed and closed on
    # both branches (the handle used to be left open).
    with open(table_out, 'w') as out:
        if to_classic:
            # tab delimited biom table
            out.write(new_table.to_tsv())
        else:
            # json biom table
            new_table.to_json("filter_KOs_by_pathway.py", out)
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn,
          mode: str) -> biom.Table:
    """Collapse one axis of ``table`` by the values of a metadata column.

    Parameters
    ----------
    table : biom.Table
        Table to collapse; must not be empty.
    axis : str
        ``'feature'`` is translated to biom's ``'observation'``; any other
        value is passed through unchanged.
    metadata : qiime2.CategoricalMetadataColumn
        Column supplying the group value for every id on ``axis``.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    def group_of(axis_id, _):
        return metadata.get_value(axis_id)

    collapsed = table.collapse(group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    group_order = metadata.to_series().unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
def test_execute_job_error(self):
    """A validate job whose table ids do not match the prep ends in error.

    The prep template is created with ``SKB8.640193``/``SKD8.640184`` while
    the BIOM table uses ``S1``/``S2``; the job status is expected to be
    ``'error'``.
    """
    # Create a prep template
    prep_info = {'SKB8.640193': {'col': 'val1'},
                 'SKD8.640184': {'col': 'val2'}}
    prep_payload = {'prep_info': dumps(prep_info),
                    'study': 1,
                    'data_type': '16S'}
    template = self.qclient.post(
        '/apitest/prep_template/', data=prep_payload)['prep']

    # Write the BIOM table the job will be asked to validate
    handle, biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.random.randint(100, size=(2, 2))
    table = Table(counts, ['O1', 'O2'], ['S1', 'S2'])
    with biom_open(biom_fp, 'w') as out:
        table.to_hdf5(out, "Test")

    # Create a new validate job
    job_parameters = dumps({'files': dumps({'biom': [biom_fp]}),
                            'template': template,
                            'artifact_type': 'BIOM'})
    job_payload = {'command': dumps(['BIOM type', '2.1.4', 'Validate']),
                   'parameters': job_parameters,
                   'artifact_type': 'BIOM',
                   'status': 'queued'}
    job_id = self.qclient.post(
        '/apitest/processing_job/', data=job_payload)['job']

    plugin("https://localhost:21174", job_id, self.out_dir)

    self.assertEqual(self._wait_job(job_id), 'error')
def merge(table1: biom.Table, table2: biom.Table) -> biom.Table:
    """Merge two tables that have no sample id in common.

    Raises
    ------
    ValueError
        If any sample id occurs in both tables; the message lists the
        shared ids.
    """
    shared_sids = (set(table1.ids(axis='sample')) &
                   set(table2.ids(axis='sample')))
    if shared_sids:
        raise ValueError('Some samples are present in both tables: %s' %
                         ', '.join(shared_sids))
    return table1.merge(table2)
def filter_table(table: biom.Table, tree: skbio.TreeNode) -> biom.Table:
    """Drop features from ``table`` whose ids are not tip names in ``tree``.

    The table is filtered in place (``inplace=True``) and the same object
    is returned.
    """
    tip_names = {tip.name for tip in tree.tips()}
    table_features = set(table.ids(axis='observation'))

    # Only ids that actually exist in the table may be handed to filter().
    shared = tip_names & table_features
    table.filter(shared, axis='observation', inplace=True)
    return table
def rarefy(table: biom.Table, sampling_depth: int) -> biom.Table:
    """Subsample every sample of ``table`` to ``sampling_depth``.

    Raises
    ------
    ValueError
        If the subsampled table is empty.
    """
    rarefied = table.subsample(sampling_depth, axis='sample', by_id=False)

    if not rarefied.is_empty():
        return rarefied

    raise ValueError('The rarefied table contains no samples or features. '
                     'Verify your table is valid and that you provided a '
                     'shallow enough sampling depth.')
def alpha(table: biom.Table, metric: str) -> pd.Series:
    """Compute a non-phylogenetic alpha-diversity value per sample.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is cast to ``int`` and transposed
        before use.
    metric : str
        Must be one of ``non_phylogenetic_metrics()``.  Also becomes the
        ``name`` of the returned series.

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    diversity = skbio.diversity.alpha_diversity(
        metric=metric,
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'))
    diversity.name = metric
    return diversity
def _1(data: biom.Table) -> BIOMV100Format:
    """Write ``data`` as JSON into a fresh ``BIOMV100Format``.

    Axis metadata is removed via ``_drop_axis_metadata`` before writing.
    """
    stripped = _drop_axis_metadata(data)
    result = BIOMV100Format()
    with result.open() as out:
        out.write(stripped.to_json(generated_by=_get_generated_by()))
    return result
def beta(table: biom.Table, metric: str,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta-diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is cast to ``int`` and transposed
        before use.
    metric : str
        Must be one of ``non_phylogenetic_metrics()``.
    n_jobs : int
        Forwarded to the pairwise function.

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    int_counts = table.matrix_data.toarray().astype(int).T
    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=int_counts,
        ids=table.ids(axis='sample'),
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
def rename_deblur_biom(biom, name_stub='deblur',
                       metadata_name='deblurred_seq'):
    """Return a copy of a table whose observation ids are short names.

    Every observation id is replaced by ``name_stub`` followed by its
    position (``deblur0``, ``deblur1``, ...), and the original id is stored
    as observation metadata under ``metadata_name``.

    Parameters
    ----------
    biom : biom.Table
        Table to rename.  It is not modified.
    name_stub : str
        Prefix of the generated observation ids.
    metadata_name : str
        Observation-metadata key that receives the original id.

    Returns
    -------
    biom.Table
        New table sharing ``biom.matrix_data``, with the sample ids and
        existing metadata carried over and the table id suffixed with
        ``' renamed'``.
    """
    seqs = biom.ids(axis='observation')
    seqnames = ['{0}{1}'.format(name_stub, i) for i in range(len(seqs))]
    seq_metadata = {seqname: {metadata_name: seq}
                    for seqname, seq in zip(seqnames, seqs)}

    # A table built without an explicit id has table_id None; concatenating
    # a str to None would raise TypeError, so fall back to a bare label.
    if biom.table_id is None:
        renamed_id = 'renamed'
    else:
        renamed_id = biom.table_id + ' renamed'

    renamed_biom = Table(biom.matrix_data,
                         seqnames,
                         biom.ids(axis='sample'),
                         biom.metadata(axis='observation'),
                         biom.metadata(axis='sample'),
                         table_id=renamed_id)
    renamed_biom.add_metadata(seq_metadata, axis='observation')
    return renamed_biom
def test_collapse_full(self):
    """collapse_full yields one 'average' sample with the expected values."""
    expected_values = [0.00769230769231,
                       0.0282051282051,
                       0.0487179487179,
                       0.0692307692308,
                       0.0897435897436,
                       0.110256410256,
                       0.130769230769,
                       0.151282051282,
                       0.171794871795,
                       0.192307692308]
    exp = Table(array([[value] for value in expected_values]),
                observ_ids, ['average'],
                observation_metadata=observ_metadata)

    obs = collapse_full(table)

    for row in range(10):
        assert_almost_equal(obs[row, 0], exp[row, 0])
    self.assertEqual(obs.ids(), exp.ids())
    self.assertItemsEqual(obs.ids('observation'), exp.ids('observation'))

    obs_meta = [md for _, _, md in obs.iter(axis='observation')]
    self.assertItemsEqual(obs_meta, observ_metadata)
def test_biom_match(self):
    """match() keeps only the samples shared by the table and metadata.

    The table has samples s2..s4 and the metadata s1..s3, so both results
    are expected to be restricted to s2 and s3.
    """
    table = Table(
        np.array([[0, 0, 1, 1],
                  [2, 3, 4, 4],
                  [5, 5, 3, 3]]).T,
        ['a', 'b', 'c', 'd'],
        ['s2', 's3', 's4'])
    md = pd.DataFrame(
        {
            'x1': [1, 3, 2],
            'x2': [1, 1, 0]
        },
        columns=['s1', 's2', 's3']
    ).T

    exp_table = Table(
        np.array(
            [
                [0, 0, 1, 1],
                [2, 3, 4, 4]
            ]).T,
        ['a', 'b', 'c', 'd'],
        ['s2', 's3'])
    exp_md = pd.DataFrame(
        {
            'x1': [3, 2],
            'x2': [1, 0]
        },
        columns=['s2', 's3']
    ).T

    res_table, res_md = match(table, md)

    exp_df = pd.DataFrame(exp_table.to_dataframe())
    res_df = pd.DataFrame(res_table.to_dataframe())

    # DataFrame.reindex_axis no longer exists in current pandas;
    # reindex(columns=...) / reindex(index=...) is the same operation.
    exp_df = exp_df.reindex(columns=sorted(exp_df.columns))
    res_df = res_df.reindex(columns=sorted(res_df.columns))
    pdt.assert_frame_equal(exp_df, res_df)

    exp_md = exp_md.reindex(index=sorted(exp_md.index))
    res_md = res_md.reindex(index=sorted(res_md.index))
    pdt.assert_frame_equal(res_md, exp_md)
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta-diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is transposed before use.
    metric : str
        Must be one of ``non_phylogenetic_metrics()``.  ``'aitchison'``
        and ``'canberra_adkins'`` are implemented by local functions.
    pseudocount : int
        Added to every count, only when ``metric == 'aitchison'``.
    n_jobs : int
        Forwarded to the pairwise function.

    Raises
    ------
    ValueError
        Unknown metric, empty table, or (from the Canberra-Adkins
        function) a negative value in either sample.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    dense_counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    def canberra_adkins(x, y, **kwds):
        if (x < 0).any() or (y < 0).any():
            raise ValueError("Canberra-Adkins is only defined over positive "
                             "values.")

        # Only positions where at least one sample is non-zero contribute.
        present = ((x > 0) | (y > 0))
        xs = x[present]
        ys = y[present]
        n_present = present.sum()
        # NOTE(review): n_present is 0 when both samples are all zeros, in
        # which case this divides by zero - confirm that cannot occur.
        return (1. / n_present) * np.sum(np.abs(xs - ys) / (xs + ys))

    if metric == 'aitchison':
        # In-place addition on the dense copy made above.
        dense_counts += pseudocount
    local_metrics = {'aitchison': aitchison,
                     'canberra_adkins': canberra_adkins}
    metric = local_metrics.get(metric, metric)

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=dense_counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
def generate_per_sample_biom(biom_file, limit):
    """Generate per-sample BIOM files

    Parameters
    ----------
    biom_file : str
        A filepath to a BIOM table
    limit : int or None
        Maximum number of samples to emit; ``None`` means no limit

    Yields
    ------
    str
        The sample ID
    str
        The table in BIOM Format v1.0
    str
        The table in the classic OTU table format
    """
    table = load_table(biom_file)
    obs_ids = table.ids(axis='observation')
    obs_md = table.metadata(axis='observation')

    max_samples = np.inf if limit is None else limit

    for emitted, (values, sample, _) in enumerate(table.iter()):
        if emitted >= max_samples:
            break

        # One-column table holding just this sample, restricted to the
        # observations with a positive value.
        single_sample = Table(values[:, np.newaxis], obs_ids, [sample],
                              obs_md)
        single_sample.filter(lambda v_, i, md: v_ > 0, axis='observation')

        as_json = single_sample.to_json('AG')
        as_tsv = single_sample.to_tsv(
            header_key='taxonomy',
            header_value='taxonomy',
            metadata_formatter=lambda x: '; '.join(x))

        yield (sample, as_json, as_tsv)
def alpha_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                       metric: str) -> pd.Series:
    """Compute a phylogenetic alpha-diversity value per sample.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is cast to ``int`` and transposed
        before use.
    phylogeny : skbio.TreeNode
        Tree passed to scikit-bio as ``tree``.
    metric : str
        Must be one of ``phylogenetic_metrics()``.  Also becomes the
        ``name`` of the returned series.

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    skbio.tree.MissingNodeError
        Re-raised with ``otu_ids``/``tree`` replaced by
        ``feature_ids``/``phylogeny`` in the message.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    # Same guard, with the same message, as the other diversity entry
    # points: reject an empty table up front.
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        result = skbio.diversity.alpha_diversity(metric=metric,
                                                 counts=counts,
                                                 ids=sample_ids,
                                                 otu_ids=feature_ids,
                                                 tree=phylogeny)
    except skbio.tree.MissingNodeError as e:
        # Reword scikit-bio's vocabulary into this function's own
        # parameter names before re-raising.
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(message)

    result.name = metric
    return result
def collapse_full(_bt):
    """Collapse a biom table to a single sample of per-OTU medians

    Parameters
    ----------
    _bt : biom table
        Table to collapse

    Returns
    -------
    biom table
        One-sample table (sample id ``'average'``) holding the median of
        each OTU, normalized, with the observation metadata of ``_bt``.
    """
    obs_ids = _bt.ids(axis='observation')
    medians = np.array(
        [np.median(row) for row in _bt.iter_data(axis='observation')])
    as_column = medians.reshape((len(obs_ids), 1))

    collapsed = Table(as_column, obs_ids, ['average'],
                      observation_metadata=_bt.metadata(axis='observation'))
    collapsed.norm(inplace=True)
    return collapsed
def setUp(self):
    """Create a fake-authenticated client, a small BIOM file and an
    output directory, and record the files to clean up."""
    # Register the URIs for the QiitaClient
    token_body = ('{"access_token": "token", "token_type": "Bearer", '
                  '"expires_in": "3600"}')
    httpretty.register_uri(
        httpretty.POST,
        "https://test_server.com/qiita_db/authenticate/",
        body=token_body)

    self.qclient = QiitaClient('https://test_server.com', 'client_id',
                               'client_secret')

    # Create a biom table
    handle, self.biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.asarray([[0, 0, 1], [1, 3, 42]])
    table = Table(counts, ['O1', 'O2'], ['1.S1', '1.S2', '1.S3'])
    with biom_open(self.biom_fp, 'w') as out:
        table.to_hdf5(out, "Test")

    self.out_dir = mkdtemp()
    self.artifact_id = 4
    self.parameters = {'input_data': self.artifact_id}

    self._clean_up_files = [self.biom_fp, self.out_dir]
def _create_job_and_biom(self, sample_ids, template=None, analysis=None):
    """Write a random 2-observation BIOM table and register a validate job.

    Returns
    -------
    tuple
        ``(biom_fp, job_id, parameters)``: the path of the table written,
        the id of the job created, and the parameters dict submitted with
        it.
    """
    # Create the BIOM table that needs to be validated
    handle, biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.random.randint(100, size=(2, len(sample_ids)))
    table = Table(counts, ['O1', 'O2'], sample_ids)
    with biom_open(biom_fp, 'w') as out:
        table.to_hdf5(out, "Test")
    self._clean_up_files.append(biom_fp)

    # Create a new job
    parameters = {'template': template,
                  'files': dumps({'biom': [biom_fp]}),
                  'artifact_type': 'BIOM',
                  'analysis': analysis}
    job_payload = {'command': dumps(['BIOM type', '2.1.4', 'Validate']),
                   'parameters': dumps(parameters),
                   'status': 'running'}
    response = self.qclient.post('/apitest/processing_job/',
                                 data=job_payload)

    return biom_fp, response['job'], parameters
def filter_seqs(data: pd.Series, table: biom.Table = None,
                metadata: qiime2.Metadata = None, where: str = None,
                exclude_ids: bool = False) -> pd.Series:
    """Filter sequences by the features of a table or by metadata.

    Exactly one of ``table`` and ``metadata`` must be given.

    Parameters
    ----------
    data : pd.Series
        Sequences indexed by feature id.
    table : biom.Table, optional
        Keep the entries whose id is an observation id of this table.
    metadata : qiime2.Metadata, optional
        Keep the entries whose id is returned by
        ``metadata.get_ids(where=where)``.
    where : str, optional
        Passed to ``metadata.get_ids``; only used with ``metadata``.
    exclude_ids : bool
        If truthy, invert the selection: keep every entry of ``data`` whose
        id was *not* selected.

    Raises
    ------
    ValueError
        Both or neither of ``table``/``metadata`` given, or nothing left
        after filtering.
    """
    if table is not None and metadata is not None:
        raise ValueError('Filtering with metadata and filtering with a table '
                         'are mutually exclusive.')
    elif table is None and metadata is None:
        raise ValueError('No filtering requested. Must provide either table '
                         'or metadata.')
    elif table is not None:
        ids_to_keep = table.ids(axis='observation')
    else:
        # Note, no need to check for missing feature IDs in the metadata,
        # because that is basically the point of this method.
        ids_to_keep = metadata.get_ids(where=where)

    # Truthiness rather than ``is True``: an identity test would silently
    # ignore a truthy flag that is not the ``True`` singleton (for example
    # ``numpy.bool_``).
    if exclude_ids:
        ids_to_keep = set(data.index) - set(ids_to_keep)

    filtered = data[data.index.isin(ids_to_keep)]
    if filtered.empty:
        raise ValueError('All features were filtered out of the data.')
    return filtered
class TestFilters(unittest.TestCase):
    """Tests for ``match_and_filter`` and ``split_training``."""

    @staticmethod
    def _metadata_frame(categorical, train):
        """Build a metadata frame with one row per entry of ``categorical``.

        Rows are labelled ``s1``..``sN``.  The ``continuous`` column is
        cast to float64 after construction; the other columns keep
        whatever dtype the stacked array produced.
        """
        n = len(categorical)
        frame = pd.DataFrame(
            np.vstack(
                (
                    np.ones(n),
                    np.array(categorical),
                    np.arange(n).astype(np.float64),
                    np.array(train)
                )
            ).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s%d' % i for i in range(1, n + 1)]
        )
        frame['continuous'] = frame['continuous'].astype(np.float64)
        return frame

    @staticmethod
    def _design_frame():
        """Build the six-sample design matrix used throughout the tests."""
        return pd.DataFrame(
            np.vstack(
                (
                    np.ones(6),
                    np.array([0, 0, 1, 1, 0, 0]),
                    np.arange(6).astype(np.float64)
                )
            ).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6']
        )

    def setUp(self):
        counts = np.array(
            [[10, 1, 4, 1, 4, 0],
             [0, 0, 2, 0, 2, 8],
             [0, 1, 2, 1, 2, 4],
             [0, 1, 0, 1, 0, 0],
             [2, 0, 0, 0, 0, 0],
             [1, 0, 0, 0, 0, 0],
             [7, 1, 0, 1, 0, 0]]
        )
        oids = ['o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7']
        sids = ['s1', 's2', 's3', 's4', 's5', 's6']

        # Eight metadata rows: two more than the table has samples.
        self.metadata = self._metadata_frame(
            ['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a'],
            ['Test', 'Test', 'Train', 'Train',
             'Train', 'Train', 'Test', 'Train'])
        self.trimmed_metadata = self.metadata.loc[
            ['s1', 's2', 's3', 's4', 's5', 's6']
        ]
        self.table = Table(counts, oids, sids)

    def test_match_and_filter_no_filter(self):
        formula = 'C(categorical) + continuous'
        res_table, res_metadata, res_design = match_and_filter(
            self.table, self.metadata, formula,
            min_sample_count=0, min_feature_count=0)

        pdt.assert_frame_equal(res_table.to_dataframe(),
                               self.table.to_dataframe())

        exp_metadata = self._metadata_frame(
            ['a', 'a', 'b', 'b', 'a', 'a'],
            ['Test', 'Test', 'Train', 'Train', 'Train', 'Train'])
        pdt.assert_frame_equal(res_metadata, exp_metadata)

        pdt.assert_frame_equal(res_design, self._design_frame())

    def test_split_training_random(self):
        np.random.seed(0)
        design = self._design_frame()

        trainX, testX, trainY, testY = split_training(
            self.table.to_dataframe().T,
            self.trimmed_metadata, design,
            training_column=None,
            num_random_test_examples=2)

        # Two of the six samples are held out for testing.
        npt.assert_allclose(trainX.shape, np.array([4, 3]))
        npt.assert_allclose(trainY.shape, np.array([4, 7]))
        npt.assert_allclose(testX.shape, np.array([2, 3]))
        npt.assert_allclose(testY.shape, np.array([2, 7]))

    def test_split_training_fixed(self):
        np.random.seed(0)
        design = self._design_frame()
        samples_by_features = self.table.to_dataframe().T

        res_trainX, res_testX, res_trainY, res_testY = split_training(
            samples_by_features, self.metadata, design,
            training_column='train',
            num_random_test_examples=2)

        # The first two rows are expected as the test split, the
        # remaining rows as the training split.
        exp_trainX = design.iloc[2:].values
        exp_testX = design.iloc[:2].values
        exp_trainY = samples_by_features.iloc[2:].values
        exp_testY = samples_by_features.iloc[:2].values

        npt.assert_allclose(exp_trainX, res_trainX)
        npt.assert_allclose(exp_trainY, res_trainY)
        npt.assert_allclose(exp_testX, res_testX)
        npt.assert_allclose(exp_testY, res_testY)
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write alpha-rarefaction data files and a rendered index page.

    For every metric a ``<metric>.csv`` file and one or more ``.jsonp``
    files are written to ``output_dir``; the ``index.html`` template is
    then rendered there and the ``dist`` assets directory is copied in.

    Parameters
    ----------
    output_dir : str
        Directory all output is written to.
    table : biom.Table
        Feature table to rarefy; must not be empty.
    max_depth : int
        Upper rarefaction depth.  Must be greater than ``min_depth`` and
        not greater than the largest per-sample total of ``table``.
    phylogeny : skbio.TreeNode, optional
        Needed when any requested metric is in ``phylogenetic_metrics()``.
    metrics : set, optional
        Metrics to compute.  ``None`` selects ``observed_otus`` and
        ``shannon``, plus ``faith_pd`` when ``phylogeny`` is given.
    metadata : qiime2.Metadata, optional
        Sample metadata; only categorical columns that are not entirely
        missing are used.
    min_depth : int
        Lower rarefaction depth.
    steps : int
        Must not exceed ``max_depth - min_depth``.
    iterations : int
        Passed through to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        For an empty ``metrics`` set, a phylogenetic metric without
        ``phylogeny``, inconsistent depth/step arguments, an empty table,
        or metadata with nothing left after column filtering.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        # Names of the columns removed by the filter above; reported to the
        # template as 'filtered_columns'.
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        # Give the metadata frame two-level column labels (name, '').
        # NOTE(review): presumably so it can be joined with the two-level
        # (depth, iteration) columns of the rarefaction frames - confirm.
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    # NOTE: the loop variable rebinds `data` to each metric's frame; the
    # mapping itself is only needed to create the iterator.
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            # No metadata: a single summary keyed by sample id.
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            # One summary (and one .jsonp file) per metadata column.
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            # Flatten the column labels into 'depth-<d>_iter-<i>' strings
            # before writing the CSV.
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def match_biom_tables(observed_table, expected_table_keep, verbose=False,
                      limit_to_expected_observations=False,
                      limit_to_observed_observations=False,
                      normalize=False, shuffle_samples=False):
    """Align an observed and an expected table and extract per-sample data.

    Both tables are brought to the same set and order of observation ids
    (missing observations are added as all-zero rows), and the data
    vectors of the samples present in both are returned.

    Parameters
    ----------
    observed_table : biom Table
        The observed table.
    expected_table_keep : biom Table
        The expected table; a copy is made before anything is changed.
    verbose : bool
        Print progress information.
    limit_to_expected_observations : bool
        Restrict the observed table to observation ids shared by both.
    limit_to_observed_observations : bool
        Restrict the expected table to observation ids shared by both.
    normalize : bool
        Normalize both tables per sample before extracting data.
    shuffle_samples : bool
        Pair each shared sample id with the observed data of a randomly
        chosen shared sample instead of its own.

    Returns
    -------
    tuple of dict
        ``(obs_data, exp_data)``, each mapping sample id to data vector.

    Raises
    ------
    ValueError
        If the two tables share no observation ids.
    """
    expected_table = expected_table_keep.copy()

    # A set gives O(1) membership tests in the filter callback below.
    overlapping_obs_ids = (set(observed_table.ids(axis='observation')) &
                           set(expected_table.ids(axis='observation')))

    if len(overlapping_obs_ids) < 1:
        # Parenthesised single-argument print and call-style raise are
        # valid on both Python 2 and Python 3.
        print("obs ids: %s" % (observed_table.ids(axis='observation')[0:10],))
        print("exp ids: %s" % (expected_table.ids(axis='observation')[0:10],))
        raise ValueError(
            "No observation ids are in common between the observed and expected tables, so no evaluations can be performed.")

    def in_overlap(data_vector, id_, metadata):
        return id_ in overlapping_obs_ids

    if limit_to_expected_observations:
        observed_table = observed_table.filter(in_overlap,
                                               axis='observation',
                                               inplace=False)

    if limit_to_observed_observations:
        expected_table = expected_table.filter(in_overlap,
                                               axis='observation',
                                               inplace=False)

    # Make tables have same set (e.g. number) of ObservationIds and in the
    # same order
    # 1) identify ObservationIds unique to each table
    unique_obs_in_expected = list(
        set(expected_table.ids(axis='observation')) -
        set(observed_table.ids(axis='observation')))
    unique_obs_in_observed = list(
        set(observed_table.ids(axis='observation')) -
        set(expected_table.ids(axis='observation')))

    # 2) Add each missing observation with all 0's
    if unique_obs_in_observed:
        empty_obs_data = [[0] * len(expected_table.ids())] * \
            len(unique_obs_in_observed)
        empty_obs_table = Table(empty_obs_data, unique_obs_in_observed,
                                expected_table.ids())
        expected_table = expected_table.merge(empty_obs_table)

    if unique_obs_in_expected:
        empty_obs_data = [[0] * len(observed_table.ids())] * \
            len(unique_obs_in_expected)
        empty_obs_table = Table(empty_obs_data, unique_obs_in_expected,
                                observed_table.ids())
        observed_table = observed_table.merge(empty_obs_table)

    # 3) sort the ObservationIds so they are in the same order between the
    # tables
    if verbose:
        print("Sorting observations in expected table to match observed table...")
    expected_table = expected_table.sort_order(
        observed_table.ids(axis='observation'), axis='observation')

    overlapping_sample_ids = list(
        set(observed_table.ids()) & set(expected_table.ids()))

    if verbose:
        num_uniq_obs_sample_ids = len(
            observed_table.ids()) - len(overlapping_sample_ids)
        num_uniq_exp_sample_ids = len(
            expected_table.ids()) - len(overlapping_sample_ids)
        if num_uniq_obs_sample_ids:
            print("Num observed samples not in expected: {0}".format(
                num_uniq_obs_sample_ids))
        if num_uniq_exp_sample_ids:
            print("Num expected samples not in observed: {0}".format(
                num_uniq_exp_sample_ids))
        print("Num samples with same id: {0}".format(
            len(overlapping_sample_ids)))

    if normalize:
        if verbose:
            print("Normalizing tables...")
        observed_table = observed_table.norm(axis='sample', inplace=False)
        expected_table = expected_table.norm(axis='sample', inplace=False)

    if verbose:
        print("Extracting data from biom objects...")

    # The data is needed as numpy arrays keyed by sample id, so it is
    # collected directly rather than by filtering the tables.
    exp_data = {}
    for sample_id in overlapping_sample_ids:
        exp_data[sample_id] = expected_table.data(sample_id)

    obs_data = {}
    if shuffle_samples:
        if verbose:
            print("Randomly shufflying sample ids...")
        sample_ids_to_shuffle = overlapping_sample_ids[:]
        shuffle(sample_ids_to_shuffle)
        # Each shared id receives the observed data of a shuffled id.
        for sample_id, shuffled_id in zip(overlapping_sample_ids,
                                          sample_ids_to_shuffle):
            obs_data[sample_id] = observed_table.data(shuffled_id)
    else:
        for sample_id in overlapping_sample_ids:
            obs_data[sample_id] = observed_table.data(sample_id)

    return obs_data, exp_data
from biom import Table from numpy import array REGULAR_BIOM_TABLE = Table(data=array([ [6.0, 0.0], [141.0, 67.0], [0.0, 6.0], [260.0, 601.0], [6128.0, 393.0], [35.0, 0.0], [0.0, 262.0], [0.0, 7.0], [19.0, 0.0] ]), observation_ids=['Dill cryptic virus 2', 'Enterobacteria phage T4', 'Hepatitis C virus', 'Human papillomavirus type 90', 'Lactobacillus phage Lv 1', 'Merkel cell polyomavirus', 'Mycobacterium phage Adler', 'Propionibacterium phge P105', 'Staphylococcus phage PH15'], sample_ids=['vag_intr_SRS014465.fasta', 'vag_intr_SRS015071.fasta'], sample_metadata=[{'name': 'vag_intr_SRS014465.fasta', 'file_id': '762b8657-bc6c-4c2f-8572-6be4df1adfc9', 'dataset_id': 'c1a84ab2-bca8-414b-9132-de4981426ba1', 'reads_total': 875954, 'label': 'label0', 'label_name': 'No Label'}, {'name': 'vag_intr_SRS015071.fasta', 'file_id': '8be2175c-6106-459f-859b-1e8f8bc6b0e8', 'dataset_id': '11d395b8-abfc-4571-aab6-e734b5d33885', 'reads_total': 507176, 'label': 'label0', 'label_name': 'No Label'}], table_id='05775205-8479-4346-9866-a45bdb449d70', type="OTU table") REGULAR_BIOM_SAMPLE_META = {'vag_intr_SRS014465.fasta': {'id': 'vag_intr_SRS014465.fasta', 'metadata': {'name': 'vag_intr_SRS014465.fasta', 'file_id': '762b8657-bc6c-4c2f-8572-6be4df1adfc9',
def percentile_normalize(table: biom.Table,
                         metadata: qiime2.MetadataColumn,
                         batch: qiime2.MetadataColumn = None,
                         n_control_thresh: int = 10,
                         otu_thresh: float = 0.3) -> biom.Table:
    """
    Convert a case/control table into percentiles of its control samples.

    Parameters
    ----------
    table : biom.Table
        Feature table with relative abundances. Samples are in columns,
        features (i.e. OTUs) are in rows.
    metadata : qiime2.MetadataColumn
        Column labelling samples as "case" or "control". Samples with a
        missing value are removed from the table before normalization.
    batch : qiime2.MetadataColumn, optional
        Column labelling the batch of each sample. When given, percentile
        normalization is performed within each batch and the per-batch
        results are concatenated. This allows several studies to be
        normalized at once: merge the feature tables, add a study ID
        column to the merged metadata, and pass that column here.
    n_control_thresh : int [default=10]
        Minimum number of controls required per batch. Because the
        transformation converts abundances in controls to a uniform
        distribution, normalizing datasets with fewer than 30 controls is
        *highly* discouraged, and certainly not fewer than 10 (the default
        value). Fewer controls than this threshold is an error.
    otu_thresh : float [default=0.3]
        The OTU filtering threshold, in [0, 1], forwarded to the per-batch
        normalization: OTUs must be present in at least this fraction of
        cases OR controls, otherwise they are thrown out and not
        normalized. The method does not perform well with very sparse
        OTUs, so lowering this below 0.3 is not recommended.

    Returns
    -------
    norm_biom : biom.Table
        A biom table built from the concatenated per-batch normalized
        data.

    Raises
    ------
    ValueError
        If a batch has no cases, no controls, or fewer controls than
        ``n_control_thresh``.
    """
    # Filter metadata to only include IDs present in the table.
    # Also ensures every table ID is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))
    metadata = metadata.drop_missing_values()

    # Exclude from the table the samples that were dropped from the
    # metadata due to missing values
    table = table.filter(metadata.ids)
    metadata = metadata.to_series()

    # Dense dataframe, transposed so samples are in rows and
    # OTUs/features in columns
    df = table.to_dataframe().to_dense().T

    # One metadata series per batch (a single one without batches)
    if batch is None:
        batches_to_norm = [metadata]
    else:
        batch = batch.filter_ids(table.ids(axis='sample'))
        batch = batch.drop_missing_values()
        batch = batch.to_series()
        batches_to_norm = [metadata.loc[one_batch.index]
                           for _, one_batch in batch.groupby(batch)]

    norm_dfs = []
    for meta in batches_to_norm:
        # Get case and control samples from metadata
        control_samples = meta[meta == "control"].index.tolist()
        case_samples = meta[meta == "case"].index.tolist()

        # Check that there are cases and controls
        if not control_samples and not case_samples:
            raise ValueError(
                'There are no case or control samples in your data. Check the metadata column for "case" and "control" labels.'
            )
        if not control_samples:
            raise ValueError(
                'There are no control samples in your data. Check the metadata column for "control" labels.'
            )
        if not case_samples:
            raise ValueError(
                'There are no case samples in your data. Check the metadata column for "case" labels.'
            )

        # Make sure there are enough controls to perform normalization
        if len(control_samples) < n_control_thresh:
            batch_err = ''
            if batch is not None:
                # Name the offending batch in the error message.
                batch_err = ' in batch ' + str(
                    batch.loc[meta.index].unique()[0])
            raise ValueError(
                "There aren't enough controls in your data. " + batch_err
                + "(n_control_thresh = {})".format(n_control_thresh))

        # Filter OTUs and percentile normalize this batch
        norm_dfs.append(
            _percentile_normalize_one_df(df, control_samples, case_samples,
                                         otu_thresh))

    # Merge all normalized data
    # Keep all samples and all OTUs - OTUs not present in one batch will be
    # NaNs
    merged = pd.concat(norm_dfs, axis=1)

    # Put this dataframe into biom format
    return biom.Table(data=merged.values,
                      observation_ids=merged.index,
                      sample_ids=merged.columns)
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.

    Each entry of ``input_fps`` is processed into ``<output_dir>/<i>/``. The
    ``new_refseqs.fna`` written by iteration ``i`` becomes the reference
    collection of iteration ``i + 1``. Iterations whose output already
    exists (per ``iteration_output_exists``) are skipped, which allows a
    failed run to be resumed. Afterwards the per-iteration OTU tables are
    merged, a master representative set is built, and taxonomy assignment
    and/or alignment and tree building are optionally run on the merged
    results.

    Parameters
    ----------
    input_fps : iterable of str
        Input sequence files, one per iteration, processed in order.
    refseqs_fp : str
        Reference collection for the first iteration.
    output_dir : str
        Directory receiving all output; created via ``create_dir``.
    percent_subsample, command_handler, params, qiime_config, parallel,
    suppress_step4, suppress_md5, denovo_otu_picking_method,
    reference_otu_picking_method, status_update_callback
        Forwarded unchanged to ``pick_subsampled_open_reference_otus``
        (some are also forwarded to ``assign_tax``/``align_and_tree``).
    new_ref_set_id : str
        Prefix; iteration ``i`` uses ``"<new_ref_set_id>.<i>"``.
    prefilter_refseqs_fp : str, optional
        Reference collection for the pre-filter. Defaults to the initial
        ``refseqs_fp`` and is NOT updated between iterations.
    prefilter_percent_id
        Forwarded to ``pick_subsampled_open_reference_otus``.
    min_otu_size : int
        Used in the ``otu_table_mc<min_otu_size>`` file names and forwarded.
    run_assign_tax, run_align_and_tree : bool
        Whether to run the respective post-processing on the merged output.
        Both are always disabled for the individual iterations.
    step1_otu_map_fp, step1_failures_fasta_fp : str, optional
        Only passed to the first iteration.
    logger : optional
        Workflow logger; one is created when ``None``.

    Notes
    -----
    The logger is closed unconditionally on return, including a logger
    supplied by the caller.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    # if the user has not passed a different reference collection for the
    # pre-filter, used the input refseqs_fp for all iterations. we want to
    # pre-filter all data against the input data as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback)
        # perform post-iteration file shuffling whether the previous
        # iteration's data previously existed or was just computed.

        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' % (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has
    # historically been a frequent failure, so is sometimes run manually in
    # failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild."
                % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            # Keep the failures file open only while it is being consumed so
            # the handle is always released.
            # NOTE(review): mode 'U' is kept from the original; it is no
            # longer accepted by Python >= 3.11 -- confirm the target
            # interpreter before changing it.
            with open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    table,
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # NOTE: closes the logger even when it was supplied by the caller.
    logger.close()
def rarefy(table: biom.Table, sampling_depth: int) -> biom.Table:
    """Subsample `table` along its sample axis.

    Parameters
    ----------
    table : biom.Table
        Table to subsample.
    sampling_depth : int
        Depth handed to ``table.subsample`` as its first argument.

    Returns
    -------
    biom.Table
        Whatever ``table.subsample(..., axis='sample', by_id=False)``
        returns.
    """
    rarefied_table = table.subsample(
        sampling_depth, axis='sample', by_id=False)
    return rarefied_table
def deposit(output_dir, table1, table2, metadata, U, V, B, it, rep):
    """ Writes down tables, metadata, ranks and model matrices into files.

    Every file is written under `output_dir` and its name carries the
    suffix ``<it>_<letter>``, where ``<letter>`` is the `rep`-th character
    of the lowercase ASCII alphabet.

    Parameters
    ----------
    output_dir : str
        output directory
    table1 : DataFrame-like
        Microbe abundances. Only ``.sum(axis=0)``, ``.loc``, ``.values``,
        ``.columns`` and ``.index`` are used; the index becomes the BIOM
        sample ids and the columns the observation ids.  Columns whose sum
        is not positive are dropped.
    table2 : DataFrame-like
        Metabolite abundances, handled like `table1`.
    metadata : DataFrame-like
        Sample metadata, written with ``to_csv`` (tab separated, index
        labelled ``#SampleID``).
    U : np.array
        Microbial latent variables
    V : np.array
        Metabolite latent variables
    B : np.array
        Matrix saved as-is with ``np.savetxt``.
    it : int
        iteration number
    rep : int
        repetition number (index into a 26 letter alphabet)
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    # All file names share the same three formatting arguments.
    name_args = (output_dir, it, choice[rep])
    output_microbes = "%s/table_microbes.%d_%s.biom" % name_args
    output_metabolites = "%s/table_metabolites.%d_%s.biom" % name_args
    output_md = "%s/metadata.%d_%s.txt" % name_args
    output_U = "%s/U.%d_%s.txt" % name_args
    output_V = "%s/V.%d_%s.txt" % name_args
    output_B = "%s/B.%d_%s.txt" % name_args
    output_ranks = "%s/ranks.%d_%s.txt" % name_args

    # Keep only the columns with a strictly positive total.
    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    kept_microbes = table1.loc[:, idx1]
    kept_metabolites = table2.loc[:, idx2]

    # BIOM tables are observation x sample, hence the transpose.
    microbe_biom = Table(kept_microbes.values.T,
                         kept_microbes.columns, kept_microbes.index)
    metabolite_biom = Table(kept_metabolites.values.T,
                            kept_metabolites.columns, kept_metabolites.index)

    with biom_open(output_microbes, 'w') as f:
        microbe_biom.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        metabolite_biom.to_hdf5(f, generated_by='moi2')

    # Prepend a zero column to U @ V, apply softmax then clr, and subset the
    # result with the same masks used for the two tables.
    zero_column = np.zeros((U.shape[0], 1))
    ranks = clr(softmax(np.hstack((zero_column, U @ V))))
    ranks = ranks[idx1, :][:, idx2]
    ranks = pd.DataFrame(
        ranks,
        index=microbe_biom.ids(axis='observation'),
        columns=metabolite_biom.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')

    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    for path, matrix in ((output_B, B), (output_U, U), (output_V, V)):
        np.savetxt(path, matrix)
def relative_frequency(table: biom.Table) -> biom.Table:
    """ Convert feature table in-place from frequencies to relative
        frequencies

    Calls ``table.norm`` on the sample axis with ``inplace=True``; the
    object passed in is modified and that same object is returned.
    """
    table.norm(inplace=True, axis='sample')
    return table
def transpose(table: biom.Table) -> biom.Table:
    """Return the result of calling ``table.transpose()``."""
    return table.transpose()
def presence_absence(table: biom.Table) -> biom.Table:
    """ Convert feature table in-place to presence/absence data

    Calls ``table.pa(inplace=True)``; the object passed in is modified and
    that same object is returned.
    """
    pa_table = table
    pa_table.pa(inplace=True)
    return pa_table
def ctf_helper(
    table: biom.Table,
    sample_metadata: DataFrame,
    individual_id_column: str,
    state_columns: list,
    n_components: int = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations_als: int = DEFAULT_TENSALS_MAXITER,
    max_iterations_rptm: int = DEFAULT_TENSALS_MAXITER,
    n_initializations: int = DEFAULT_TENSALS_MAXITER,
    feature_metadata: DataFrame = DEFFM
) -> (dict, OrdinationResults, dict, tuple):
    """ Runs Compositional Tensor Factorization CTF.

    Parameters
    ----------
    table : biom.Table
        Feature table. NOTE: it is filtered in place (``inplace=True``), so
        the caller's object is modified.
    sample_metadata : DataFrame
        Sample metadata, or an object exposing ``to_dataframe()``.
    individual_id_column : str
        Metadata column passed to the tensor builder as the individual id.
    state_columns : list
        Metadata columns passed to the tensor builder as the states.
    n_components : int
        Number of components for the factorization.  When it equals 2 a
        zero-filled ``PC3`` is appended to the outputs.
    min_sample_count, min_feature_count : int
        Minimum sums required to keep a sample / a feature.
    max_iterations_als, max_iterations_rptm, n_initializations : int
        Forwarded to ``TensorFactorization``.
    feature_metadata : DataFrame, optional
        Feature metadata, or an object exposing ``to_dataframe()``.  When
        given, only features present in both it and `table` are kept.

    Returns
    -------
    tuple
        ``(state_ordn, subj_ordin, distances, subject_trajectories,
        feature_trajectories)``; every dict is keyed by condition.
        NOTE(review): five values are returned although the annotation
        lists four -- confirm before relying on the annotation.

    Raises
    ------
    ValueError
        If `table` has duplicate sample or feature ids, or if it shares no
        samples with `sample_metadata` / no features with
        `feature_metadata`.
    """
    # validate the metadata using q2 as a wrapper
    if sample_metadata is not None and not isinstance(sample_metadata,
                                                      DataFrame):
        sample_metadata = sample_metadata.to_dataframe()
    keep_cols = state_columns + [individual_id_column]
    # everything except the state/individual columns is merged back into
    # the subject trajectories further down
    all_sample_metadata = sample_metadata.drop(keep_cols, axis=1)
    sample_metadata = sample_metadata[keep_cols]
    # validate the feature metadata using q2 as a wrapper
    if feature_metadata is not None and not isinstance(feature_metadata,
                                                       DataFrame):
        feature_metadata = feature_metadata.to_dataframe()
    # match the data (borrowed in part from gneiss.util.match)
    subtablefids = table.ids('observation')
    subtablesids = table.ids('sample')
    if len(subtablesids) != len(set(subtablesids)):
        raise ValueError('Data-table contains duplicate sample IDs')
    if len(subtablefids) != len(set(subtablefids)):
        raise ValueError('Data-table contains duplicate feature IDs')
    submetadataids = set(sample_metadata.index)
    subtablesids = set(subtablesids)
    subtablefids = set(subtablefids)
    if feature_metadata is not None:
        submetadatafeat = set(feature_metadata.index)
        fidx = subtablefids & submetadatafeat
        if len(fidx) == 0:
            raise ValueError(("No more features left. Check to make "
                              "sure that the feature names between "
                              "`feature-metadata` and `table` are "
                              "consistent"))
        # reindex needs an ordered collection; a raw set is rejected by
        # recent pandas releases
        feature_metadata = feature_metadata.reindex(list(fidx))
    sidx = subtablesids & submetadataids
    if len(sidx) == 0:
        raise ValueError(("No more samples left. Check to make sure that "
                          "the sample names between `sample-metadata` and"
                          " `table` are consistent"))
    if feature_metadata is not None:
        table.filter(list(fidx), axis='observation', inplace=True)
    table.filter(list(sidx), axis='sample', inplace=True)
    sample_metadata = sample_metadata.reindex(list(sidx))

    # filter and import table
    for axis, min_sum in zip(['sample',
                              'observation'],
                             [min_sample_count,
                              min_feature_count]):
        table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
                             axis=axis, inplace=True)

    # table to dataframe
    table = DataFrame(table.matrix_data.toarray(),
                      table.ids('observation'),
                      table.ids('sample'))

    # tensor building
    tensor = build()
    tensor.construct(table, sample_metadata,
                     individual_id_column,
                     state_columns)

    # factorize
    TF = TensorFactorization(
        n_components=n_components,
        max_als_iterations=max_iterations_als,
        max_rtpm_iterations=max_iterations_rptm,
        n_initializations=n_initializations).fit(
            tensor_rclr(tensor.counts))
    # label tensor loadings
    TF.label(tensor, taxonomy=feature_metadata)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    if n_components == 2:
        TF.subjects.loc[:, 'PC3'] = [0] * len(TF.subjects.index)
        TF.features.loc[:, 'PC3'] = [0] * len(TF.features.index)
        TF.proportion_explained['PC3'] = 0
        TF.eigvals['PC3'] = 0

    # save ordination results
    short_method_name = 'CTF_Biplot'
    long_method_name = 'Compositional Tensor Factorization Biplot'
    # only keep PC -- other tools merge metadata
    keep_PC = [col for col in TF.features.columns if 'PC' in col]
    subj_ordin = OrdinationResults(
        short_method_name,
        long_method_name,
        TF.eigvals,
        samples=TF.subjects[keep_PC].dropna(axis=0),
        features=TF.features[keep_PC].dropna(axis=0),
        proportion_explained=TF.proportion_explained)

    # save distance matrix for each condition
    distances = {}
    state_ordn = {}
    subject_trajectories = {}
    feature_trajectories = {}
    for condition, cond, dist, straj, ftraj in zip(tensor.conditions,
                                                   TF.conditions,
                                                   TF.subject_distances,
                                                   TF.subject_trajectory,
                                                   TF.feature_trajectory):
        # match distances to metadata
        ids = straj.index
        ind_dict = dict((ind, ind_i) for ind_i, ind in enumerate(ids))
        inter = set(ind_dict).intersection(sample_metadata.index)
        indices = sorted([ind_dict[ind] for ind in inter])
        dist = dist[indices, :][:, indices]
        distances[condition] = skbio.stats.distance.DistanceMatrix(
            dist, ids=ids[indices])
        # fix conditions
        if n_components == 2:
            cond['PC3'] = [0] * len(cond.index)
        cond = OrdinationResults(
            short_method_name,
            long_method_name,
            TF.eigvals,
            samples=cond[keep_PC].dropna(axis=0),
            features=TF.features[keep_PC].dropna(axis=0),
            proportion_explained=TF.proportion_explained)
        state_ordn[condition] = cond
        # add the sample metadata before returning output
        # addtionally only keep metadata with trajectory
        # output available.
        pre_merge_cols = list(straj.columns)
        straj = concat(
            [straj.reindex(all_sample_metadata.index),
             all_sample_metadata],
            axis=1, sort=True)
        straj = straj.dropna(subset=pre_merge_cols)
        # ensure index name for q2
        straj.index.name = "#SampleID"
        # save traj. (PC columns are mean-centred first)
        keep_PC_traj = [col for col in straj.columns if 'PC' in col]
        straj[keep_PC_traj] -= straj[keep_PC_traj].mean()
        ftraj[keep_PC_traj] -= ftraj[keep_PC_traj].mean()
        subject_trajectories[condition] = straj
        ftraj.index = ftraj.index.astype(str)
        feature_trajectories[condition] = ftraj
    return (state_ordn, subj_ordin, distances,
            subject_trajectories, feature_trajectories)
def cluster_features_closed_reference(sequences: DNAFASTAFormat,
                                      table: biom.Table,
                                      reference_sequences: DNAFASTAFormat,
                                      perc_identity: float,
                                      strand: str = 'plus',
                                      threads: int = 1
                                      ) -> (biom.Table, DNAFASTAFormat,
                                            DNAFASTAFormat):
    """Cluster features against a reference with ``vsearch --usearch_global``.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Query sequences; ``str(sequences)`` is used as the FASTA path.
    table : biom.Table
        Feature table.  NOTE: features listed as unmatched are removed from
        it in place before it is collapsed.
    reference_sequences : DNAFASTAFormat
        Passed to vsearch as ``--db``.
    perc_identity : float
        Passed to vsearch as ``--id``.
    strand : str
        Passed to vsearch as ``--strand``.
    threads : int
        Passed to vsearch as ``--threads``.

    Returns
    -------
    (biom.Table, DNAFASTAFormat, DNAFASTAFormat)
        The collapsed table, the matched sequences and the unmatched
        sequences.

    Raises
    ------
    VSearchError
        If processing the vsearch ``--uc`` output raises ``ValueError``.
    """
    table_ids = set(table.ids(axis='observation'))
    sequence_ids = {e.metadata['id'] for e in skbio.io.read(
                    str(sequences), constructor=skbio.DNA, format='fasta')}
    _error_on_nonoverlapping_ids(table_ids, sequence_ids)
    matched_seqs, unmatched_seqs = DNAFASTAFormat(), DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc, \
            tempfile.NamedTemporaryFile() as tmp_unmatched_seqs:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = ['vsearch',
               '--usearch_global', fasta_with_sizes.name,
               '--id', str(perc_identity),
               '--db', str(reference_sequences),
               '--uc', out_uc.name,
               '--strand', str(strand),
               '--qmask', 'none',  # ensures no lowercase DNA chars
               '--notmatched', tmp_unmatched_seqs.name,
               '--threads', str(threads)]
        run_command(cmd)
        out_uc.seek(0)

        # It is possible for there to be no unmatched sequences --- if that
        # is the case, skip the following clean-up.
        if os.path.getsize(tmp_unmatched_seqs.name) > 0:
            # We don't really need to sort the matched sequences, this
            # is just to let us use --xsize, which strips the counts from
            # the Feature ID. It would be more ideal if --usearch_global,
            # above let us pass in --xsize, but unfortunately it isn't
            # supported.
            cmd = ['vsearch',
                   '--sortbysize', tmp_unmatched_seqs.name,
                   '--xsize',
                   '--output', str(unmatched_seqs)]
            run_command(cmd)

        try:
            conn = _uc_to_sqlite(out_uc)
            collapse_f = _collapse_f_from_sqlite(conn)
            _fasta_from_sqlite(conn, str(sequences), str(matched_seqs))
        except ValueError:
            raise VSearchError('No matches were identified to '
                               'reference_sequences. This can happen if '
                               'sequences are not homologous to '
                               'reference_sequences, or if sequences are '
                               'not in the same orientation as reference_'
                               'sequences (i.e., if sequences are reverse '
                               'complemented with respect to reference '
                               'sequences). Sequence orientation can be '
                               'adjusted with the strand parameter.')

        # Read the ids eagerly inside the context manager so the file
        # handle is released as soon as the list has been built.
        with open(str(unmatched_seqs)) as unmatched_fh:
            unmatched_ids = [e.metadata['id']
                             for e in skbio.io.read(unmatched_fh,
                                                    constructor=skbio.DNA,
                                                    format='fasta')]
    table.filter(ids_to_keep=unmatched_ids, invert=True, axis='observation',
                 inplace=True)
    table = table.collapse(collapse_f, norm=False, min_group_size=1,
                           axis='observation',
                           include_collapsed_metadata=False)

    return table, matched_seqs, unmatched_seqs
def deposit_biofilms(output_dir, abs_table1, abs_table2,
                     rel_table1, rel_table2, edges, metadata, sample_id):
    """ Writes down tables, edges and metadata into files.

    All files are written under `output_dir` and carry `sample_id` in
    their name.

    Parameters
    ----------
    output_dir : str
        output directory
    abs_table1 : DataFrame-like
        Absolute microbe abundances.  Only ``.values``, ``.columns`` and
        ``.index`` are used; the index becomes the BIOM sample ids and the
        columns the observation ids.
    abs_table2 : DataFrame-like
        Absolute metabolite abundances, handled like `abs_table1`.
    rel_table1 : DataFrame-like
        Relative microbe abundances, handled like `abs_table1`.
    rel_table2 : DataFrame-like
        Relative metabolite abundances, handled like `abs_table1`.
    edges : list
        Edge list for ground truthing; wrapped in a ``pd.DataFrame`` and
        written tab separated.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    sample_id : str
        sample id
    """
    output_abs_microbes = "%s/table.abs.microbes.%s.biom" % (
        output_dir, sample_id)
    output_abs_metabolites = "%s/table.abs.metabolites.%s.biom" % (
        output_dir, sample_id)
    output_rel_microbes = "%s/table.rel.microbes.%s.biom" % (
        output_dir, sample_id)
    output_rel_metabolites = "%s/table.rel.metabolites.%s.biom" % (
        output_dir, sample_id)
    output_md = "%s/metadata.%s.txt" % (
        output_dir, sample_id)
    output_edges = "%s/edges.%s.txt" % (
        output_dir, sample_id)

    # (frame, destination, generated_by tag); relative abundances are
    # written first, then absolute ones, microbes before metabolites.
    biom_outputs = (
        (rel_table1, output_rel_microbes, 'moi1'),
        (rel_table2, output_rel_metabolites, 'moi2'),
        (abs_table1, output_abs_microbes, 'moi1'),
        (abs_table2, output_abs_metabolites, 'moi2'),
    )
    for frame, path, generated_by in biom_outputs:
        # BIOM tables are observation x sample, hence the transpose.
        biom_table = Table(frame.values.T, frame.columns, frame.index)
        with biom_open(path, 'w') as f:
            biom_table.to_hdf5(f, generated_by=generated_by)

    pd.DataFrame(edges).to_csv(output_edges, sep='\t')
    metadata.to_csv(output_md, sep='\t')
def test_filter_biom(self):
    """filter_biom is checked at several integer and fractional thresholds."""
    def as_table(profile):
        # Build a BIOM table from a {sample: {feature: count}} mapping.
        return Table(*map(np.array, prep_table(profile)))

    table = as_table({
        'S1': {'G1': 4, 'G2': 5, 'G3': 8},
        'S2': {'G1': 2, 'G4': 3, 'G5': 7},
        'S3': {'G2': 3, 'G5': 5},
    })

    # (threshold, expected content after filtering)
    cases = [
        (3, {
            'S1': {'G1': 4, 'G2': 5, 'G3': 8},
            'S2': {'G4': 3, 'G5': 7},
            'S3': {'G2': 3, 'G5': 5},
        }),
        (4, {
            'S1': {'G1': 4, 'G2': 5, 'G3': 8},
            'S2': {'G5': 7},
            'S3': {'G5': 5},
        }),
        (6, {
            'S1': {'G3': 8},
            'S2': {'G5': 7},
            'S3': {},
        }),
        (0.25, {
            'S1': {'G2': 5, 'G3': 8},
            'S2': {'G4': 3, 'G5': 7},
            'S3': {'G2': 3, 'G5': 5},
        }),
        (0.5, {
            'S1': {},
            'S2': {'G5': 7},
            'S3': {'G5': 5},
        }),
    ]
    for threshold, expected_profile in cases:
        obs = filter_biom(table, th=threshold)
        exp = as_table(expected_profile)
        self.assertEqual(obs.descriptive_equality(exp),
                         'Tables appear equal')

    # empty BIOM table cannot be directly compared
    obs = filter_biom(table, th=10)
    self.assertTupleEqual(obs.to_dataframe(True).shape, (0, 3))
def deposit_biofilm(table1, table2, metadata, U, V, edges, it, rep,
                    output_dir):
    """ Writes down tables, metadata, ranks, latent variables and edges
    into files.

    Every file is written under `output_dir` and its name carries the
    suffix ``<it>_<letter>``, where ``<letter>`` is the `rep`-th character
    of the lowercase ASCII alphabet.

    Parameters
    ----------
    table1 : DataFrame-like
        Microbe abundances.  Only ``.sum(axis=0)``, ``.loc``, ``.values``,
        ``.columns`` and ``.index`` are used; the index becomes the BIOM
        sample ids and the columns the observation ids.  Columns whose sum
        is not positive are dropped.
    table2 : DataFrame-like
        Metabolite abundances, handled like `table1`.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Left factor of the ranks; saved with ``np.savetxt``.
    V : np.array
        Right factor of the ranks; saved with ``np.savetxt``.
    edges : list
        Edge list; wrapped in a ``pd.DataFrame`` and written tab separated.
    it : int
        iteration number
    rep : int
        repetition number (index into a 26 letter alphabet)
    output_dir : str
        output directory
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    output_microbes = "%s/table_microbes.%d_%s.biom" % (
        output_dir, it, choice[rep])
    output_metabolites = "%s/table_metabolites.%d_%s.biom" % (
        output_dir, it, choice[rep])
    output_md = "%s/metadata.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_U = "%s/U.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_V = "%s/V.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_edges = "%s/edges.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_ranks = "%s/ranks.%d_%s.txt" % (
        output_dir, it, choice[rep])

    # Keep only the columns with a strictly positive total.
    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    # BIOM tables are observation x sample, hence the transpose.
    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = (U @ V)
    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks, index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
    # The previous code sliced an undefined local `B` here and therefore
    # always raised UnboundLocalError; this function receives `edges`
    # instead, which is written like in `deposit_biofilms`.
    # NOTE(review): assumes `edges` is an edge list rather than a matrix
    # with one column per microbe -- confirm against the caller.
    pd.DataFrame(edges).to_csv(output_edges, sep='\t')
def test_biom_add_metacol(self):
    """biom_add_metacol attaches observation metadata columns in place."""
    profile = {
        'S1': {'G1': 4, 'G2': 5, 'G3': 8, 'G4': 0, 'G5': 3},
        'S2': {'G1': 1, 'G2': 8, 'G3': 0, 'G4': 7, 'G5': 4},
        'S3': {'G1': 0, 'G2': 2, 'G3': 3, 'G4': 5, 'G5': 0},
    }
    obs = Table(*map(np.array, prep_table(profile)))

    def observation_metadata():
        # Plain-dict snapshot of the current observation metadata.
        return [dict(entry) for entry in obs.metadata(axis='observation')]

    # a fresh table carries no observation metadata
    self.assertIsNone(obs.metadata(axis='observation'))

    # first column: every feature has a value
    rankdic = {'G1': 'S', 'G2': 'S', 'G3': 'F', 'G4': 'O', 'G5': 'P'}
    biom_add_metacol(obs, rankdic, 'Rank')
    ranks = ['S', 'S', 'F', 'O', 'P']
    exp = [{'Rank': rank} for rank in ranks]
    self.assertListEqual(observation_metadata(), exp)

    # second column: G5 is absent and receives the `missing` value
    namedic = {
        'G1': 'Proteo',
        'G3': 'Actino',
        'G2': 'Firmic',
        'G4': 'Bacter'
    }
    biom_add_metacol(obs, namedic, 'Name', missing='X')
    names = ['Proteo', 'Firmic', 'Actino', 'Bacter', 'X']
    exp = [{'Rank': rank, 'Name': name}
           for rank, name in zip(ranks, names)]
    self.assertListEqual(observation_metadata(), exp)
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write the alpha rarefaction visualization into `output_dir`.

    For every metric one ``<metric>.csv`` file is written, plus one
    ``.jsonp`` file per metric (no metadata) or per metric and metadata
    column.  Finally the index template is rendered and the ``dist``
    assets are copied next to it.

    Parameters
    ----------
    output_dir : str
        Directory receiving all files.
    table : biom.Table
        Feature table to rarefy.
    max_depth, min_depth : int
        Depth range; `max_depth` must exceed `min_depth` and must not
        exceed the largest per-sample total of `table`.
    phylogeny : skbio.TreeNode, optional
        Required when a phylogenetic metric is requested.
    metrics : set, optional
        Metric names.  Defaults to ``observed_otus`` and ``shannon``, plus
        ``faith_pd`` when `phylogeny` is given.
    metadata : qiime2.Metadata, optional
        Sample metadata; restricted to the table's samples and to
        categorical columns that are not entirely missing.
    steps, iterations : int
        Forwarded to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        For an empty `metrics` set, a phylogenetic metric without
        `phylogeny`, inconsistent depths/steps, an empty table, or when no
        metadata is left after filtering.
    """
    # -- resolve and validate the metrics ---------------------------------
    if metrics is None:
        default_metrics = ['observed_otus', 'shannon']
        if phylogeny is not None:
            default_metrics.append('faith_pd')
        metrics = set(default_metrics)
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylogeny is None and phylo_overlap:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    # -- validate the depth range against the table -----------------------
    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if steps > possible_steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_depth > max_frequency:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    # -- prepare the metadata ---------------------------------------------
    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        # Two-level column index with an empty second level for every
        # metadata column.
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(col, '') for col in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    # -- compute and write the per-metric outputs -------------------------
    rarefaction_data = _compute_rarefaction_data(
        table, min_depth, max_depth, steps, iterations, phylogeny, metrics)

    filenames = []
    for metric, metric_df in rarefaction_data.items():
        metric_name = quote(metric)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(metric_df, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = metric_df.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        csv_path = os.path.join(output_dir, filename)
        with open(csv_path, 'w') as fh:
            # Flatten the (depth, iteration) column tuples into labels.
            metric_df.columns = [
                'depth-%d_iter-%d' % (key[0], key[1])
                for key in metric_df.columns.values
            ]
            if metadata is not None:
                metric_df = metric_df.join(metadata.to_dataframe(),
                                           how='left')
            metric_df.to_csv(fh, index_label=['sample-id'])

    # -- render the index page and copy the static assets ------------------
    assets_dir = os.path.join(TEMPLATES, 'alpha_rarefaction_assets')
    index = os.path.join(assets_dir, 'index.html')
    context = {
        'metrics': list(metrics),
        'filenames': [quote(f) for f in filenames],
        'columns': list(columns),
        'steps': steps,
        'filtered_columns': sorted(filtered_columns)
    }
    q2templates.render(index, output_dir, context=context)

    shutil.copytree(os.path.join(assets_dir, 'dist'),
                    os.path.join(output_dir, 'dist'))
def test_collapse_biom(self):
    """collapse_biom is checked for several kinds of feature mappings."""
    def as_table(profile):
        # Build a BIOM table from a {sample: {feature: count}} mapping.
        return Table(*map(np.array, prep_table(profile)))

    def check_equal(obs, expected_profile):
        exp = as_table(expected_profile)
        self.assertEqual(obs.descriptive_equality(exp),
                         'Tables appear equal')

    def check_no_features(obs):
        self.assertTrue(obs.is_empty())
        self.assertListEqual(list(obs.ids('sample')), ['S1', 'S2', 'S3'])
        self.assertListEqual(list(obs.ids('observation')), [])

    table = as_table({
        'S1': {'G1': 4, 'G2': 5, 'G3': 8, 'G4': 0, 'G5': 3, 'G6': 0},
        'S2': {'G1': 1, 'G2': 8, 'G3': 0, 'G4': 7, 'G5': 4, 'G6': 2},
        'S3': {'G1': 0, 'G2': 2, 'G3': 3, 'G4': 5, 'G5': 0, 'G6': 9},
    })

    # one-to-one mapping (e.g., direct translation)
    mapping = {
        'G1': ['H1'],
        'G2': ['H2'],
        'G3': ['H3'],
        'G4': ['H4'],
        'G5': ['H5'],
        'G6': ['H6']
    }
    obs = collapse_biom(table.copy(), mapping)
    check_equal(obs, {
        'S1': {'H1': 4, 'H2': 5, 'H3': 8, 'H4': 0, 'H5': 3, 'H6': 0},
        'S2': {'H1': 1, 'H2': 8, 'H3': 0, 'H4': 7, 'H5': 4, 'H6': 2},
        'S3': {'H1': 0, 'H2': 2, 'H3': 3, 'H4': 5, 'H5': 0, 'H6': 9},
    })

    # some missing, some extra
    mapping = {'G1': ['H1'], 'G2': ['H2'], 'G3': ['H3'], 'G9': ['H9']}
    obs = collapse_biom(table.copy(), mapping)
    check_equal(obs, {
        'S1': {'H1': 4, 'H2': 5, 'H3': 8},
        'S2': {'H1': 1, 'H2': 8, 'H3': 0},
        'S3': {'H1': 0, 'H2': 2, 'H3': 3},
    })

    # wrong mapping (no match)
    mapping = {'H1': ['I1'], 'H2': ['I2'], 'H3': ['I3']}
    obs = collapse_biom(table.copy(), mapping)
    check_no_features(obs)

    # many-to-one mapping (e.g., taxonomic rank up)
    mapping = {
        'G1': ['H1'],
        'G2': ['H1'],
        'G3': ['H2'],
        'G4': ['H2'],
        'G5': ['H2'],
        'G6': ['H3']
    }
    obs = collapse_biom(table.copy(), mapping)
    check_equal(obs, {
        'S1': {'H1': 9, 'H2': 11, 'H3': 0},
        'S2': {'H1': 9, 'H2': 11, 'H3': 2},
        'S3': {'H1': 2, 'H2': 8, 'H3': 9},
    })

    # many-to-many mapping (e.g., genes to pathways)
    mapping = {
        'G1': ['H1'],
        'G2': ['H1', 'H2'],
        'G3': ['H2', 'H3', 'H4'],
        'G4': ['H2', 'H5'],
        'G5': ['H4'],
        'G6': ['H3', 'H5']
    }
    obs = collapse_biom(table.copy(), mapping)
    check_equal(obs, {
        'S1': {'H1': 9, 'H2': 13, 'H3': 8, 'H4': 11, 'H5': 0},
        'S2': {'H1': 9, 'H2': 15, 'H3': 2, 'H4': 4, 'H5': 9},
        'S3': {'H1': 2, 'H2': 10, 'H3': 12, 'H4': 3, 'H5': 14},
    })

    # many-to-many mapping, with normalization
    obs = collapse_biom(table.copy(), mapping, normalize=True)
    check_equal(obs, {
        'S1': {'H1': 6, 'H2': 5, 'H3': 3, 'H4': 6, 'H5': 0},
        'S2': {'H1': 5, 'H2': 8, 'H3': 1, 'H4': 4, 'H5': 4},
        'S3': {'H1': 1, 'H2': 4, 'H3': 6, 'H4': 1, 'H5': 7},
    })

    # nothing left after normalization
    table = as_table({
        'S1': {'G1': 0},
        'S2': {'G1': 1},
        'S3': {'G1': 2},
    })
    mapping = {'G1': ['H1', 'H2', 'H3', 'H4']}
    obs = collapse_biom(table.copy(), mapping, normalize=True)
    check_no_features(obs)
def test_beta_rarefaction_empty_table(self):
    """beta_rarefaction must raise ValueError for a table without ids."""
    empty_table = Table(np.array([[]]), [], [])
    expected_message = 'feature table is empty'

    with self.assertRaisesRegex(ValueError, expected_message):
        beta_rarefaction(
            self.output_dir, empty_table, 'braycurtis', 'upgma', self.md, 1)
def test_import_shogun_biom(self):
    """Check ``import_shogun_biom`` on taxonomy, module, pathway, enzyme
    and empty SHOGUN tables."""
    sample_ids = ['1450', '2563']
    taxa = ['k__Archaea',
            'k__Archaea;p__Crenarchaeota',
            'k__Archaea;p__Crenarchaeota;c__Thermoprotei']

    def three_rows():
        # Fresh count matrix for each three-feature expected table.
        return np.array([[26, 25], [3, 5], [1, 25]])

    def two_rows():
        # Fresh count matrix for each two-feature expected table.
        return np.array([[26, 25], [3, 5]])

    shogun_table = ('#OTU ID\t1450\t2563\n'
                    'k__Archaea\t26\t25\n'
                    'k__Archaea;p__Crenarchaeota\t3\t5\n'
                    'k__Archaea;p__Crenarchaeota;c__Thermoprotei\t1\t25\n')

    # plain import, no observation metadata
    exp_biom = Table(three_rows(), taxa, sample_ids)
    obs_biom = import_shogun_biom(StringIO(shogun_table))
    self.assertEqual(exp_biom, obs_biom)

    # feature names split on ';' become the 'taxonomy' metadata
    tax_metadata = {name: {'taxonomy': name.split(';')} for name in taxa}
    exp_biom_tax = Table(three_rows(), taxa, sample_ids)
    exp_biom_tax.add_metadata(tax_metadata, axis='observation')
    obs_biom_tax = import_shogun_biom(StringIO(shogun_table),
                                      names_to_taxonomy=True)
    self.assertEqual(exp_biom_tax, obs_biom_tax)

    # test modules
    module_table = ('#MODULE ID\t1450\t2563\n'
                    'M00017\t26\t25\n'
                    'M00018\t3\t5\n')
    exp_m_biom = Table(two_rows(), ['M00017', 'M00018'], sample_ids)
    exp_m_biom.add_metadata(self.mod_md, axis='observation')
    obs_m_biom = import_shogun_biom(StringIO(module_table),
                                    annotation_table=StringIO(self.modules),
                                    annotation_type='module')
    self.assertEqual(exp_m_biom, obs_m_biom)

    # test pathways
    pathway_ids = ['1.4.1 With NAD+ or NADP+ as acceptor',
                   '1.4.3 With oxygen as acceptor']
    path_table = ('#PATHWAY ID\t1450\t2563\n'
                  '1.4.1 With NAD+ or NADP+ as acceptor\t26\t25\n'
                  '1.4.3 With oxygen as acceptor\t3\t5\n')
    exp_p_biom = Table(two_rows(), pathway_ids, sample_ids)
    exp_p_biom.add_metadata(self.path_md, axis='observation')
    obs_p_biom = import_shogun_biom(StringIO(path_table),
                                    annotation_table=StringIO(self.pathways),
                                    annotation_type='pathway')
    self.assertEqual(exp_p_biom, obs_p_biom)

    # test enzymes
    enzyme_table = ('#KEGG ID\t1450\t2563\n'
                    'K00001\t26\t25\n'
                    'K00002\t3\t5\n'
                    'K00003\t1\t25\n')
    exp_e_biom = Table(three_rows(), ['K00001', 'K00002', 'K00003'],
                       sample_ids)
    exp_e_biom.add_metadata(self.enz_md, axis='observation')
    obs_e_biom = import_shogun_biom(StringIO(enzyme_table),
                                    annotation_table=StringIO(self.enzymes),
                                    annotation_type='enzyme')
    self.assertEqual(exp_e_biom, obs_e_biom)

    # test empty: header only, so zero observations and two samples
    empty_table = ('#KEGG ID\t1450\t2563\n')
    exp_empty_biom = Table(np.zeros((0, 2)), [], sample_ids)
    obs_empty_biom = import_shogun_biom(
        StringIO(empty_table),
        annotation_table=StringIO(self.enzymes),
        annotation_type='enzyme')
    self.assertEqual(exp_empty_biom, obs_empty_biom)
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the subsampled open-reference OTU picking workflow

        The steps performed by this function are:
          - (optional) Prefilter the input against prefilter_refseqs_fp
            when prefilter_percent_id is given.
          - Step 1: pick reference OTUs against refseqs_fp (skipped when
            both step1_otu_map_fp and step1_failures_fasta_fp are passed).
          - Subsample the step 1 failures with percent_subsample.
          - Step 2: pick OTUs de novo on the subsampled failures and pick
            their representative sequences.
          - Step 3: pick reference OTUs on all step 1 failures using the
            step 2 representative set as the reference set.
          - Step 4 (unless suppress_step4): pick OTUs de novo on the
            step 3 failures.
          - Merge the OTU maps, drop OTUs with fewer than min_otu_size
            sequences, write the final rep set and the new reference set,
            build the OTU table, and optionally assign taxonomy
            (run_assign_tax) and align/build a tree (run_align_and_tree).

        If logger is None a WorkflowLogger is created in output_dir and
        closed at the end of a successful run; a logger passed by the
        caller is left open. Unless suppress_index_page is set, an
        index.html listing the outputs is written to output_dir.

        Raises AssertionError for an unknown OTU picking method and
        ValueError when the prefilter discards every input sequence.

    """
    # both the uclust and the usearch61 flavors are accepted
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_filename = split(input_fp)[1]
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
        index_links.append(
            ('Run summary data',
             log_fp,
             _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id)*100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # From here on the prefiltered file is the workflow input
            input_fp = prefiltered_input_fp
            input_filename = split(input_fp)[1]
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    # The closing double quote of the logged command goes before the
    # trailing newlines, as in the filter_otus_from_otu_map log entry below.
    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')"\n\n' %
                 (abspath(step1_failures_fasta_fp),
                  abspath(step2_input_fasta_fp),
                  percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append(
        [('Pick representative set for subsampled failures',
          step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)

    commands.append([
        ('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    index_links.append(
        ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,
             step3_failures_list_fp, step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that was
        # previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that was
        # previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp

    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' %
                 (abspath(otu_fp),
                  abspath(otu_no_singletons_fp),
                  min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))

    # Every handle below is managed by a ``with`` block so it is closed even
    # if parsing or writing fails part-way through.
    with open(final_repset_fp, 'w') as final_repset_f:
        # write non-singleton otus representative sequences from step1 to
        # the final rep set file
        with open(step1_repset_fasta_fp, 'U') as step1_repset_f:
            for otu_id, seq in parse_fasta(step1_repset_f):
                if otu_id.split()[0] in otus_to_keep:
                    final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
        logger.write('# Write non-singleton otus representative sequences ' +
                     'from step1 to the final rep set file: %s\n\n' %
                     final_repset_fp)

        # copy the full input refseqs file to the new refseqs_fp
        copy(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            new_refseqs_f.write('\n')
            logger.write(
                '# Copy the full input refseqs file to the new refseq file\n' +
                'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))

            # iterate over all representative sequences from step2 and step4
            # and write those corresponding to non-singleton otus to the
            # final representative set file and the new reference sequences
            # file.
            denovo_repset_fps = [step2_repset_fasta_fp]
            if not suppress_step4:
                denovo_repset_fps.append(step4_repset_fasta_fp)
            for denovo_repset_fp in denovo_repset_fps:
                with open(denovo_repset_fp, 'U') as denovo_repset_f:
                    for otu_id, seq in parse_fasta(denovo_repset_f):
                        if otu_id.split()[0] in otus_to_keep:
                            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    logger.write('# Write non-singleton otus representative sequences from ' +
                 'step 2 and step 4 to the final representative set and the new reference' +
                 ' set (%s and %s respectively)\n\n' %
                 (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments'
             % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            # The failures file stays open until filtering has finished so
            # the ids are fully consumed before the handle is closed.
            with open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    table,
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
metavar = "filename", help = "[REQUIRED] outfile name", required = True) options = parser.parse_args() ############################# # Import json formatted OTU # ############################# import json jsondata = open(options.biominputfile) biom = json.load(jsondata) jsondata.close() from biom import Table table = Table.from_json(biom) print("") print("Original OTU Table (without taxonomy)") print("-------------------------------------") print("") print(table) print("") min_samplesize = int(min(table.sum(axis='sample'))) print("Subsampling to the smallest sample size: " + str(min_samplesize)) # Subsample table_ss = table.subsample(min_samplesize) # Output
def test_filter_table(self):
    """Check ``filter_table`` on tuple-form tables and on a BIOM table."""
    table = prep_table({
        'S1': {'G1': 4, 'G2': 5, 'G3': 8},
        'S2': {'G1': 2, 'G4': 3, 'G5': 7},
        'S3': {'G2': 3, 'G5': 5},
    })

    # (th, expected result) pairs, asserted in this order
    cases = [
        # filter by count
        (3, ([[4, 0, 0], [5, 0, 3], [8, 0, 0], [0, 3, 0], [0, 7, 5]],
             ['G1', 'G2', 'G3', 'G4', 'G5'],
             ['S1', 'S2', 'S3'],
             [{}] * 5)),
        (4, ([[4, 0, 0], [5, 0, 0], [8, 0, 0], [0, 7, 5]],
             ['G1', 'G2', 'G3', 'G5'],
             ['S1', 'S2', 'S3'],
             [{}] * 4)),
        (6, ([[8, 0, 0], [0, 7, 0]],
             ['G3', 'G5'],
             ['S1', 'S2', 'S3'],
             [{}] * 2)),
        # filter by threshold
        (0.25, ([[5, 0, 3], [8, 0, 0], [0, 3, 0], [0, 7, 5]],
                ['G2', 'G3', 'G4', 'G5'],
                ['S1', 'S2', 'S3'],
                [{}] * 4)),
        (0.5, ([[0, 7, 5]],
               ['G5'],
               ['S1', 'S2', 'S3'],
               [{}])),
        # filter out everything
        (10, ([], [], ['S1', 'S2', 'S3'], [])),
    ]
    for threshold, expected in cases:
        observed = filter_table(table, th=threshold)
        self.assertTupleEqual(observed, expected)

    # filter an empty table
    emptied = ([], [], ['S1', 'S2', 'S3'], [])
    observed = filter_table(emptied, th=1)
    self.assertTupleEqual(observed, ([], [], ['S1', 'S2', 'S3'], []))

    # filter a BIOM table
    biom_table = Table(*map(np.array, table))
    observed = filter_table(biom_table, th=3)
    expected = Table(*map(np.array, prep_table({
        'S1': {'G1': 4, 'G2': 5, 'G3': 8},
        'S2': {'G4': 3, 'G5': 7},
        'S3': {'G2': 3, 'G5': 5},
    })))
    self.assertEqual(observed.descriptive_equality(expected),
                     'Tables appear equal')
def setUpClass(cls):
    """Build the fixtures shared by the tests of this class.

    Sets ``cls.table1`` (a DataFrame parsed from tab-separated rows),
    ``cls.table2`` (a BIOM table loaded from inline JSON and also written
    to ``table2.biom`` inside a fresh temporary directory, ``cls.tmpdir``),
    ``cls.dm`` / ``cls.s_md`` (output of ``make_distance_matrix`` on that
    file) and ``cls.dist`` (output of ``per_method_distance``).
    """
    _table1 = [
        'a\ta\t1\t0.0\t0.5\t0.1',
        'a\ta\t1\t1.0\t1.0\t0.2',
        'a\ta\t1\t2.0\t1.5\t0.2',
        'a\tb\t1\t3.0\t2.0\t8.',
        'a\tb\t1\t4.0\t2.5\t9.',
        'a\tb\t1\t5.0\t3.0\t10.',
        'b\ta\t1\t0.0\t2.0\t0.1',
        'b\ta\t1\t1.0\t3.0\t0.3',
        'b\ta\t1\t2.0\t4.0\t0.1',
        'b\tb\t1\t3.0\t5.0\t9.',
        'b\tb\t1\t4.0\t6.0\t11.',
        'b\tb\t1\t5.0\t7.0\t10.'
    ]
    cls.table1 = pd.DataFrame(
        [(n.split('\t')) for n in _table1],
        columns=['group', 'dataset', 'level', 'x', 'y', 'c'],
        dtype=float)

    # The JSON document is assembled from adjacent literals. The piece
    # holding the JSON-escaped slashes ("\/") is a raw string: in a normal
    # literal "\/" is an invalid escape sequence, which newer Python
    # versions warn about. The resulting text is unchanged.
    table2_json = (
        '{"id": "None", '
        '"format": "Biological Observation Matrix 1.0.0", '
        r'"format_url": "http:\/\/biom-format.org", '
        '"type": "OTU table", '
        '"generated_by": "greg", '
        '"date": "2013-08-22T13:10:23.907145", '
        '"matrix_type": "sparse", '
        '"matrix_element_type": "float", '
        '"shape": [ 3, 4 ], '
        '"data": [ [ 0, 0, 1 ], [ 0, 1, 2 ], [ 0, 2, 3 ], [ 0, 3, 4 ], '
        '[ 1, 0, 2 ], [ 1, 1, 0 ], [ 1, 2, 7 ], [ 1, 3, 8 ], '
        '[ 2, 0, 9 ], [ 2, 1, 10 ], [ 2, 2, 11 ], [ 2, 3, 12 ] ], '
        '"rows": [ '
        '{ "id": "o1", "metadata": { "domain": "Archaea" } }, '
        '{ "id": "o2", "metadata": { "domain": "Bacteria" } }, '
        '{ "id": "o3", "metadata": { "domain": "Bacteria" } } ], '
        '"columns": [ '
        '{ "id": "s1", "metadata": '
        '{ "method": "A", "Sample": "A", "parameters": "A" } }, '
        '{ "id": "s2", "metadata": '
        '{ "method": "A", "Sample": "A", "parameters": "B" } }, '
        '{ "id": "s3", "metadata": '
        '{ "method": "A", "Sample": "A", "parameters": "C" } }, '
        '{ "id": "s4", "metadata": '
        '{ "method": "B", "Sample": "A", "parameters": "D" } } ] }'
    )

    # table 2
    # OTU ID  s1    s2    s3    s4
    # o1      1.0   2.0   3.0   4.0
    # o2      2.0   0.0   7.0   8.0
    # o3      9.0   10.0  11.0  12.0
    cls.tmpdir = mkdtemp()
    cls.table2 = Table.from_json(json.loads(table2_json))
    table2_fp = join(cls.tmpdir, 'table2.biom')
    write_biom_table(cls.table2, 'hdf5', table2_fp)
    cls.dm, cls.s_md = make_distance_matrix(table2_fp,
                                            method="braycurtis")
    cls.dist = per_method_distance(cls.dm, cls.s_md, group_by='method',
                                   standard='B', metric='distance',
                                   sample='Sample')
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    The table is rebuilt by merging, per source artifact, the BIOM table
    restricted to the samples selected for the analysis. It is then written
    to disk and registered in the database as a new artifact linked to the
    analysis.

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information; the keys 'analysis_id'
        and 'timestamp' are read
    biom_data : dict
        Dictionary with the biom file information; the key 'data_type_id'
        is read
    rarefied_table : biom.Table
        The rarefied BIOM table, used only to decide whether the sample ids
        have to be renamed

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied table is not stored anywhere, so it has to be built.
    # That requires (1) the samples and the artifacts they come from, which
    # are in the database, and (2) whether the samples were renamed, which
    # can only be inferred from the existing rarefied BIOM table.
    with TRN:
        # Samples included in the analysis, grouped by artifact id. The
        # analysis holds one BIOM table per data type and analysis_sample
        # does not distinguish data types, hence the join on artifact.
        sql = ("SELECT artifact_id, array_agg(sample_id) "
               "FROM qiita.analysis_sample "
               "JOIN qiita.artifact USING (artifact_id) "
               "WHERE analysis_id = %s AND data_type_id = %s "
               "GROUP BY artifact_id")
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Start from an empty table and merge every per-artifact table in
        merged_table = Table([], [], [])
        prefixed_ids = {}
        for a_id, samples in samples_by_artifact:
            # Locate the BIOM file of the artifact; if several filepaths
            # have type 'biom' the last one wins. No check is made for the
            # case where none is found.
            biom_fps = [fp for _, fp, fp_type in Artifact(a_id).filepaths
                        if fp_type == 'biom']
            biom_fp = biom_fps[-1] if biom_fps else None

            artifact_table = load_table(biom_fp)
            wanted = set(samples).intersection(artifact_table.ids())
            artifact_table.filter(wanted, axis='sample', inplace=True)

            # Only merge tables that still hold data on both axes
            n_rows, n_cols = artifact_table.shape[0], artifact_table.shape[1]
            if n_rows != 0 and n_cols != 0:
                merged_table = merged_table.merge(artifact_table)
                for sid in artifact_table.ids():
                    prefixed_ids[sid] = "%d.%s" % (a_id, sid)

        # If the rarefied table has sample ids the merged table lacks, the
        # ids are renamed to their artifact-prefixed form
        if not set(merged_table.ids()).issuperset(rarefied_table.ids()):
            merged_table.update_ids(prefixed_ids, 'sample', True, True)

        sql = ("INSERT INTO qiita.artifact (generated_timestamp, "
               "data_type_id, visibility_id, artifact_type_id, "
               "submitted_to_vamps) "
               "VALUES (%s, %s, %s, %s, %s) "
               "RETURNING artifact_id")
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = ("INSERT INTO qiita.analysis_artifact "
               "(analysis_id, artifact_id) "
               "VALUES (%s, %s)")
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Write the table under the BIOM mountpoint, in a directory named
        # after the artifact id
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            merged_table.to_hdf5(f, "Generated by Qiita")

        # Link the artifact with its file
        sql = ("INSERT INTO qiita.filepath (filepath, filepath_type_id, "
               "checksum, checksum_algorithm_id, data_directory_id) "
               "VALUES (%s, %s, %s, %s, %s) "
               "RETURNING filepath_id")
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()

        sql = ("INSERT INTO qiita.artifact_filepath "
               "(artifact_id, filepath_id) "
               "VALUES (%s, %s)")
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
def ft1_factory():
    """Return a ``FeatureTable[Frequency]`` artifact holding a small table
    with two features (O1, O2) and three samples (S1-S3)."""
    counts = np.array([[0, 1, 3], [1, 1, 2]])
    feature_table = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    return Artifact.import_data('FeatureTable[Frequency]', feature_table)
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    """Render the beta rarefaction visualization into ``output_dir``.

    Distance matrices are computed over repeated rarefactions of ``table``,
    then used to write a correlation heatmap (``heatmap.svg`` plus a TSV of
    the correlations), a sample-clustering tree, an Emperor plot, and the
    HTML pages that tie them together.

    Raises ``ValueError`` if ``table`` is empty, or if ``metric`` is a
    phylogenetic metric and ``phylogeny`` is ``None``.
    """
    with qiime2.sdk.Context() as scope:
        if table.is_empty():
            raise ValueError("Input feature table is empty.")

        # Restrict the metadata to the sample IDs of the feature table.
        # This also ensures every feature table sample ID is present in
        # the metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        table = qiime2.Artifact.import_data('FeatureTable[Frequency]',
                                            table)

        phylo_metrics = METRICS['PHYLO']['IMPL'] | METRICS['PHYLO']['UNIMPL']
        if metric not in phylo_metrics:
            beta_func = scope.ctx.get_action('diversity', 'beta')
        else:
            if phylogeny is None:
                raise ValueError("A phylogenetic metric (%s) was requested, "
                                 "but a phylogenetic tree was not provided. "
                                 "Phylogeny must be provided when using a "
                                 "phylogenetic diversity metric." % metric)
            phylogeny = qiime2.Artifact.import_data('Phylogeny[Rooted]',
                                                    phylogeny)
            api_method = scope.ctx.get_action('diversity',
                                              'beta_phylogenetic')
            # Bind the tree so both kinds of metric are called the same way
            beta_func = functools.partial(api_method, phylogeny=phylogeny)

        rare_func = scope.ctx.get_action('feature-table', 'rarefy')

        distance_matrices = _get_multiple_rarefaction(
            beta_func, rare_func, metric, iterations, table, sampling_depth)

    # The first matrix is the primary result; the rest serve as support
    primary, support = distance_matrices[0], distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fp = os.path.join(output_dir, 'heatmap.svg')
    heatmap_fig.savefig(heatmap_fp)
    similarity_fp = os.path.join(output_dir,
                                 'rarefaction-iteration-correlation.tsv')
    similarity_df.to_csv(similarity_fp, sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree_fp = os.path.join(output_dir,
                           'sample-clustering-%s.tre' % clustering_method)
    tree.write(tree_fp)

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    pages = ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']
    templates = [os.path.join(TEMPLATES, 'beta_rarefaction_assets', page)
                 for page in pages]

    tabs = [
        {'url': 'emperor.html', 'title': 'PCoA'},
        {'url': 'heatmap.html', 'title': 'Heatmap'},
        {'url': 'tree.html', 'title': 'Clustering'},
    ]
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': tabs,
    }

    q2templates.render(templates, output_dir, context=context)
def qarcoal(
    table: biom.Table,
    taxonomy: pd.DataFrame,
    num_string: str,
    denom_string: str,
    samples_to_use: Metadata = None,
    allow_shared_features: bool = False,
) -> pd.DataFrame:
    """Compute per-sample log-ratios of two taxonomy-selected feature groups.

    Parameters
    ----------
    table : biom.Table
        Feature table (features x samples) used to calculate the log-ratios.
    taxonomy : pd.DataFrame
        Taxonomy information; should have a Taxon column in which features
        are searched.
    num_string : str
        String searched for in the taxonomy to select numerator features.
    denom_string : str
        String searched for in the taxonomy to select denominator features.
    samples_to_use : Metadata, optional
        Q2 Metadata with the samples to use. If provided, the feature table
        is filtered to the samples present in it.
    allow_shared_features : bool, optional
        If False (default), an error is raised when a feature is selected for
        both numerator and denominator. If True, shared features are
        accepted without any check.

    Returns
    -------
    pd.DataFrame
        One row per sample, indexed by ``Sample-ID``, with the columns
        ``Num_Sum``, ``Denom_Sum`` and ``log_ratio`` (natural log of
        ``Num_Sum / Denom_Sum``), e.g.::

            Sample-ID    Num_Sum    Denom_Sum   log_ratio
                   S1          7           15   -0.762140

    Raises
    ------
    ValueError
        If the feature table contains negative counts, or if features are
        shared between numerator and denominator while
        ``allow_shared_features`` is False.
    """
    # The biom table is features x samples.
    selected_table = table
    if samples_to_use is not None:
        wanted_samples = set(samples_to_use.to_dataframe().index)
        selected_table = table.filter(
            wanted_samples, axis="sample", inplace=False)
    feat_table = selected_table.to_dataframe()

    # Negative counts make the ratios meaningless, so refuse them outright.
    has_negative_counts = feat_table.lt(0).any().any()
    if has_negative_counts:
        raise ValueError("Feature table has negative counts!")

    tax_num_df, tax_denom_df = filter_and_join_taxonomy(
        feat_table, taxonomy, num_string, denom_string)

    # Shared features are only looked for when they are disallowed; when
    # allowed, the check is skipped at the user's risk.
    if not allow_shared_features:
        overlap = set(tax_num_df.index).intersection(tax_denom_df.index)
        if overlap:
            raise ValueError("Shared features between num and denom!")

    per_sample_sums = [tax_num_df.sum(axis=0), tax_denom_df.sum(axis=0)]
    comparison_df = pd.DataFrame.from_records(
        per_sample_sums, index=["Num_Sum", "Denom_Sum"]).T

    def _row_log_ratio(row):
        return np.log(row.Num_Sum / row.Denom_Sum)

    comparison_df["log_ratio"] = comparison_df.apply(_row_log_ratio, axis=1)
    comparison_df.index.name = "Sample-ID"
    return comparison_df
def full_pipeline(table: biom.Table,
                  seq: pd.Series,
                  threads: int = 1,
                  hsp_method: str = "mp",
                  placement_tool: str = "epa-ng",
                  min_align: float = 0.8,
                  max_nsti: float = 2.0,
                  edge_exponent: float = 0.5,
                  skip_minpath: bool = False,
                  no_gap_fill: bool = False,
                  skip_norm: bool = False,
                  highly_verbose: bool = False) -> (biom.Table,
                                                    biom.Table,
                                                    biom.Table):
    """Run ``picrust2.pipeline.full_pipeline`` on a table and its sequences.

    The feature table and the sequences are written to a temporary directory
    (as ``intable.biom`` and ``seqs.fna``), the PICRUSt2 pipeline is run on
    those files with the "EC,KO" traits, and the unstratified output tables
    are loaded back before the temporary directory is removed.

    Parameters
    ----------
    table : biom.Table
        Feature table, written out as the pipeline's ``input_table``.
    seq : pd.Series
        Sequences; each (index, value) pair becomes one FASTA record.
    threads : int, optional
        Passed to the pipeline as ``processes``.
    hsp_method, placement_tool, min_align, max_nsti, edge_exponent,
    skip_minpath, no_gap_fill, skip_norm : optional
        Forwarded unchanged to the pipeline under the same names.
    highly_verbose : bool, optional
        Passed to the pipeline as ``verbose``.

    Returns
    -------
    tuple of (biom.Table, biom.Table, biom.Table)
        The KO table, the EC table and the pathway abundance table, in that
        order.
    """
    # Write out BIOM table and FASTA to be used in pipeline. Everything,
    # including loading the results, has to happen while the temporary
    # directory still exists.
    with TemporaryDirectory() as temp_dir:

        # Write out BIOM table:
        biom_infile = path.join(temp_dir, "intable.biom")
        with biom.util.biom_open(biom_infile, 'w') as out_biom:
            table.to_hdf5(h5grp=out_biom,
                          generated_by="PICRUSt2 QIIME 2 Plugin")

        # Write out Pandas series as FASTA:
        seq_outfile = path.join(temp_dir, "seqs.fna")

        with open(seq_outfile, "w") as outfile_fh:
            # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is
            # the equivalent that exists in every pandas version.
            for seqname, sequence in seq.items():
                print(">" + str(seqname) + "\n" + str(sequence),
                      file=outfile_fh)

        picrust2_out = path.join(temp_dir, "picrust2_out")

        func_outputs, pathway_outputs = picrust2.pipeline.full_pipeline(
            study_fasta=seq_outfile,
            input_table=biom_infile,
            output_folder=picrust2_out,
            processes=threads,
            placement_tool=placement_tool,
            ref_dir=default_ref_dir,
            in_traits="EC,KO",
            custom_trait_tables=None,
            marker_gene_table=default_tables["16S"],
            pathway_map=default_pathway_map,
            rxn_func="EC",
            no_pathways=False,
            regroup_map=default_regroup_map,
            no_regroup=False,
            stratified=False,
            max_nsti=max_nsti,
            min_reads=1,
            min_samples=1,
            hsp_method=hsp_method,
            edge_exponent=edge_exponent,
            min_align=min_align,
            skip_nsti=False,
            skip_minpath=skip_minpath,
            no_gap_fill=no_gap_fill,
            coverage=False,
            per_sequence_contrib=False,
            wide_table=False,
            skip_norm=skip_norm,
            remove_intermediate=False,
            verbose=highly_verbose)

        # Convert the returned unstratified tables to BIOM tables.
        # Note that the 0-index in the func table returned objects corresponds
        # to the path to the unstratified table.
        ko_biom = biom.load_table(func_outputs["KO"][0])
        ec_biom = biom.load_table(func_outputs["EC"][0])
        pathabun_biom = biom.load_table(pathway_outputs["unstrat_abun"])

        return ko_biom, ec_biom, pathabun_biom
def ft3_factory():
    """Return a FeatureTable[Frequency] artifact for a fixed 2 x 3 table.

    The table has the observations O1 and O4 and the samples S7, S8 and S9.
    """
    counts = np.array([[0, 4, 9],
                       [4, 4, 8]])
    observation_ids = ['O1', 'O4']
    sample_ids = ['S7', 'S8', 'S9']
    feature_table = Table(counts, observation_ids, sample_ids)
    return Artifact.import_data('FeatureTable[Frequency]', feature_table)
def convert_precalc_to_biom(precalc_in, ids_to_load=None, transpose=True,
                            md_prefix='metadata_'):
    """Load PICRUSt's tab-delimited precalculated file into a BIOM table.

    The first line is the header: an identifier column, the trait columns,
    and then the metadata columns (those whose name starts with
    ``md_prefix``; they are assumed to be the trailing columns). Every
    following line is either a metadata row (its identifier starts with
    ``md_prefix``), which contributes one metadata entry per trait, or a data
    row holding the trait values of one OTU.

    Parameters
    ----------
    precalc_in : str or file-like
        The file contents as a string, or an object providing ``readline()``
        and iteration over lines.
    ids_to_load : collection of str, optional
        Identifiers of the data rows to load. ``None`` or an empty collection
        loads every data row.
    transpose : bool, optional
        If True (default) the table is built as traits x OTUs, otherwise as
        OTUs x traits.
    md_prefix : str, optional
        Prefix that marks metadata columns and metadata rows.

    Returns
    -------
    Table
        A table of type 'Gene table' carrying the trait and OTU metadata.

    Raises
    ------
    ValueError
        If no data row was loaded, or if some of ``ids_to_load`` were not
        found in the file.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` covers it.
    try:
        text_types = (str, unicode)  # noqa: F821
    except NameError:
        text_types = (str,)

    # If given a string, convert it to a file handle.
    if isinstance(precalc_in, text_types):
        try:
            from StringIO import StringIO as string_buffer  # Python 2
        except ImportError:
            from io import StringIO as string_buffer  # Python 3
        fh = string_buffer(precalc_in)
    else:
        fh = precalc_in

    # The first line has to be the header.
    header_ids = fh.readline().strip().split('\t')

    # Map each metadata name (prefix removed) to its column position.
    col_meta_locs = {}
    for idx, col_id in enumerate(header_ids):
        if col_id.startswith(md_prefix):
            col_meta_locs[col_id[len(md_prefix):]] = idx

    end_of_data = len(header_ids) - len(col_meta_locs)
    trait_ids = header_ids[1:end_of_data]

    col_meta = []
    row_meta = [{} for _ in trait_ids]

    if ids_to_load is not None and len(ids_to_load) > 0:
        # Private copy: matched ids are removed from it below, so whatever
        # is left at the end is the set of ids that were never found.
        ids_to_load = set(ids_to_load)
        load_all_ids = False
    else:
        load_all_ids = True

    matching = []
    otu_ids = []
    for line in fh:
        fields = line.strip().split('\t')
        row_id = fields[0]
        if row_id.startswith(md_prefix):
            # Metadata row: determine the type of metadata (this may not be
            # perfect) and store one value per trait.
            metadata_type = determine_metadata_type(line)
            meta_name = row_id[len(md_prefix):]
            for idx, trait_meta in enumerate(row_meta):
                trait_meta[meta_name] = parse_metadata_field(
                    fields[idx + 1], metadata_type)
        elif load_all_ids or row_id in ids_to_load:
            otu_ids.append(row_id)
            # A real list (not a lazy ``map`` object) so that ``asarray``
            # builds a numeric 2-D array on Python 3 as well.
            matching.append([float(value)
                             for value in fields[1:end_of_data]])

            # Add the metadata of this OTU.
            col_meta.append({meta_name: fields[loc]
                             for meta_name, loc in col_meta_locs.items()})

            if not load_all_ids:
                ids_to_load.remove(row_id)

    if not otu_ids:
        # ``ids_to_load`` may still be None here (everything was requested
        # but the file holds no data rows); report an empty example list
        # instead of failing with a TypeError.
        example_ids = list(ids_to_load)[:5] if ids_to_load else []
        message = ("No OTUs match identifiers in precalculated file. "
                   "PICRUSt requires an OTU table reference/closed picked "
                   "against GreenGenes.\nExample of the first 5 OTU ids "
                   "from your table: {0}")
        raise ValueError(message.format(', '.join(example_ids)))

    if ids_to_load:
        message = ("One or more OTU ids were not found in the precalculated "
                   "file!\nAre you using the correct --gg_version?\n"
                   "Example of (the {0}) unknown OTU ids: {1}")
        raise ValueError(message.format(len(ids_to_load),
                                        ', '.join(list(ids_to_load)[:5])))

    # Note that we transpose the data before making the biom object.
    matching = asarray(matching)
    if transpose:
        return Table(matching.T, trait_ids, otu_ids, row_meta, col_meta,
                     type='Gene table')
    else:
        return Table(matching, otu_ids, trait_ids, col_meta, row_meta,
                     type='Gene table')
def sequence_variants_from_samples(samples: biom.Table) -> DNAIterator:
    """Turn the observation IDs of ``samples`` into DNA sequences.

    Every observation ID is used both as the sequence and as the ``id``
    entry of that sequence's metadata. The IDs are read immediately; the
    ``DNA`` objects are only created as the returned iterator is consumed.
    """
    observation_ids = samples.ids(axis='observation')

    def _as_dna():
        for observation_id in observation_ids:
            yield DNA(observation_id, metadata={'id': observation_id})

    return DNAIterator(_as_dna())
def relative_frequency(table: biom.Table,
                       axis: str = 'sample') -> biom.Table:
    """Convert a feature table from frequencies to relative frequencies.

    The conversion is done by ``table.norm`` along ``axis`` and happens in
    place: the object passed in is modified and is also the object returned.
    """
    table.norm(inplace=True, axis=axis)
    return table
def _table_to_dataframe(table: biom.Table) -> pd.DataFrame:
    """Return ``table`` as a dense DataFrame of samples x observations.

    The table's matrix is converted to a dense array and transposed, so rows
    are labelled with the sample IDs and columns with the observation IDs.
    """
    return pd.DataFrame(
        table.matrix_data.toarray().T,
        index=table.ids(axis='sample'),
        columns=table.ids(axis='observation'),
    )
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    The BIOM tables of the artifacts that contributed samples to the analysis
    are filtered to those samples and merged into a single table. That table
    is written to disk as ``biom_table.biom`` and registered in the database
    as a new artifact linked to the analysis and to the file.

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information. The keys 'analysis_id' and
        'timestamp' are read.
    biom_data : dict
        Dictionary with the biom file information. The key 'data_type_id' is
        read.
    rarefied_table : biom.Table
        The rarefied BIOM table. Only its ids are used, to decide whether
        the sample ids of the new table have to be renamed.

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non rarefied biom artifact is the initial biom table of the analysis.
    # This table does not currently exist anywhere, so we need to actually
    # create the BIOM file. To create this BIOM file we need: (1) the samples
    # and artifacts they come from and (2) whether the samples were
    # renamed or not. (1) is in the database, but we need to infer (2) from
    # the existing rarefied BIOM table.
    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id) FROM qiita.analysis_sample JOIN qiita.artifact USING (artifact_id) WHERE analysis_id = %s AND data_type_id = %s GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        # Maps each original sample id to "<artifact id>.<sample id>"; only
        # applied further down if the ids turn out to need renaming
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure that the biom table exists, so there is
            # no need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            # Keep only the samples that are actually present in this table
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update(
                    {sid: "%d.%s" % (a_id, sid) for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table: this
        # is the case when the merged table does not contain every id of
        # the rarefied table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            # NOTE(review): the two positional True values are presumably
            # biom's `strict` and `inplace` arguments - confirm against the
            # Table.update_ids signature
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact (generated_timestamp, data_type_id, visibility_id, artifact_type_id, submitted_to_vamps) VALUES (%s, %s, %s, %s, %s) RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(
            sql,
            [analysis['timestamp'], biom_data['data_type_id'], 4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact (analysis_id, artifact_id) VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Link the artifact with its file
        # The first 'BIOM' mountpoint entry provides the data directory id
        # stored with the filepath and the directory name used on disk
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id, data_directory_id) VALUES (%s, %s, %s, %s, %s) RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        # Only the file name is stored, not the full path
        TRN.add(sql, [
            basename(new_table_fp), 7, compute_checksum(new_table_fp), 1,
            dd_id
        ])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath (artifact_id, filepath_id) VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
def ft2_factory():
    """Return a FeatureTable[Frequency] artifact for a fixed 2 x 3 table.

    The table has the observations O1 and O3 and the samples S4, S5 and S6.
    """
    counts = np.array([[0, 2, 6],
                       [2, 2, 4]])
    observation_ids = ['O1', 'O3']
    sample_ids = ['S4', 'S5', 'S6']
    feature_table = Table(counts, observation_ids, sample_ids)
    return Artifact.import_data('FeatureTable[Frequency]', feature_table)