def setUp(self):
    super().setUp()
    self.table = biom.Table(np.array([[0, 1, 2], [2, 4, 6], [3, 0, 1]]),
                            ['feature-1', 'feature-2', 'feature-3'],
                            ['sample-1', 'sample-2', 'sample-3'])
    self.taxonomy_df = pd.DataFrame(
        [['feature-1', 'a; b; c', 0.123],
         ['feature-2', 'a; b; c; d; e', 0.345],
         ['feature-3', 'a; f; g; h', 0.678]],
        columns=['Feature ID', 'Taxon', 'Confidence'])
    self.taxonomy_df.set_index('Feature ID', inplace=True)
    self.table2 = biom.Table(np.array([[0, 0.1, 0.2], [0.2, 0.4, 0.6],
                                       [0.3, 0, 0.1]]),
                             ['feature-1', 'feature-2', 'feature-3'],
                             ['sample-1', 'sample-2', 'sample-3'])
    self.table_artifact = Artifact.import_data(
        "FeatureTable[Frequency]", self.table
    )
    self.taxonomy_artifact = Artifact.import_data(
        "FeatureData[Taxonomy]", self.taxonomy_df,
    )
    self.table2_artifact = Artifact.import_data(
        "FeatureTable[Frequency]", self.table2
    )
    self.table_qza = self.create_tempfile(suffix='.qza').name
    self.table_artifact.save(self.table_qza)
    self.taxonomy_qza = self.create_tempfile(suffix='.qza').name
    self.taxonomy_artifact.save(self.taxonomy_qza)
    self.table2_qza = self.create_tempfile(suffix='.qza').name
    self.table2_artifact.save(self.table2_qza)
    self.table_biom = self.create_tempfile(suffix='.biom').name
    with biom_open(self.table_biom, 'w') as f:
        self.table.to_hdf5(f, 'test-table')
    self.resources = ResourceManager()
    self.config = {'table_resources': {
        'simple-table': {
            'table': self.table_qza,
            'q2-type': FeatureTable[Frequency]
        },
        'second-simple-table': {
            'table': self.table2_qza,
        },
        'table-with-taxonomy': {
            'table': self.table_qza,
            'feature-data-taxonomy': self.taxonomy_qza,
        },
        'table-with-variance': {
            'table': self.table_qza,
            'variances': self.table2_qza,
        },
        'table-with-taxonomy-and-variance': {
            'table': self.table_qza,
            'feature-data-taxonomy': self.taxonomy_qza,
            'variances': self.table2_qza,
        },
        'table-from-biom': {
            'table': self.table_biom,
            'feature-data-taxonomy': self.taxonomy_qza,
            'variances': self.table_biom,
            'table-format': 'biom',
        },
        'table-with-cached-taxonomy': {
            'table': self.table_qza,
            'feature-data-taxonomy': self.taxonomy_qza,
            'cache-taxonomy': True,
        },
    }}
def _dataframe_to_table(df):
    if df.index.inferred_type != 'string':
        raise TypeError("Please provide a DataFrame with a string-based Index")
    return biom.Table(df.T.values, observation_ids=df.columns,
                      sample_ids=df.index)
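# A small usage sketch for _dataframe_to_table (illustrative only, not part
# of the original module): biom.Table is observation-major, which is why the
# helper transposes a samples-by-features DataFrame before construction.
import numpy as np
import pandas as pd

_df = pd.DataFrame(np.array([[0, 2, 3], [1, 4, 0]]),
                   index=pd.Index(['sample-1', 'sample-2']),
                   columns=['feature-1', 'feature-2', 'feature-3'])
_tbl = _dataframe_to_table(_df)
assert list(_tbl.ids('sample')) == ['sample-1', 'sample-2']
assert list(_tbl.ids('observation')) == ['feature-1', 'feature-2',
                                         'feature-3']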
# Main code
#######################################################################
matplotlib.rc('text', usetex=True)

# Setup
np.random.seed(0)
zheng = lambda x, y: zhengr(x, y)
res_folder = '../results/correlations/'

params = getParams(sts=[11], interactions=['commensal'])
params = init_data(params, num_samps=100)
corr_mat = build_correlation_matrix(params)
table = build_contingency_table(params)

# save the table, closing the file handles explicitly
bT = biom.Table(table, range(table.shape[0]), range(table.shape[1]))
biomname = '../data/tables_6_3_2015/bioms/table_1.biom'
txtname = '../data/tables_6_3_2015/txts/table_1.txt'
with open(biomname, 'w') as f:
    f.write(bT.to_json('Jamie'))
with open(txtname, 'w') as f:
    f.write(bT.to_tsv())

#######################################################################
#                    Absolute Ecological Relations                    #
#######################################################################
pearson_corr_mat = abs(np.corrcoef(table.T))
spearman_corr_mat = abs(spearmanr(table)[0])
zheng_corr_mat = get_corr_matrix(table, zheng)
# Can insert sparcc_corr_mat right here. Just need to
# 1. read the text file of correlations into a pandas DataFrame
# 2. extract the matrix via pandas' as_matrix() command
# 3. take the absolute value via pandas' abs() command
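# A minimal sketch of the SparCC insertion described above, assuming the
# correlations were written as a TSV with feature IDs in the first column
# (the file name is hypothetical). Note that pandas' as_matrix(), mentioned
# in step 2, was removed in pandas 1.0; to_numpy() is the current equivalent.
import pandas as pd

def load_sparcc_corr(path):
    sparcc_df = pd.read_csv(path, sep='\t', index_col=0)  # 1. read the file
    return np.abs(sparcc_df.to_numpy())                   # 2. + 3. matrix, abs

# sparcc_corr_mat = load_sparcc_corr(res_folder + 'sparcc_corr.txt')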
def synthetic_over_sampling(table: biom.Table,
                            metadata: NumericMetadataColumn,
                            concatenate_meta: Str,
                            method: Str = 'SMOTETomek',
                            k_neighbors: Int = 5,
                            m_neighbors: Int = 10,
                            n_jobs: Int = 1,
                            log_fp: Str = None,
                            sampling_strategy: Str = 'auto',
                            random_state: Int = 42,
                            output_log_fp: Str = None) -> biom.Table:
    if log_fp:
        logger_ins = LOG(
            log_fp=log_fp).get_logger('synthetic_sampling_combination')
        logger_ins.info("The parameters used for oversampling are")
        logger_ins.info('k_neighbors: %s', k_neighbors)
        logger_ins.info('m_neighbors: %s', m_neighbors)
        logger_ins.info('Sampling method: %s', method)
        logger_ins.info('Output log file path: %s', log_fp)
        logger_ins.info('sampling_strategy: %s', sampling_strategy)
        logger_ins.info('n_jobs: %s', n_jobs)
        logger_ins.info('random_state: %s', random_state)
    # validate the method before indexing into the dispatcher; the original
    # membership check came after the lookup and so could never fire
    if method not in dispatcher:
        raise ValueError('The optional methods for over sampling are %s; '
                         'instead it received %s'
                         % (list(dispatcher.keys()), method))
    cls = dispatcher[method]
    if method != 'RandomOverSampler':
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for "
                "oversampling")
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    matrix_data = sorted_table.matrix_data.transpose()
    # len() is ambiguous for sparse matrices, so track row counts via shape
    n_orig = matrix_data.shape[0]
    if method == 'ADASYN':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state,
                                n_neighbors=k_neighbors, n_jobs=n_jobs)
    elif method == 'RandomOverSampler':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state)
    else:
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                m_neighbors=m_neighbors,
                                random_state=random_state, n_jobs=n_jobs,
                                k_neighbors=k_neighbors)
    X_resampled, y_resampled = over_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    n_resampled = X_resampled.shape[0]
    # the original rows and labels must pass through the resampler unchanged;
    # compare against the sorted metadata that was actually fed to it
    if np.sum(np.abs(X_resampled[:n_orig, :] - matrix_data)) != 0 or \
            np.sum(y_resampled[:n_orig] == sorted_metadata) != n_orig:
        raise ValueError(
            "Over sampling method changed the data! Please double check "
            "your biom table")
    else:
        if log_fp:
            logger_ins.info(
                "The oversampling finished successfully!")
            logger_ins.info(
                "The first %d samples belong to the original training "
                "samples and the next %d samples belong to the new ones",
                n_orig, n_resampled - n_orig)
            logger_ins.info("Overall, the size of data is %d", n_resampled)
    if method != 'RandomOverSampler':
        dummy_samples = np.asarray(
            list(sorted_table.ids('sample')) +
            ["dummy_sample_" + str(i)
             for i in range(n_resampled - n_orig)])
    else:
        # sample_indices_ gives the (possibly repeated) source row of each
        # output sample; the indices themselves serve as unique IDs
        dummy_samples = over_sampling_cls.sample_indices_
    # biom tables are observation-major, so the resampled samples-by-features
    # matrix is transposed before construction
    oversampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    oversampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    oversampled_metadata.index.names = ['#SampleID']
    oversampled_metadata.columns = ['label']
    oversampled_meta = qiime2.Metadata(oversampled_metadata)
    # Metadata.save requires a destination; the original call passed nothing
    oversampled_meta.save(concatenate_meta)
    return oversampled_table
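# The dispatcher used above is defined elsewhere; this is a plausible sketch
# based on the constructor arguments each branch passes (ADASYN takes
# n_neighbors, the k_neighbors/m_neighbors branch matches imbalanced-learn's
# BorderlineSMOTE or SVMSMOTE, and RandomOverSampler takes neither). The
# declared default, 'SMOTETomek', lives in imblearn.combine with different
# keyword arguments, so it is omitted from this sketch.
from imblearn.over_sampling import (ADASYN, BorderlineSMOTE,
                                    RandomOverSampler, SVMSMOTE)

dispatcher = {'ADASYN': ADASYN,
              'RandomOverSampler': RandomOverSampler,
              'BorderlineSMOTE': BorderlineSMOTE,
              'SVMSMOTE': SVMSMOTE}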
def _drop_axis_metadata(table):
    return biom.Table(table.matrix_data,
                      observation_ids=table.ids(axis='observation'),
                      sample_ids=table.ids(axis='sample'))
def test_basic(self):
    obs = self.cmd(self.uc_minimal)
    expected = biom.Table(np.array([[1.0]]),
                          observation_ids=['f2_1539'],
                          sample_ids=['f2'])
    self.assertEqual(obs, expected)
def test_receives_empty_table(self):
    empty_table = biom.Table(np.array([]), [], [])
    with self.assertRaisesRegex(ValueError, "empty"):
        faith_pd(table=empty_table, phylogeny=self.input_tree)
def test_one_to_one_rename(self):
    sample_mc = qiime2.CategoricalMetadataColumn(
        pd.Series(['a_new', 'b_new', 'c_new'], name='foo',
                  index=pd.Index(['a', 'b', 'c'], name='sampleid')))
    original_sample_ids = sample_mc.to_series().index
    new_sample_ids = list(sample_mc.to_series())
    feature_mc = qiime2.CategoricalMetadataColumn(
        pd.Series(['x_new', 'y_new'], name='foo',
                  index=pd.Index(['x', 'y'], name='featureid')))
    original_feature_ids = feature_mc.to_series().index
    new_feature_ids = list(feature_mc.to_series())
    data = np.array([[1, 2, 3], [30, 20, 10]])
    table = biom.Table(data, sample_ids=original_sample_ids,
                       observation_ids=original_feature_ids)

    # Sample renames
    expected = biom.Table(data, sample_ids=new_sample_ids,
                          observation_ids=original_feature_ids)
    # Sample x Sum
    result = group(table, axis='sample', metadata=sample_mc, mode='sum')
    self.assertEqual(expected, result)
    # Sample x Mean
    result = group(table, axis='sample', metadata=sample_mc,
                   mode='mean-ceiling')
    self.assertEqual(expected, result)
    # Sample x Median
    result = group(table, axis='sample', metadata=sample_mc,
                   mode='median-ceiling')
    self.assertEqual(expected, result)

    # Feature renames
    expected = biom.Table(data, sample_ids=original_sample_ids,
                          observation_ids=new_feature_ids)
    # Feature x Sum
    result = group(table, axis='feature', metadata=feature_mc, mode='sum')
    self.assertEqual(expected, result)
    # Feature x Mean
    result = group(table, axis='feature', metadata=feature_mc,
                   mode='mean-ceiling')
    self.assertEqual(expected, result)
    # Feature x Median
    result = group(table, axis='feature', metadata=feature_mc,
                   mode='median-ceiling')
    self.assertEqual(expected, result)
def test_uc(self):
    obs = self.cmd(self.uc)
    expected = biom.Table(np.array([[1.0, 1.0], [0.0, 1.0]]),
                          observation_ids=['f2_1539', 'f3_1540'],
                          sample_ids=['f2', 'f3'])
    self.assertEqual(obs, expected)
def test_alpha_unknown_metric(self):
    t = biom.Table(np.array([[0, 1, 3], [1, 1, 2]]),
                   ['O1', 'O2'], ['S1', 'S2', 'S3'])
    with self.assertRaises(ValueError):
        alpha(table=t, metric='not-a-metric')
def test_alpha_empty_table(self):
    t = biom.Table(np.array([]), [], [])
    with self.assertRaisesRegex(ValueError, "empty"):
        alpha(table=t, metric='observed_otus')
def test_alpha_phylo_metric(self):
    t = biom.Table(np.array([[0, 1, 3], [1, 1, 2]]),
                   ['O1', 'O2'], ['S1', 'S2', 'S3'])
    with self.assertRaises(ValueError):
        alpha(table=t, metric='faith_pd')
def synthetic_under_sampling(table: biom.Table,
                             metadata: NumericMetadataColumn,
                             concatenate_meta_fp: Str,
                             method: Str = 'RandomUnderSampler',
                             voting: Str = 'auto',
                             n_jobs: Int = 1,
                             sampling_strategy: Str = 'auto',
                             random_state: Int = 42,
                             output_log_fp: Str = None) -> biom.Table:
    log_fp = tempfile.mktemp()
    print("The log file will be written into", log_fp)
    if log_fp:
        logger_ins = LOG(log_fp=log_fp).get_logger('synthetic_under_sampling')
        logger_ins.info("The parameters used for under-sampling are")
        logger_ins.info('voting (will be used with ClusterCentroids only): '
                        '%s', voting)
        logger_ins.info('Sampling method: %s', method)
        logger_ins.info('Output log file path: %s', log_fp)
        logger_ins.info('sampling_strategy: %s', sampling_strategy)
        logger_ins.info('n_jobs: %s', n_jobs)
        logger_ins.info('random_state: %s', random_state)
    # validate the method before indexing into the dispatcher; the original
    # membership check came after the lookup and so could never fire
    if method not in dispatcher:
        raise ValueError('The optional methods for under sampling are %s; '
                         'instead it received %s'
                         % (list(dispatcher.keys()), method))
    cls = dispatcher[method]
    if method != 'RandomUnderSampler':
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for "
                "under-sampling")
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    # use an ndarray rather than np.matrix; scikit-learn rejects np.matrix
    matrix_data = sorted_table.matrix_data.transpose().toarray()
    if method == 'RandomUnderSampler':
        under_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 replacement=False)
    else:
        raise NotImplementedError("Method %s is not implemented yet" % method)
    X_resampled, y_resampled = under_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    if log_fp:
        logger_ins.info("The under-sampling finished successfully!")
        logger_ins.info("Overall, the size of data is %d", len(X_resampled))
    if method == 'RandomUnderSampler':
        dummy_samples_ids = under_sampling_cls.sample_indices_
        dummy_samples = []
        orig_samples = sorted_table.ids('sample')
        for sample_id in dummy_samples_ids:
            dummy_samples.append(orig_samples[sample_id])
    else:
        raise NotImplementedError("Method %s is not implemented yet" % method)
    under_sampling_dummy = sorted_table.filter(ids_to_keep=dummy_samples,
                                               inplace=False)
    under_sampling_dummy = under_sampling_dummy.sort_order(
        order=dummy_samples, axis='sample')
    # compare absolute differences so positive and negative deviations
    # cannot cancel out
    if method == "RandomUnderSampler" and np.sum(np.abs(
            under_sampling_dummy.matrix_data.transpose() -
            X_resampled)) != 0:
        raise ValueError("The undersampling changed the matrix data")
    undersampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    undersampled_metadata = pd.DataFrame(index=dummy_samples,
                                         data=y_resampled)
    undersampled_metadata.index.names = ['#SampleID']
    undersampled_metadata.columns = ['label']
    undersampled_meta = qiime2.Metadata(undersampled_metadata)
    undersampled_meta.save(concatenate_meta_fp)
    # only copy the log if a destination was actually provided
    if log_fp and output_log_fp:
        shutil.copy(log_fp, output_log_fp)
    return undersampled_table
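# Likewise, the under-sampling dispatcher is defined elsewhere. Only
# RandomUnderSampler is implemented above, and the 'voting' parameter hints
# that ClusterCentroids was planned; a minimal sketch consistent with that:
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler

dispatcher = {'RandomUnderSampler': RandomUnderSampler,
              'ClusterCentroids': ClusterCentroids}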
if __name__ == '__main__':
    deblur_table = Artifact.load(sys.argv[1]).view(Table)
    inv_otu_map = get_inv_otu_map(
        [l.strip().split('\t') for l in open(sys.argv[2])])
    output = sys.argv[3]

    new_table_rcv = defaultdict(int)
    ggid_to_index = {}
    sample_to_index = {}
    for deblur_seq, sample in deblur_table.nonzero():
        ggid = inv_otu_map.get(deblur_seq, None)
        # if the deblurred sequence isn't in the OTU map (i.e., didn't
        # recruit)
        if ggid is None:
            continue
        ggid_idx = next_index(ggid_to_index, ggid)
        sample_idx = next_index(sample_to_index, sample)
        value = deblur_table.get_value_by_ids(deblur_seq, sample)
        new_table_rcv[(ggid_idx, sample_idx)] += value

    ggid_order = index_to_order(ggid_to_index)
    sample_order = index_to_order(sample_to_index)
    new_table = biom.Table(new_table_rcv, ggid_order, sample_order)
    Artifact.import_data("FeatureTable[Frequency]", new_table).save(output)
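# get_inv_otu_map, next_index, and index_to_order are not defined in this
# snippet; the following are hypothetical implementations consistent with
# how the script uses them, not the original helpers.
def get_inv_otu_map(otu_map_rows):
    """Invert OTU map rows of [otu_id, member_seq, ...] to {seq: otu_id}."""
    return {seq: row[0] for row in otu_map_rows for seq in row[1:]}

def next_index(lookup, key):
    """Return a stable integer index for key, assigning the next free one."""
    if key not in lookup:
        lookup[key] = len(lookup)
    return lookup[key]

def index_to_order(lookup):
    """Return the keys ordered by their assigned index positions."""
    return [key for key, _ in sorted(lookup.items(), key=lambda kv: kv[1])]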
def __init__(self,
             table: biom.Table,
             features: pd.DataFrame,
             variances: biom.Table = None,
             formatter: Optional['Formatter'] = None,
             rank_level: int = 1):
    """Establish the taxonomy data

    Parameters
    ----------
    table : biom.Table
        Relative abundance data per sample or collapsed into higher
        order entities (e.g., abx in the past year)
    features : pd.DataFrame
        DataFrame relating an observation to a Taxon
    variances : biom.Table, optional
        Variation information about a taxon within a label.
    formatter : Formatter, optional
        The formatter to apply to taxon names; defaults to
        GreengenesFormatter.
    rank_level : int
        The taxonomic level (depth) to compute ranks over.
        Level 0 is domain, level 1 is phylum, etc.
    """
    self._table = table.norm(inplace=False)
    self._group_id_lookup = set(self._table.ids())
    self._feature_id_lookup = set(self._table.ids(axis='observation'))
    self._feature_order = self._table.ids(axis='observation')
    self._features = features

    if variances is None:
        empty = ss.csr_matrix(
            (len(self._table.ids(axis='observation')),
             len(self._table.ids())), dtype=float)
        self._variances = biom.Table(empty,
                                     self._table.ids(axis='observation'),
                                     self._table.ids())
    else:
        self._variances = variances

    if set(self._variances.ids()) != set(self._table.ids()):
        raise DisjointError("Table and variances are disjoint")
    if set(self._variances.ids(axis='observation')) != \
            set(self._table.ids(axis='observation')):
        raise DisjointError("Table and variances are disjoint")
    if not self._feature_id_lookup.issubset(set(self._features.index)):
        raise SubsetError("Table features are not a subset of the "
                          "taxonomy information")

    self._ranked, self._ranked_order = self._rankdata(rank_level)
    self._features = self._features.loc[self._feature_order]
    self._variances = self._variances.sort_order(self._feature_order,
                                                 axis='observation')

    if formatter is None:
        formatter: Formatter = GreengenesFormatter()
    self._formatter = formatter

    # initialize taxonomy tree
    tree_data = ((i, lineage.split('; '))
                 for i, lineage in self._features['Taxon'].items())
    self.taxonomy_tree = skbio.TreeNode.from_taxonomy(tree_data)
    for node in self.taxonomy_tree.traverse():
        node.length = 1
    self.bp_tree = parse_newick(str(self.taxonomy_tree))

    feature_taxons = self._features
    self._formatted_taxa_names = {
        i: self._formatter.dict_format(lineage)
        for i, lineage in feature_taxons['Taxon'].items()
    }
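# GreengenesFormatter is defined elsewhere; the only contract visible above
# is a dict_format(lineage) hook. A hypothetical stand-in for illustration,
# not the real formatter:
class _IdentityFormatter:
    """Map 'k__Bacteria; p__Firmicutes; ...' to {rank position: name}."""
    def dict_format(self, lineage):
        return {i: name.strip() for i, name in enumerate(lineage.split(';'))}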
def test_uc_w_mapping(self):
    obs = self.cmd(self.uc, self.rep_set)
    expected = biom.Table(np.array([[1.0, 1.0], [0.0, 1.0]]),
                          observation_ids=['otu1', 'otu2'],
                          sample_ids=['f2', 'f3'])
    self.assertEqual(obs, expected)
region2_align = \
    Artifact.import_data('FeatureData[KmerAlignment]', pd.DataFrame(
        data=np.array([['seq1', 'asv06', 15, 0, 2, 'Gotham'],
                       ['seq2', 'asv07', 15, 0, 2, 'Gotham'],
                       ['seq3', 'asv08', 15, 0, 2, 'Gotham'],
                       ['seq4', 'asv09', 15, 0, 2, 'Gotham'],
                       ['seq5', 'asv10', 15, 0, 2, 'Gotham'],
                       ['seq6', 'asv11', 15, 0, 2, 'Gotham']],
                      dtype=object),
        columns=['kmer', 'asv', 'length', 'mismatch', 'max-mismatch',
                 'region']
    ))
region1_counts = Artifact.import_data(
    'FeatureTable[Frequency]',
    biom.Table(
        np.array([[150, 0, 0, 50, 50],
                  [125, 50, 50, 25, 25],
                  [100, 0, 100, 50, 50]]).T,
        sample_ids=['sample1', 'sample2', 'sample3'],
        observation_ids=['asv01', 'asv02', 'asv03', 'asv04', 'asv05'],
    ))
region1_alt_counts = Artifact.import_data(
    'FeatureTable[Frequency]',
    biom.Table(
        np.array([[150, 0, 0, 50, 50, 50],
                  [125, 50, 50, 25, 25, 25],
                  [100, 0, 100, 50, 50, 0]]).T,
        sample_ids=['sample1', 'sample2', 'sample3'],
        observation_ids=['asv01', 'asv02', 'asv03', 'asv04', 'asv05',
                         'asv20'],
    ))
region2_counts = Artifact.import_data(
    'FeatureTable[Frequency]',
    biom.Table(
        data=np.array([[100, 50, 0, 50, 50, 50],
                       [100, 25, 100, 25, 25, 25],
                       [0, 100, 100, 0, 50, 50]]).T,
        sample_ids=['sample1', 'sample2', 'sample3'],
        # IDs assumed to mirror region2_align above (asv06 through asv11)
        observation_ids=['asv06', 'asv07', 'asv08', 'asv09', 'asv10',
                         'asv11'],
    ))
def _biom_from_samples(context, samples, get=None, normalize_taxonomy=None):
    """Create a BIOM table from an iterable of samples

    Parameters
    ----------
    context : str
        The context to obtain sample data from.
    samples : iterable of str
        The samples to fetch.
    get : a make_get instance, optional
        A constructed get method.
    normalize_taxonomy : list, optional
        The ranks to normalize a lineage to (e.g., [k, p, c, o, f, g, s])

    Returns
    -------
    biom.Table
        A Table populated with the found samples.
    dict
        A map of {sample_id_in_table: original_id}. This map can be used
        to identify which samples are ambiguous based on their original IDs.

    Redis command summary
    ---------------------
    HMGET <context>:feature-index-inverted
    EVALSHA <fetch-sample-sha1> 0 context <redbiom-id>
    """
    from operator import itemgetter
    import scipy.sparse as ss
    import biom
    import redbiom.admin
    import redbiom._requests
    import redbiom.util
    import redbiom

    config = redbiom.get_config()
    if get is None:
        get = redbiom._requests.make_get(config)
    se = redbiom._requests.make_script_exec(config)

    redbiom._requests.valid(context, get)

    samples = list(samples)  # unroll iterator if necessary

    # resolve ambiguities
    stable_ids, unobserved, ambig_assoc, rimap = \
        redbiom.util.resolve_ambiguities(context, samples, get)

    table_data = []
    unique_indices = set()
    fetch_sample = redbiom.admin.ScriptManager.get('fetch-sample')
    for id_ in rimap:
        # 0 -> we're passing 0 keys, and instead using ARGV
        data = se(fetch_sample, 0, context, id_)
        table_data.append((id_, data))
        unique_indices.update(data)

    # construct a mapping of
    # {feature ID: index position in the BIOM table}
    unique_indices_map = {observed: index
                          for index, observed in enumerate(unique_indices)}

    # pull out the feature and sample IDs in the desired ordering
    obs_ids = [id_ for id_, _ in sorted(unique_indices_map.items(),
                                        key=itemgetter(1))]
    sample_ids = [id_ for id_, _ in table_data]

    # fill in the matrix
    mat = ss.lil_matrix((len(unique_indices), len(table_data)))
    for col, (sample, col_data) in enumerate(table_data):
        # since this isn't dense, hopefully roworder doesn't hose us
        for obs_id, value in col_data.items():
            mat[unique_indices_map[obs_id], col] = value

    lineages = taxon_ancestors(context, obs_ids, get,
                               normalize=normalize_taxonomy)
    if lineages is not None:
        obs_md = [{'taxonomy': lineage} for lineage in lineages]
    else:
        obs_md = None

    table = biom.Table(mat, obs_ids, sample_ids, obs_md)
    table.update_ids(rimap)

    return table, ambig_assoc
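# Hypothetical usage (requires a populated redbiom/Redis deployment; the
# context name and sample IDs below are placeholders, not real values):
# tbl, ambig = _biom_from_samples('<context-name>',
#                                 ['<sample-id-1>', '<sample-id-2>'])
# 'tbl' holds the fetched counts; 'ambig' maps each sample ID in 'tbl'
# back to its original, possibly ambiguous, ID.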
def test_basic_w_mapping(self):
    obs = self.cmd(self.uc_minimal, self.rep_set)
    expected = biom.Table(np.array([[1.0]]),
                          observation_ids=['otu1'],
                          sample_ids=['f2'])
    self.assertEqual(obs, expected)
def setUp(self):
    super().setUp()
    self.empty_table = biom.Table(np.array([]), [], [])