def filter_features_conditionally(table: biom.Table,
                                  abundance: float,
                                  prevalence: float,
                                  ) -> biom.Table:
    """Jointly filter features on relative abundance and prevalence.

    A feature is retained when its per-sample relative abundance is at
    least ``abundance`` in at least ``prevalence`` (a fraction of the
    total sample count) samples.
    """
    _, n_samples = table.shape
    min_samples = prevalence * n_samples

    def _keep(values, id_, metadata):
        # Count the samples meeting the abundance cutoff and keep the
        # feature when enough samples pass.
        return (values >= abundance).sum() >= min_samples

    # Normalize a copy so the caller's table is never touched: biom
    # prefers to normalize in place, so copy + inplace=False is belt
    # and braces.
    normalized = table.copy().norm(axis='sample', inplace=False)
    normalized.filter(_keep, axis='observation', inplace=True)
    keep_ids = normalized.ids(axis='observation')
    return table.filter(keep_ids, axis='observation', inplace=False)
def filter_biom(table: biom.Table, th: float):
    """Filter a BIOM table by per-sample count or percentage threshold.

    Parameters
    ----------
    table : biom.Table
        BIOM table to filter.
    th : float
        Per-sample minimum abundance threshold. If >= 1, this value is
        an absolute count; if < 1, it is a fraction of sum of counts.

    Returns
    -------
    biom.Table
        Filtered BIOM table.
    """
    def _zero_below_cutoff(data, id_, md):
        # Absolute count when th >= 1, otherwise a fraction of the
        # sample's total counts.
        cutoff = th if th >= 1 else data.sum() * th
        data[data < cutoff] = 0
        return data

    filtered = table.copy()
    filtered.transform(_zero_below_cutoff, axis='sample')
    # Drop features that became all-zero after thresholding.
    filtered.remove_empty(axis='observation')
    return filtered
def rpca( table: biom.Table, rank: int = 3, min_sample_count: int = 500, min_feature_count: int = 10, iterations: int = 5 ) -> (skbio.OrdinationResults, skbio.DistanceMatrix): """ Runs RPCA with an rclr preprocessing step""" # filter sample to min depth def sample_filter(val, id_, md): return sum(val) > min_sample_count table = table.filter(sample_filter, axis='sample') table = table.to_dataframe().T.drop_duplicates() table = table.T[table.sum() > min_feature_count].T # rclr preprocessing and OptSpace (RPCA) opt = OptSpace(rank=rank, iteration=iterations).fit(rclr().fit_transform( table.copy())) rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)} # Feature Loadings feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns) feature_loading = feature_loading.rename(columns=rename_cols) feature_loading.sort_values('PC1', inplace=True, ascending=True) # Sample Loadings sample_loading = pd.DataFrame(opt.sample_weights, index=table.index) sample_loading = sample_loading.rename(columns=rename_cols) # % var explained proportion_explained = pd.Series(opt.explained_variance_ratio, index=list(rename_cols.values())) # eigan-vals eigvals = pd.Series(opt.eigenvalues, index=list(rename_cols.values())) # if the rank is two add PC3 of zeros if rank == 2: feature_loading['PC3'] = [0] * len(feature_loading.index) sample_loading['PC3'] = [0] * len(sample_loading.index) eigvals.loc['PC3'] = 0 proportion_explained.loc['PC3'] = 0 # save ordination results short_method_name = 'rpca_biplot' long_method_name = '(Robust Aitchison) RPCA Biplot' ord_res = skbio.OrdinationResults( short_method_name, long_method_name, eigvals.copy(), samples=sample_loading.copy(), features=feature_loading.copy(), proportion_explained=proportion_explained.copy()) # save distance matrix dist_res = skbio.stats.distance.DistanceMatrix(opt.distance, ids=sample_loading.index) return ord_res, dist_res
def __init__(
    self,
    otu_data: Table,
    sample_metadata: Optional[pd.DataFrame] = None,
    obs_metadata: Optional[pd.DataFrame] = None,
) -> None:
    """Build the object from a BIOM table plus optional metadata.

    Parameters
    ----------
    otu_data : biom.Table
        The OTU table; a copy is taken so the caller's table is not
        mutated.
    sample_metadata : pd.DataFrame, optional
        Per-sample metadata, validated then attached along the sample
        axis.
    obs_metadata : pd.DataFrame, optional
        Per-observation metadata, validated then attached along the
        observation axis.

    Raises
    ------
    TypeError
        If ``otu_data`` is not a ``biom.Table``.
    """
    if not isinstance(otu_data, Table):
        raise TypeError("Otu data must be of type `biom.Table`")
    otu_data_copy = otu_data.copy()
    # BUGFIX: truth-testing a pandas DataFrame raises
    # "ValueError: The truth value of a DataFrame is ambiguous",
    # so any call that actually supplied metadata used to crash.
    # Compare explicitly against None instead.
    if sample_metadata is not None:
        samplemeta_type = SamplemetaType()
        samplemeta_type.validate(sample_metadata)
        otu_data_copy.add_metadata(sample_metadata.to_dict(orient="index"),
                                   axis="sample")
    if obs_metadata is not None:
        obsmeta_type = ObsmetaType()
        obsmeta_type.validate(obs_metadata)
        otu_data_copy.add_metadata(obs_metadata.to_dict(orient="index"),
                                   axis="observation")
    # Validate the assembled table before storing it.
    biom_type = BiomType()
    biom_type.validate(otu_data_copy)
    self.otu_data = otu_data_copy
def check_alignment_discard(alignment: pd.DataFrame,
                            table: biom.Table,
                            max_mismatch: int = None,
                            discarded: bool = True,
                            ) -> biom.Table:
    """Filter a counts table against the ASVs from a sidle alignment.

    Parameters
    ----------
    alignment : pd.DataFrame
        The alignment matrix for the dataset; must have ``asv`` and
        ``mismatch`` columns.
    table : biom.Table
        The counts table for the region.
    max_mismatch : int, optional
        The maximum mismatch to use; when None, every ASV present in
        the alignment is taken.
    discarded : bool, optional
        Passed to the biom filter as ``invert``: when True (default)
        the aligned ASVs are excluded from the result, when False they
        are the ones kept.

    Returns
    -------
    biom.Table
        The filtered biom table.
    """
    # Pick the ASV ids to filter on, optionally restricted by the
    # mismatch ceiling.
    if max_mismatch is None:
        aligned = alignment['asv'].unique()
    else:
        within_bound = alignment['mismatch'] <= max_mismatch
        aligned = alignment.loc[within_bound, 'asv'].unique()

    # Copy first so the caller's table survives the in-place filter.
    return table.copy().filter(
        aligned,
        axis='observation',
        invert=discarded,
    )
def test_collapse_biom(self):
    """Exercise collapse_biom across one-to-one, partial, unmatched,
    many-to-one and many-to-many mappings, with and without
    normalization."""
    # 3-sample x 6-feature source table shared by most sub-cases.
    table = Table(*map(np.array, prep_table({
        'S1': {'G1': 4, 'G2': 5, 'G3': 8, 'G4': 0, 'G5': 3, 'G6': 0},
        'S2': {'G1': 1, 'G2': 8, 'G3': 0, 'G4': 7, 'G5': 4, 'G6': 2},
        'S3': {'G1': 0, 'G2': 2, 'G3': 3, 'G4': 5, 'G5': 0, 'G6': 9}})))

    # one-to-one mapping (e.g., direct translation)
    mapping = {'G1': ['H1'], 'G2': ['H2'], 'G3': ['H3'],
               'G4': ['H4'], 'G5': ['H5'], 'G6': ['H6']}
    obs = collapse_biom(table.copy(), mapping)
    exp = Table(*map(np.array, prep_table({
        'S1': {'H1': 4, 'H2': 5, 'H3': 8, 'H4': 0, 'H5': 3, 'H6': 0},
        'S2': {'H1': 1, 'H2': 8, 'H3': 0, 'H4': 7, 'H5': 4, 'H6': 2},
        'S3': {'H1': 0, 'H2': 2, 'H3': 3, 'H4': 5, 'H5': 0, 'H6': 9}})))
    self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

    # some missing, some extra: unmapped features are dropped and
    # mapping keys absent from the table are ignored
    mapping = {'G1': ['H1'], 'G2': ['H2'], 'G3': ['H3'], 'G9': ['H9']}
    obs = collapse_biom(table.copy(), mapping)
    exp = Table(*map(np.array, prep_table({
        'S1': {'H1': 4, 'H2': 5, 'H3': 8},
        'S2': {'H1': 1, 'H2': 8, 'H3': 0},
        'S3': {'H1': 0, 'H2': 2, 'H3': 3}})))
    self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

    # wrong mapping (no match): samples survive, features are emptied
    mapping = {'H1': ['I1'], 'H2': ['I2'], 'H3': ['I3']}
    obs = collapse_biom(table.copy(), mapping)
    self.assertTrue(obs.is_empty())
    self.assertListEqual(list(obs.ids('sample')), ['S1', 'S2', 'S3'])
    self.assertListEqual(list(obs.ids('observation')), [])

    # many-to-one mapping (e.g., taxonomic rank up): counts are summed
    # into the shared target
    mapping = {'G1': ['H1'], 'G2': ['H1'], 'G3': ['H2'],
               'G4': ['H2'], 'G5': ['H2'], 'G6': ['H3']}
    obs = collapse_biom(table.copy(), mapping)
    exp = Table(*map(np.array, prep_table({
        'S1': {'H1': 9, 'H2': 11, 'H3': 0},
        'S2': {'H1': 9, 'H2': 11, 'H3': 2},
        'S3': {'H1': 2, 'H2': 8, 'H3': 9}})))
    self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

    # many-to-many mapping (e.g., genes to pathways): each source
    # feature contributes its full count to every target
    mapping = {'G1': ['H1'],
               'G2': ['H1', 'H2'],
               'G3': ['H2', 'H3', 'H4'],
               'G4': ['H2', 'H5'],
               'G5': ['H4'],
               'G6': ['H3', 'H5']}
    obs = collapse_biom(table.copy(), mapping)
    exp = Table(*map(np.array, prep_table({
        'S1': {'H1': 9, 'H2': 13, 'H3': 8, 'H4': 11, 'H5': 0},
        'S2': {'H1': 9, 'H2': 15, 'H3': 2, 'H4': 4, 'H5': 9},
        'S3': {'H1': 2, 'H2': 10, 'H3': 12, 'H4': 3, 'H5': 14}})))
    self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

    # many-to-many mapping, with normalization: each contribution is
    # divided by the number of targets (values here are the rounded
    # results expected from collapse_biom)
    obs = collapse_biom(table.copy(), mapping, normalize=True)
    exp = Table(*map(np.array, prep_table({
        'S1': {'H1': 6, 'H2': 5, 'H3': 3, 'H4': 6, 'H5': 0},
        'S2': {'H1': 5, 'H2': 8, 'H3': 1, 'H4': 4, 'H5': 4},
        'S3': {'H1': 1, 'H2': 4, 'H3': 6, 'H4': 1, 'H5': 7}})))
    self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

    # nothing left after normalization: tiny counts split four ways
    # vanish, leaving an empty (but sample-preserving) table
    table = Table(*map(np.array, prep_table({
        'S1': {'G1': 0},
        'S2': {'G1': 1},
        'S3': {'G1': 2}})))
    mapping = {'G1': ['H1', 'H2', 'H3', 'H4']}
    obs = collapse_biom(table.copy(), mapping, normalize=True)
    self.assertTrue(obs.is_empty())
    self.assertListEqual(list(obs.ids('sample')), ['S1', 'S2', 'S3'])
    self.assertListEqual(list(obs.ids('observation')), [])