Ejemplo n.º 1
0
def filter_features_conditionally(table: biom.Table,
                                  abundance: float,
                                  prevalence: float,
                                  ) -> biom.Table:
    """
    Keep the features that are abundant enough in enough samples.

    A feature is retained when its within-sample relative abundance is at
    least `abundance` in at least `prevalence * number_of_samples` samples.

    Parameters
    ----------
    table : biom.Table
        The feature table to filter. It is left untouched: normalization
        runs on a copy and the final filter is not in-place.
    abundance : float
        Minimum relative abundance a feature needs in a sample for that
        sample to count toward the feature's prevalence.
    prevalence : float
        Fraction of the samples that must reach the abundance threshold.

    Returns
    -------
    biom.Table
        A new table holding the values of the input table (not the
        normalized ones), restricted to the retained features.
    """
    # Turn the prevalence fraction into a number of samples.
    min_samples = prevalence * table.shape[1]

    def _is_prevalent(values, id_, metadata):
        # `values` comes from the normalized table, so the comparison is
        # made on relative abundances.
        n_passing = (values >= abundance).sum()
        return n_passing >= min_samples

    # The explicit copy is redundant with inplace=False, but it guards
    # against biom normalizing the caller's table.
    relative = table.copy().norm(axis='sample', inplace=False)
    relative.filter(_is_prevalent, axis='observation', inplace=True)
    kept_ids = relative.ids(axis='observation')

    # Subset the *original* values to the features that passed.
    return table.filter(kept_ids, axis='observation', inplace=False)
Ejemplo n.º 2
0
def filter_biom(table: biom.Table, th: float):
    """Zero out low-abundance entries per sample, then drop empty features.

    Parameters
    ----------
    table : biom.Table
        BIOM table to filter. A copy is filtered; the input is not changed.
    th : float
        Per-sample minimum abundance threshold. A value >= 1 is used as an
        absolute count; a value < 1 is multiplied by the sample's total
        count to obtain the cutoff.

    Returns
    -------
    biom.Table
        Filtered copy of the table, with observations that ended up empty
        removed.
    """
    def _zero_below_cutoff(values, id_, md):
        # Fractional thresholds are relative to this sample's total.
        cutoff = values.sum() * th if th < 1 else th
        values[values < cutoff] = 0
        return values

    filtered = table.copy()
    filtered.transform(_zero_below_cutoff, axis='sample', inplace=True)
    filtered.remove_empty(axis='observation', inplace=True)
    return filtered
Ejemplo n.º 3
0
def rpca(
        table: biom.Table,
        rank: int = 3,
        min_sample_count: int = 500,
        min_feature_count: int = 10,
        iterations: int = 5
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA: rclr preprocessing followed by OptSpace.

    Parameters
    ----------
    table : biom.Table
        Count table. It is not modified; filtering happens on a copy.
    rank : int
        Rank passed to OptSpace, i.e. the number of components (PC1..PCn).
        When `rank` is 2, an all-zero PC3 is appended to every output.
    min_sample_count : int
        Samples whose total count is not strictly greater than this value
        are dropped.
    min_feature_count : int
        Features whose total count (after sample filtering) is not strictly
        greater than this value are dropped.
    iterations : int
        Iteration count passed to OptSpace.

    Returns
    -------
    skbio.OrdinationResults
        Sample and feature loadings, eigenvalues and proportion explained.
        Feature loadings are sorted ascending by PC1.
    skbio.DistanceMatrix
        The distance matrix reported by OptSpace, labeled by sample id.
    """

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # biom's Table.filter works in place by default, which would strip the
    # low-depth samples from the caller's table as a side effect.
    filtered = table.filter(sample_filter, axis='sample', inplace=False)

    # samples as rows, features as columns; identical sample rows are dropped
    counts = filtered.to_dataframe().T.drop_duplicates()
    counts = counts.T[counts.sum() > min_feature_count].T

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank, iteration=iterations).fit(rclr().fit_transform(
        counts.copy()))
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}
    pc_names = list(rename_cols.values())

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=counts.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=counts.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    # % var explained
    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=pc_names)
    # eigenvalues
    eigvals = pd.Series(opt.eigenvalues, index=pc_names)

    # if the rank is two add PC3 of zeros
    if rank == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
Ejemplo n.º 4
0
 def __init__(
     self,
     otu_data: Table,
     sample_metadata: Optional[pd.DataFrame] = None,
     obs_metadata: Optional[pd.DataFrame] = None,
 ) -> None:
     """Store a validated copy of `otu_data`, optionally with metadata.

     Parameters
     ----------
     otu_data : Table
         The count table. It is copied, so the caller's table is not
         changed when metadata is attached.
     sample_metadata : pd.DataFrame, optional
         Sample metadata; each row is attached to the sample whose id
         matches the row's index label.
     obs_metadata : pd.DataFrame, optional
         Observation metadata; each row is attached to the observation
         whose id matches the row's index label.

     Raises
     ------
     TypeError
         If `otu_data` is not a `biom.Table`.

     Notes
     -----
     The `validate` calls presumably raise on invalid input; their exact
     behavior is defined by the validator types, not here.
     """
     if not isinstance(otu_data, Table):
         raise TypeError("Otu data must be of type `biom.Table`")
     otu_data_copy = otu_data.copy()
     # Compare against None explicitly: the truth value of a DataFrame is
     # ambiguous and `if sample_metadata:` raises ValueError.
     if sample_metadata is not None:
         samplemeta_type = SamplemetaType()
         samplemeta_type.validate(sample_metadata)
         otu_data_copy.add_metadata(sample_metadata.to_dict(orient="index"),
                                    axis="sample")
     if obs_metadata is not None:
         obsmeta_type = ObsmetaType()
         obsmeta_type.validate(obs_metadata)
         otu_data_copy.add_metadata(obs_metadata.to_dict(orient="index"),
                                    axis="observation")
     # Validate the combined table only after the metadata is attached.
     biom_type = BiomType()
     biom_type.validate(otu_data_copy)
     self.otu_data = otu_data_copy
Ejemplo n.º 5
0
def check_alignment_discard(
    alignment: pd.DataFrame,
    table: biom.Table,
    max_mismatch: int = None,
    discarded: bool = True,
) -> biom.Table:
    """
    Filters a table by the sequences retained during a sidle alignment

    Parameters
    ----------
    alignment: pd.DataFrame
        The alignment matrix for the dataset. The `asv` column is read,
        and the `mismatch` column too when `max_mismatch` is given.
    table : biom.Table
        The counts table for the region. A copy is filtered; the input is
        not modified.
    max_mismatch: int, optional
        The maximum mismatch to use. When None, every ASV present in the
        alignment is treated as aligned.
    discarded: bool, optional
        Passed to the table filter as `invert`. When True, the aligned
        ASVs are removed so that the table holds the features that were
        discarded; when False, only the aligned ASVs are kept.

    Returns
    -------
    biom.Table
        The filtered biom table
    """
    if max_mismatch is not None:
        # Only alignments within the mismatch budget count as aligned.
        within_budget = alignment['mismatch'] <= max_mismatch
        asv_column = alignment.loc[within_budget, 'asv']
    else:
        asv_column = alignment['asv']
    aligned_ids = asv_column.unique()

    result = table.copy()
    result.filter(aligned_ids, axis='observation', invert=discarded)
    return result
Ejemplo n.º 6
0
    def test_collapse_biom(self):
        """Exercise `collapse_biom` with several kinds of feature mapping,
        with and without normalization."""
        sample_ids = ('S1', 'S2', 'S3')

        def make_table(features, *per_sample_counts):
            # One tuple of counts per sample (in `sample_ids` order), each
            # listing the counts of `features` in order.
            return Table(*map(np.array, prep_table({
                sid: dict(zip(features, counts))
                for sid, counts in zip(sample_ids, per_sample_counts)})))

        def assert_tables_match(obs, exp):
            self.assertEqual(obs.descriptive_equality(exp),
                             'Tables appear equal')

        def assert_nothing_left(obs):
            # All observations gone, but the samples are still listed.
            self.assertTrue(obs.is_empty())
            self.assertListEqual(
                list(obs.ids('sample')), ['S1', 'S2', 'S3'])
            self.assertListEqual(list(obs.ids('observation')), [])

        genes = ('G1', 'G2', 'G3', 'G4', 'G5', 'G6')
        table = make_table(genes,
                           (4, 5, 8, 0, 3, 0),
                           (1, 8, 0, 7, 4, 2),
                           (0, 2, 3, 5, 0, 9))

        # one-to-one mapping (e.g., direct translation)
        mapping = {
            'G1': ['H1'],
            'G2': ['H2'],
            'G3': ['H3'],
            'G4': ['H4'],
            'G5': ['H5'],
            'G6': ['H6']
        }
        obs = collapse_biom(table.copy(), mapping)
        exp = make_table(('H1', 'H2', 'H3', 'H4', 'H5', 'H6'),
                         (4, 5, 8, 0, 3, 0),
                         (1, 8, 0, 7, 4, 2),
                         (0, 2, 3, 5, 0, 9))
        assert_tables_match(obs, exp)

        # some missing, some extra
        mapping = {'G1': ['H1'], 'G2': ['H2'], 'G3': ['H3'], 'G9': ['H9']}
        obs = collapse_biom(table.copy(), mapping)
        exp = make_table(('H1', 'H2', 'H3'),
                         (4, 5, 8),
                         (1, 8, 0),
                         (0, 2, 3))
        assert_tables_match(obs, exp)

        # wrong mapping (no match)
        mapping = {'H1': ['I1'], 'H2': ['I2'], 'H3': ['I3']}
        obs = collapse_biom(table.copy(), mapping)
        assert_nothing_left(obs)

        # many-to-one mapping (e.g., taxonomic rank up)
        mapping = {
            'G1': ['H1'],
            'G2': ['H1'],
            'G3': ['H2'],
            'G4': ['H2'],
            'G5': ['H2'],
            'G6': ['H3']
        }
        obs = collapse_biom(table.copy(), mapping)
        exp = make_table(('H1', 'H2', 'H3'),
                         (9, 11, 0),
                         (9, 11, 2),
                         (2, 8, 9))
        assert_tables_match(obs, exp)

        # many-to-many mapping (e.g., genes to pathways)
        mapping = {
            'G1': ['H1'],
            'G2': ['H1', 'H2'],
            'G3': ['H2', 'H3', 'H4'],
            'G4': ['H2', 'H5'],
            'G5': ['H4'],
            'G6': ['H3', 'H5']
        }
        pathways = ('H1', 'H2', 'H3', 'H4', 'H5')
        obs = collapse_biom(table.copy(), mapping)
        exp = make_table(pathways,
                         (9, 13, 8, 11, 0),
                         (9, 15, 2, 4, 9),
                         (2, 10, 12, 3, 14))
        assert_tables_match(obs, exp)

        # many-to-many mapping, with normalization
        obs = collapse_biom(table.copy(), mapping, normalize=True)
        exp = make_table(pathways,
                         (6, 5, 3, 6, 0),
                         (5, 8, 1, 4, 4),
                         (1, 4, 6, 1, 7))
        assert_tables_match(obs, exp)

        # nothing left after normalization
        table = make_table(('G1',), (0,), (1,), (2,))
        mapping = {'G1': ['H1', 'H2', 'H3', 'H4']}
        obs = collapse_biom(table.copy(), mapping, normalize=True)
        assert_nothing_left(obs)