Ejemplo n.º 1
0
    def test_OptSpace(self):
        """Check that MatrixCompletion agrees with the bare OptSpace solver.

        The wrapper is exercised twice, once through ``fit`` (reading the
        fitted attributes) and once through ``fit_transform`` (reading the
        returned triple). Both results are compared, column by column,
        with the output of ``OptSpace(...).solve``.
        """
        # one set of keyword arguments shared by wrapper and solver
        settings = dict(n_components=self.rank,
                        max_iterations=self.iteration,
                        tol=self.tol)

        # wrapper results: fitted object and fit_transform output
        opt = MatrixCompletion(**settings).fit(self.test_rclr)
        U_res, s_res, V_res = MatrixCompletion(**settings).fit_transform(
            self.test_rclr)
        # reference result, used to check that the wrapper
        # is not changing outcomes
        U_exp, s_exp, V_exp = OptSpace(**settings).solve(self.test_rclr)

        # (expected, observed) pairs, listed in the order they are checked
        comparisons = (
            (U_exp, opt.sample_weights),
            (s_exp, opt.s),
            (V_exp, opt.feature_weights),
            (U_exp, U_res),
            (s_exp, s_res),
            (V_exp, V_res),
        )
        # Only absolute values are compared here; more exact testing of
        # directionality is done in test_method.py
        # (c/o @cameronmartino's comment in #29).
        for col in range(self.rank):
            for expected, observed in comparisons:
                np.testing.assert_array_almost_equal(
                    abs(expected[:, col]), abs(observed[:, col]))
Ejemplo n.º 2
0
 def test_OptSpace_iter_raises(self):
     """Fitting with ``max_iterations=0`` is expected to raise ValueError."""
     raised = False
     try:
         # an iteration count of zero is too low
         MatrixCompletion(max_iterations=0).fit(self.test_rclr)
     except ValueError:
         raised = True
     if not raised:
         raise AssertionError("ValueError was not raised")
Ejemplo n.º 3
0
 def test_OptSpace_illformatted_raises(self):
     """Fitting on ``clr(self.test_table)`` is expected to raise ValueError.

     Per the original test notes, this is the "inf" case: the input is
     passed through ``clr`` rather than the rclr-transformed fixture.
     """
     raised = False
     try:
         MatrixCompletion().fit(clr(self.test_table))
     except ValueError:
         raised = True
     if not raised:
         raise AssertionError("ValueError was not raised")
Ejemplo n.º 4
0
 def test_OptSpace_rank_raises(self):
     """Fitting with an unusable ``n_components`` must raise ValueError."""
     # 1 is the "rank too low" case and 10000 the "rank way too high" case;
     # 100 is checked as well (presumably also too high for the fixture).
     for bad_rank in (1, 10000, 100):
         raised = False
         try:
             MatrixCompletion(n_components=bad_rank).fit(self.test_rclr)
         except ValueError:
             raised = True
         if not raised:
             raise AssertionError("ValueError was not raised")
Ejemplo n.º 5
0
def rpca(adata):
    """Run DEICODE matrix completion on ``adata`` and store a PCA of it.

    Steps, in order:

    1. genes are filtered in ``adata`` with ``sc.pp.filter_genes``;
    2. ``adata.raw.X`` is rclr-transformed and fitted with
       ``MatrixCompletion`` (10 iterations);
    3. the completed matrix is rebuilt from the fitted factors and
       centered along both axes;
    4. ``sc.tl.pca`` is run on the centered matrix and the result is stored
       in ``adata.obsm['X_deicode']``.

    NOTE(review): ``n_samples`` and ``n_comps`` are not defined inside this
    function; they are presumably module-level names -- confirm.

    Parameters
    ----------
    adata
        Object exposing ``raw.X`` and ``obsm`` (presumably an AnnData
        instance -- confirm against the caller).

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    from deicode.matrix_completion import MatrixCompletion
    from deicode.preprocessing import rclr
    # keep genes seen in at least 10% of n_samples, but never fewer than 3
    min_samples = max(3, np.floor(n_samples * 0.1))
    sc.pp.filter_genes(adata, min_cells=min_samples)
    X = rclr(adata.raw.X)
    opt = MatrixCompletion(n_components=n_comps, max_iterations=10).fit(X)
    # Rank of the fitted model, read from the fitted ``s`` matrix. It is
    # used for the PCA below so that the number of components always
    # matches what was actually fitted (the value was previously computed
    # but never used).
    n_components = opt.s.shape[0]
    # rebuild the completed matrix from its factors
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center columns, then rows
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    adata.obsm['X_deicode'] = sc.tl.pca(X,
                                        svd_solver='arpack',
                                        n_comps=n_components)
    return adata
Ejemplo n.º 6
0
def rpca(
    table: biom.Table,
    n_components: int = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA (rclr transform followed by OptSpace matrix completion).

    Shared by the standalone and the QIIME 2 versions of DEICODE.

    Parameters
    ----------
    table
        Count table; filtered, then converted to a samples-by-features
        dataframe.
    n_components
        Rank passed to ``MatrixCompletion`` and number of axes kept.
    min_sample_count
        Samples whose total count is not above this value are dropped.
    min_feature_count
        Features whose total count is not above this value are dropped.
    max_iterations
        Iteration limit passed to ``MatrixCompletion``.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination and the sample distance matrix.

    Raises
    ------
    ValueError
        If the filtered table has duplicate sample or feature labels.
    """
    # drop low-count features first, then shallow samples
    table = table.filter(
        lambda val, id_, md: sum(val) > min_feature_count,
        axis='observation')
    table = table.filter(
        lambda val, id_, md: sum(val) > min_sample_count,
        axis='sample')

    # samples as rows, features as columns
    frame = table.to_dataframe().T
    if len(set(frame.index)) != len(frame.index):
        raise ValueError('Data-table contains duplicate indices')
    if len(set(frame.columns)) != len(frame.columns):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    completion = MatrixCompletion(n_components=n_components,
                                  max_iterations=max_iterations)
    opt = completion.fit(rclr(frame))

    pc_labels = ['PC{}'.format(k) for k in range(1, n_components + 1)]

    # rebuild the completed matrix, then center columns and rows
    completed = opt.sample_weights @ opt.s @ opt.feature_weights.T
    col_centered = completed - completed.mean(axis=0)
    centered = col_centered - col_centered.mean(axis=1).reshape(-1, 1)

    # re-factor the centered matrix and keep the leading axes
    left, singular, right_t = svd(centered)
    # variance share uses every singular value before truncation
    explained = (singular**2 / np.sum(singular**2))[:n_components]
    feature_loading = pd.DataFrame(right_t.T[:, :n_components],
                                   index=frame.columns,
                                   columns=pc_labels)
    sample_loading = pd.DataFrame(left[:, :n_components],
                                  index=frame.index,
                                  columns=pc_labels)
    proportion_explained = pd.Series(explained, index=pc_labels)
    # the "eigenvalues" reported are the leading singular values
    eigvals = pd.Series(singular[:n_components], index=pc_labels)

    # With exactly two components a PC3 axis of zeros is appended; see
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # and the discussion in DEICODE -- PR#29.
    if n_components == 2:
        for loading in (feature_loading, sample_loading):
            loading['PC3'] = [0] * len(loading.index)
        for series in (eigvals, proportion_explained):
            series.loc['PC3'] = 0

    # ordination results
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # distance matrix labelled with the sample ids
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
Ejemplo n.º 7
0
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA (rclr transform followed by OptSpace matrix completion).

    Shared by the standalone and the QIIME 2 versions of DEICODE.

    Parameters
    ----------
    table
        Count table; filtered, then converted to a samples-by-features
        dataframe.
    n_components
        Rank passed to ``MatrixCompletion``; the number of axes kept is
        read back from the fitted model.
    min_sample_count
        Samples whose total count is not above this value are dropped.
    min_feature_count
        Features whose total count is not above this value are dropped.
    min_feature_frequency
        Percentage (the value is divided by 100) of samples in which a
        feature must be present to be kept; measured against the sample
        count of the table as passed in.
    max_iterations
        Iteration limit passed to ``MatrixCompletion``.

    Returns
    -------
    The ordination results, the sample distance matrix, and the completed,
    centered matrix as a dataframe.

    NOTE(review): three objects are returned while the return annotation
    declares two -- confirm which one callers rely on.

    Raises
    ------
    ValueError
        If the filtered table has duplicate sample or feature labels.
    """
    # sample count before any filtering; used by the frequency filter
    _, n_samples = table.shape

    # features: minimum total count
    table = table.filter(
        lambda val, id_, md: sum(val) > min_feature_count,
        axis='observation')
    # features: minimum share of samples with a non-zero count
    table = table.filter(
        lambda val, id_, md: (
            (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)),
        axis='observation')
    # samples: minimum sequencing depth
    table = table.filter(
        lambda val, id_, md: sum(val) > min_sample_count,
        axis='sample')

    # samples as rows, features as columns
    frame = table.to_dataframe().T
    # sanity-check the labels after filtering
    if len(set(frame.index)) != len(frame.index):
        raise ValueError('Data-table contains duplicate indices')
    if len(set(frame.columns)) != len(frame.columns):
        raise ValueError('Data-table contains duplicate columns')

    # robust-clr (rclr) preprocessing and OptSpace (RPCA)
    completion = MatrixCompletion(n_components=n_components,
                                  max_iterations=max_iterations)
    opt = completion.fit(rclr(frame))
    # number of components of the fitted model
    rank = opt.s.shape[0]
    # axis labels for the skbio OrdinationResults
    pc_labels = ['PC{}'.format(k) for k in range(1, rank + 1)]

    # rebuild the completed matrix, then center columns and rows
    completed = opt.sample_weights @ opt.s @ opt.feature_weights.T
    col_centered = completed - completed.mean(axis=0)
    centered = col_centered - col_centered.mean(axis=1).reshape(-1, 1)

    # re-factor the centered matrix and keep the leading axes
    left, singular, right_t = svd(centered)
    # variance share uses every singular value before truncation
    explained = (singular**2 / np.sum(singular**2))[:rank]

    # completed matrix and loadings as labelled frames
    robust_clr = pd.DataFrame(centered,
                              index=frame.index,
                              columns=frame.columns)
    feature_loading = pd.DataFrame(right_t.T[:, :rank],
                                   index=frame.columns,
                                   columns=pc_labels)
    sample_loading = pd.DataFrame(left[:, :rank],
                                  index=frame.index,
                                  columns=pc_labels)
    proportion_explained = pd.Series(explained, index=pc_labels)
    # the "eigenvalues" reported are the leading singular values
    eigvals = pd.Series(singular[:rank], index=pc_labels)

    # With exactly two components a PC3 axis of zeros is appended; see
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # and the discussion in DEICODE -- PR#29.
    if rank == 2:
        for loading in (feature_loading, sample_loading):
            loading['PC3'] = [0] * len(loading.index)
        for series in (eigvals, proportion_explained):
            series.loc['PC3'] = 0

    # ordination results
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # distance matrix labelled with the sample ids
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res, robust_clr