Esempio n. 1
0
def rclr_test_table():
    """Build the shared test fixture: raw counts plus their rclr transform.

    Returns
    -------
    tuple
        ``(counts, counts_rclr)`` where ``counts`` is the dense array
        exported from the table returned by ``create_test_table`` and
        ``counts_rclr`` is the result of ``matrix_rclr(counts)``.
    """
    # densify the sparse matrix backing the generated test table
    dense_counts = create_test_table().matrix_data.toarray()
    counts = np.array(dense_counts)
    # matrix_rclr has its own tests elsewhere; here it only supplies
    # the input consumed by the OptSpace tests
    return counts, matrix_rclr(counts)
Esempio n. 2
0
 def test_errors(self):
     """Check error/warning paths of tensor building and the rclr helpers.

     Covers, in order: invalid inputs to ``tensor_rclr`` and
     ``matrix_rclr``, invalid inputs to ``tensor.construct`` (missing
     mapping columns, negative / inf / nan counts), and finally the
     warning plus count handling when a sample is repeated.
     """
     key_cols = ['ID', 'conditional']
     # unfold the reference tensor into a 2D count matrix
     flat_counts = self.tensor_true.transpose([0, 2, 1]).reshape(9, 2)
     # mapping + table frames from which the tensor is rebuilt
     id_by_condition = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                 [0, 1, 2, 0, 1, 2, 0, 1, 2]])
     sample_map = pd.DataFrame(id_by_condition.T, columns=key_cols)
     counts_df = pd.DataFrame(flat_counts.T)
     # rebuild the tensor from the flattened data
     tensor = build()
     tensor.construct(counts_df, sample_map, 'ID', ['conditional'])

     # --- tensor_rclr input validation ---
     # fewer than 2 dimensions must raise ValueError
     with self.assertRaises(ValueError):
         tensor_rclr(np.array(range(3)))
     # negative counts must raise ValueError
     with self.assertRaises(ValueError):
         tensor_rclr(tensor.counts * -1)
     # inf, then nan, entries must raise ValueError
     for bad_value in (np.inf, np.nan):
         corrupted_tensor = self.tensor_true.astype(float)
         corrupted_tensor[corrupted_tensor <= 10] = bad_value
         with self.assertRaises(ValueError):
             tensor_rclr(corrupted_tensor)

     # --- matrix_rclr input validation ---
     # an already-built tensor is rejected
     with self.assertRaises(ValueError):
         matrix_rclr(self.tensor_true)
     # so are negative values
     with self.assertRaises(ValueError):
         matrix_rclr(self.tensor_true * -1)

     # --- tensor.construct input validation ---
     # mapping lacking the ID column, then the conditional column
     for missing_col in ('ID', 'conditional'):
         with self.assertRaises(ValueError):
             tensor.construct(counts_df,
                              sample_map.drop(columns=[missing_col]),
                              'ID', ['conditional'])
     # negative counts must raise ValueError
     with self.assertRaises(ValueError):
         tensor.construct(counts_df * -1, sample_map, 'ID',
                          ['conditional'])
     # inf, then nan, counts must raise ValueError
     for bad_value in (np.inf, np.nan):
         corrupted_df = counts_df.astype(float)
         corrupted_df[corrupted_df <= 10] = bad_value
         with self.assertRaises(ValueError):
             tensor.construct(corrupted_df, sample_map, 'ID',
                              ['conditional'])

     # --- repeated samples ---
     # add a tenth sample that duplicates the ID/conditional of sample 8
     counts_df[9] = counts_df[8] - 1
     sample_map.loc[9, key_cols] = sample_map.loc[8, key_cols]
     with self.assertWarns(Warning):
         tensor.construct(counts_df, sample_map, 'ID', ['conditional'])
     expected_counts = self.tensor_true.copy()
     expected_counts[2, :, 2] -= 1
     npt.assert_allclose(tensor.counts, expected_counts.astype(float))
Esempio n. 3
0
 def test_rclr_nan_raises(self):
     """matrix_rclr must raise ValueError for missing (nan) entries."""
     # self.bad3 is the nan-containing fixture (defined outside this view)
     self.assertRaises(ValueError, matrix_rclr, self.bad3)
Esempio n. 4
0
 def test_rclr_inf_raises(self):
     """matrix_rclr must raise ValueError for undefined entries."""
     # self.bad2 is the fixture with undefined values (defined outside
     # this view)
     self.assertRaises(ValueError, matrix_rclr, self.bad2)
Esempio n. 5
0
 def test_rclr_negative_raises(self):
     """matrix_rclr must raise ValueError for negative entries."""
     # self.bad1 is the fixture with negative values (defined outside
     # this view)
     self.assertRaises(ValueError, matrix_rclr, self.bad1)
Esempio n. 6
0
 def test_rclr_sparse(self):
     """matrix_rclr on data containing zeros matches the stored result."""
     # cdata2 is the zero-containing input; true2 its expected transform
     observed = matrix_rclr(self.cdata2)
     expected = self.true2
     npt.assert_allclose(observed, expected)
Esempio n. 7
0
 def test_rclr_dense(self):
     """With no zeros present, matrix_rclr must agree with plain clr."""
     observed = matrix_rclr(self.cdata1)
     # clr is given a copy of the fixture rather than the fixture itself
     expected = clr(self.cdata1.copy())
     npt.assert_allclose(observed, expected)
Esempio n. 8
0
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_OPTSPACE_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA with a matrix_rclr preprocessing step.

    This code will be run by both the standalone and QIIME 2 versions of
    gemelli.

    The table is filtered, transformed with ``matrix_rclr``, completed
    with ``MatrixCompletion`` (OptSpace), re-centered and re-factored by
    SVD. Filtering is performed on a copy, so the caller's ``table`` is
    not modified.

    Parameters
    ----------
    table : biom.Table
        Count table with features as observations and samples as samples.
    n_components : int or str
        Forwarded to ``MatrixCompletion``. The number of components
        actually used is read back from the fitted model.
    min_sample_count : int
        Samples are kept only if their total count is strictly greater
        than this value.
    min_feature_count : int
        Features are kept only if their total count is strictly greater
        than this value.
    min_feature_frequency : float
        Percentage (0-100). Features are kept only if the fraction of
        samples in which they are non-zero is strictly greater than
        ``min_feature_frequency / 100``. The fraction is taken over the
        number of samples in the table as passed in, before filtering.
    max_iterations : int
        Forwarded to ``MatrixCompletion``.

    Returns
    -------
    skbio.OrdinationResults
        Sample and feature loadings, eigenvalues and proportion explained.
        When exactly two components are used, an all-zero ``PC3`` is
        appended to each of these.
    skbio.DistanceMatrix
        Built from the fitted model's ``distance`` attribute, labelled by
        the retained sample ids.

    Raises
    ------
    ValueError
        If, after filtering, the sample ids or the feature ids contain
        duplicates.
    """
    # sample count of the table as passed in; frequency_filter below
    # deliberately uses this pre-filtering value as its denominator
    _, n_samples = table.shape

    # filter sample to min seq. depth
    def sample_filter(val, id_, md):
        return np.sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return np.sum(val) > min_feature_count

    # filter features by N samples presence
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # biom's Table.filter works in place by default, which would silently
    # alter the caller's table. The first filter therefore requests a new
    # table; the remaining filters can then safely modify that copy.
    filtered = table.filter(observation_filter, axis='observation',
                            inplace=False)
    filtered.filter(frequency_filter, axis='observation', inplace=True)
    filtered.filter(sample_filter, axis='sample', inplace=True)
    # table to dataframe (transposed: rows are samples, columns features)
    counts = pd.DataFrame(filtered.matrix_data.toarray(),
                          filtered.ids('observation'),
                          filtered.ids('sample')).T
    # check the table after filtering
    if len(counts.index) != len(set(counts.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(counts.columns) != len(set(counts.columns)):
        raise ValueError('Data-table contains duplicate columns')
    # Robust-clr (matrix_rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(
                               matrix_rclr(counts))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion (columns, then rows)
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection; the proportions are
    # normalised over ALL singular values before truncation
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    # save the loadings
    feature_loading = pd.DataFrame(v, index=counts.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=counts.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in gemelli -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res