Beispiel #1
0
def rpca(in_biom: str, output_dir: str,
         min_sample_depth: int, rank: int) -> None:
    """Run RPCA with an rclr preprocessing step and write the results.

    Three files are written into ``output_dir`` (the directory itself is
    not created here):

    * ``rclr_OTUtable.txt`` -- tab-separated table rebuilt from the
      ``OptSpace`` factors of the rclr-transformed data,
    * ``RPCA_Ordination.txt`` -- the ``OrdinationResults`` object,
    * ``RPCA_distance.txt`` -- the distance matrix between samples.

    Parameters
    ----------
    in_biom : str
        Path passed to ``load_table``.
    output_dir : str
        Directory receiving the output files.
    min_sample_depth : int
        A sample is kept only when its total count is strictly greater
        than this value.
    rank : int
        Rank given to ``OptSpace`` for the ordination; also the number
        of ``PC`` column labels that are generated.
    """
    # import table
    table = load_table(in_biom)

    # filter samples to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_depth

    table = table.filter(sample_filter, axis='sample')
    # samples become rows; identical rows are dropped
    table = table.to_dataframe().T.drop_duplicates()

    # rclr + OptSpace with default settings, used only for the saved
    # transformed OTU table
    tablefit = rclr().fit_transform(table.copy())
    U, s, V = OptSpace().fit_transform(tablefit)
    tablefit = np.dot(np.dot(U, s), V.T)
    tablefit = pd.DataFrame(tablefit.T, index=table.columns,
                            columns=table.index)
    # to_csv opens the path itself, so no separate file handle is kept
    # open around the call
    tablefit.to_csv(os.path.join(output_dir, 'rclr_OTUtable.txt'),
                    sep='\t', index_label='OTU_ID')

    # rclr preprocessing and OptSpace (RPCA) with the requested rank
    opt = OptSpace(rank=rank).fit(rclr().fit_transform(table.copy()))
    # positional column number -> 'PC1', 'PC2', ...
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(rename_cols.values()))
    eigvals = pd.Series(opt.eigenvalues,
                        index=list(rename_cols.values()))
    # save ordination results
    ord_res = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals.copy(),
        sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # write files to output folder
    ord_res.write(os.path.join(output_dir, 'RPCA_Ordination.txt'))
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)
    dist_res.write(os.path.join(output_dir, 'RPCA_distance.txt'))
Beispiel #2
0
def rpca(
        table: biom.Table,
        rank: int = 3,
        min_sample_count: int = 500,
        min_feature_count: int = 10,
        iterations: int = 5
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA (rclr transform followed by OptSpace) on a count table.

    Samples whose total count does not exceed ``min_sample_count`` and
    features whose total count does not exceed ``min_feature_count`` are
    removed first. Returns the ordination (biplot) results together with
    the distance matrix taken from the fitted ``OptSpace`` object.
    """

    def deep_enough(val, id_, md):
        # keep a sample only if its total count exceeds the threshold
        return sum(val) > min_sample_count

    table = table.filter(deep_enough, axis='sample')
    # samples as rows, duplicated rows removed
    table = table.to_dataframe().T.drop_duplicates()
    # keep the features whose summed count exceeds the threshold
    table = table.T[table.sum() > min_feature_count].T

    # rclr preprocessing, then OptSpace (RPCA)
    transformed = rclr().fit_transform(table.copy())
    opt = OptSpace(rank=rank, iteration=iterations).fit(transformed)

    # 'PC1' ... 'PC<rank>' plus the positional -> label mapping
    pc_labels = ['PC' + str(k + 1) for k in range(rank)]
    rename_cols = dict(enumerate(pc_labels))

    # feature loadings, ordered by their PC1 value
    feature_loading = (
        pd.DataFrame(opt.feature_weights, index=table.columns)
        .rename(columns=rename_cols)
        .sort_values('PC1', ascending=True)
    )

    # sample loadings
    sample_loading = pd.DataFrame(
        opt.sample_weights, index=table.index).rename(columns=rename_cols)

    # proportion of variance explained
    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(pc_labels))
    # eigenvalues
    eigvals = pd.Series(opt.eigenvalues, index=list(pc_labels))

    # a rank-two result is padded with an all-zero PC3
    if rank == 2:
        feature_loading['PC3'] = [0] * feature_loading.shape[0]
        sample_loading['PC3'] = [0] * sample_loading.shape[0]
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # ordination results
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # distance matrix labelled with the sample ids
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res
Beispiel #3
0
def tensor_rclr(T):
    """Apply the rclr transform to a 3-way array through a 2-D unfolding.

    Each slice ``T[i]`` is transposed and the slices are stacked
    vertically, the stacked matrix is rclr-transformed, and the result
    is cut back into equally sized row blocks that are stacked along a
    new third axis. NaN entries of the result are replaced by zero.
    """
    n_slices = T.shape[0]
    block_rows = T.shape[-1]

    # flatten: one transposed slice below the other
    stacked = np.concatenate(
        [T[k, :, :].T for k in range(n_slices)], axis=0)

    # transform the 2-D unfolding
    transformed = rclr().fit_transform(stacked)

    # reshape: consecutive row blocks become the depth slices
    blocks = []
    for k in range(n_slices):
        blocks.append(transformed[k * block_rows:(k + 1) * block_rows])
    T_rclr = np.dstack(blocks)

    T_rclr[np.isnan(T_rclr)] = 0
    return T_rclr
Beispiel #4
0
    def setUp(self):
        """Create the arrays and transformer objects shared by the tests."""
        # strictly positive counts (no zeros)
        self.cdata1 = np.array([[2, 2, 6], [4, 4, 2]])
        # counts containing zeros, kept as a plain nested list
        self.cdata2 = [[3, 3, 0], [0, 4, 2]]
        # presumably the expected transform of cdata2 (verify against the
        # tests that use it); NaN sits exactly where cdata2 is zero
        self.true2 = np.array([[0.0, 0.0, np.nan],
                               [np.nan, 0.34657359, -0.34657359]])

        # vector with a negative entry
        self.bad1 = np.array([1, 2, -1])
        self._rclr = rclr()
        self._inv = inverse_rclr()
Beispiel #5
0
def rpca(adata):
    """Compute a DEICODE-based embedding and store it on ``adata``.

    The result of ``sc.tl.pca`` on the re-centered, completed matrix is
    written to ``adata.obsm['X_deicode']`` and ``adata`` is returned.

    NOTE(review): ``n_samples`` and ``n_comps`` are read below but are
    neither parameters nor assigned inside this function; they have to
    come from an enclosing/module scope -- confirm, otherwise these
    lines raise ``NameError``.
    """
    from deicode.matrix_completion import MatrixCompletion
    from deicode.preprocessing import rclr
    # at least 3, otherwise 10% of n_samples (np.floor yields a float)
    min_samples = max(3, np.floor(n_samples * 0.1))
    sc.pp.filter_genes(adata, min_cells=min_samples)
    # the transform reads adata.raw.X
    # NOTE(review): confirm the gene filter above is meant to affect it
    X = rclr(adata.raw.X)
    opt = MatrixCompletion(n_components=n_comps, max_iterations=10).fit(X)
    # NOTE(review): n_components is assigned but never read afterwards
    n_components = opt.s.shape[0]
    # rebuild the completed matrix from the fitted factors
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # subtract the column means, then the row means
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    adata.obsm['X_deicode'] = sc.tl.pca(X,
                                        svd_solver='arpack',
                                        n_comps=n_comps)
    return adata
Beispiel #6
0
def create_test_table():
    """Build a simulated block-model table and its rclr transform.

    Returns the table as a numpy array together with ``rclr`` applied to
    that array.
    """
    model_settings = dict(
        rank=2,
        hoced=20,
        hsced=20,
        spar=2e3,
        C_=2e3,
        num_samples=50,
        num_features=500,
        mapping_on=False,
    )
    # only the second element of the returned pair is needed
    _unused, simulated = build_block_model(**model_settings)

    # rclr itself is tested elsewhere; here it merely provides the
    # input for the OptSpace tests
    counts = np.array(simulated)
    return counts, rclr(counts)
Beispiel #7
0
def get_distance_matrix(otutabledf, index, method, rank, logger_ins, training_index):
	"""Return a ``DistanceMatrix`` between the samples of ``otutabledf``.

	With ``method == "deicode"`` the table is rclr-transformed, fitted
	with ``OptSpace(rank=rank, iteration=10, tol=1e-5)`` and Euclidean
	distances between the sample weights are used. Any other ``method``
	divides the table by ``np.sum(otutabledf, 1)`` and uses Bray-Curtis
	distances.

	``index`` only labels the intermediate loading frame and
	``training_index`` is not used. Log messages are sent through
	``logger_ins.info`` with %-style placeholders so that the extra
	values are rendered into the message.
	"""
	if method == "deicode":
		logger_ins.info("method is deicode, and the rank is %s", rank)
		rclr_obj = rclr()
		table_norm = rclr_obj.fit_transform(copy.deepcopy(otutabledf))
		opt = OptSpace(rank=rank, iteration=10, tol=1e-5).fit(table_norm)
		sample_loading = pd.DataFrame(opt.sample_weights, index=index)
		rapca = DistanceMatrix(distance.cdist(sample_loading.values, sample_loading.values, 'euclidean'))

	else:
		# NOTE(review): if otutabledf is a DataFrame this division aligns
		# the sums with the column labels -- confirm that is intended
		table_norm = np.asarray(otutabledf / np.sum(otutabledf, 1))
		logger_ins.info("Using the braycurtis way of calculating distances between samples")
		logger_ins.info("the shape of the normalized table is %s", table_norm.shape)
		logger_ins.info(
			"The sum of columns of the table is %s and the size is %s",
			set(list(table_norm.sum(1).squeeze())), table_norm.sum(1).shape)
		sample_loading = pd.DataFrame(table_norm, index=index)
		rapca = DistanceMatrix(pdist(sample_loading.values, 'braycurtis'))
	return rapca
Beispiel #8
0
 def test_rclr_nan_raises(self):
     """rclr must raise ValueError for input with missing values (nan)."""
     # callable form of assertRaises: rclr(self.bad3) has to raise
     self.assertRaises(ValueError, rclr, self.bad3)
Beispiel #9
0
 def test_rclr_inf_raises(self):
     """rclr must raise ValueError for input with undefined values."""
     # callable form of assertRaises: rclr(self.bad2) has to raise
     self.assertRaises(ValueError, rclr, self.bad2)
Beispiel #10
0
 def test_rclr_negative_raises(self):
     """rclr must raise ValueError for input with negative values."""
     # callable form of assertRaises: rclr(self.bad1) has to raise
     self.assertRaises(ValueError, rclr, self.bad1)
Beispiel #11
0
 def test_rclr_sparse(self):
     """Check the rclr output for data that contains zeros."""
     observed = rclr(self.cdata2)
     expected = self.true2
     npt.assert_allclose(observed, expected)
Beispiel #12
0
 def test_rclr_dense(self):
     """Check that rclr agrees with clr on data without zeros."""
     observed = rclr(self.cdata1)
     # clr receives its own copy of the fixture
     reference = clr(self.cdata1.copy())
     npt.assert_allclose(observed, reference)
Beispiel #13
0
def rpca(
    table: biom.Table,
    n_components: int = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA: rclr preprocessing followed by matrix completion.

       Shared by the standalone and the QIIME 2 versions of DEICODE.
       Raises ValueError when the filtered table has duplicate row or
       column labels.
    """

    def enough_feature_counts(val, id_, md):
        # a feature stays when its total count exceeds the threshold
        return sum(val) > min_feature_count

    def enough_sample_counts(val, id_, md):
        # a sample stays when its total count exceeds the threshold
        return sum(val) > min_sample_count

    # features are filtered first, then samples; samples end up as rows
    table = table.filter(enough_feature_counts, axis='observation')
    table = table.filter(enough_sample_counts, axis='sample')
    table = table.to_dataframe().T
    if len(set(table.index)) < len(table.index):
        raise ValueError('Data-table contains duplicate indices')
    if len(set(table.columns)) < len(table.columns):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    completion = MatrixCompletion(n_components=n_components,
                                  max_iterations=max_iterations)
    opt = completion.fit(rclr(table))

    pc_labels = ['PC{}'.format(k) for k in range(1, n_components + 1)]

    # completed matrix, centered by columns and then by rows
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)

    # factor the centered matrix again and keep the leading components
    left, singular, right_t = svd(X)
    squared = singular ** 2
    p = (squared / np.sum(squared))[:n_components]
    s = singular[:n_components]
    sample_loading = pd.DataFrame(left[:, :n_components],
                                  index=table.index, columns=pc_labels)
    feature_loading = pd.DataFrame(right_t.T[:, :n_components],
                                   index=table.columns, columns=pc_labels)

    # proportion of variance explained
    proportion_explained = pd.Series(p, index=pc_labels)
    # eigenvalues
    eigvals = pd.Series(s, index=pc_labels)

    # With two components an all-zero PC3 is appended; see
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # and the discussion in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * feature_loading.shape[0]
        sample_loading['PC3'] = [0] * sample_loading.shape[0]
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # ordination results
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # distance matrix labelled with the sample ids
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res
Beispiel #14
0
            ), :].copy().T
            #get the base truth data for that subset
            basetmp_sub = base_truth.loc[(
                rank_,
                power_,
                depth_,
            ), :].copy().T
            # sub sampled
            subtmp_sub = subtmp.copy()
            #meta on cluster
            meta = np.array([1] * int(subtmp.shape[0] / 2) +
                            [2] * int(subtmp.shape[0] / 2)).T
            meta = pd.DataFrame(meta, index=subtmp.index, columns=['group'])

            # test KL with rclr
            X_sparse = rclr().fit_transform(subtmp_sub.copy())
            U, s, V = OptSpace(iteration=1000).fit_transform(X_sparse)
            clr_res = clr_inv(np.dot(np.dot(U, s), V.T))
            # use just kl_div here because already closed
            kl_clr = entropy(closure(basetmp_sub).T, clr_res.T).mean()
            results[(rank_, power_, depth_, 'rclr', 'KL-Div')] = [kl_clr]

            # test KL without rclr
            X_spn = np.array(subtmp_sub.copy()).astype(float)
            X_spn[X_spn == 0] = np.nan
            U_, s_, V_ = OptSpace(iteration=1000).fit_transform(X_spn)
            res_raw = np.dot(np.dot(U_, s_), V_.T)
            res_raw[res_raw <= 0] = 1
            kl_raw = entropy(closure(basetmp_sub).T, closure(res_raw).T).mean()
            results[(rank_, power_, depth_, 'Raw Counts', 'KL-Div')] = [kl_raw]
Beispiel #15
0
    control_libs += list(abund_df3.index[abund_df3.index.str.contains("CDSBBR")])

    otus_to_strip_c, otus_to_strip_nc = set(), set()
    for c in control_libs:
        otus_to_strip_nc.update(abund_df3.columns[abund_df3.loc[c, :] > 0])
        otus_to_strip_c.update(abund_df5.columns[abund_df5.loc[c, :] > 0])
        print("{}/{} are to be removed".format(len(otus_to_strip_nc), len(otus_to_strip_c)))

    # remove OTUs in 50 unnecessary samples
    abund_df_c = abund_df5.loc[:, ~abund_df5.columns.isin(otus_to_strip_c)]
    abund_df_c2 = abund_df_c[abund_df_c.sum(1)!=0]
    abund_df_nc = abund_df3.loc[:, ~abund_df3.columns.isin(otus_to_strip_nc)]
    abund_df_nc2 = abund_df_nc[abund_df_nc.sum(1)!=0]

    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, 3)}
    rclr_mat = rclr().fit_transform(abund_df_c2.values)
    rclr_mat2 = rclr().fit_transform(abund_df_nc2.values)
    opt=OptSpace(rank=2).fit(rclr_mat)
    # PCA of new mat
    feature_loading = pd.DataFrame(opt.feature_weights, index=abund_df_c2.columns).rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)
    sample_loading = pd.DataFrame(opt.sample_weights, index=abund_df_c2.index).rename(columns=rename_cols)
    # PCA of old mat
    opt2=OptSpace(rank=2).fit(rclr_mat2)
    fl_clean = pd.DataFrame(opt2.feature_weights, index=abund_df_nc2.columns).rename(columns=rename_cols)
    fl_clean.sort_values('PC1', inplace=True, ascending=True)
    sl_clean = pd.DataFrame(opt2.sample_weights, index=abund_df_nc2.index).rename(columns=rename_cols)
    sub_meta_c_st = meta_df.loc[abund_df_c2.index, 'station'].tolist()
    sub_meta_c_date = meta_df.loc[abund_df_c2.index, 'date'].tolist()
    sub_meta_c = meta_df.loc[abund_df_c2.index, 'run'].tolist()
    sub_meta_nc = meta_df.loc[abund_df_nc2.index, 'run'].tolist()
Beispiel #16
0
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix, pd.DataFrame):
    """Runs RPCA with an rclr preprocessing step.

       This code will be run by both the standalone and QIIME 2 versions of
       DEICODE.

       Parameters
       ----------
       table : biom.Table
           Count table; it is filtered before the transform.
       n_components : int or str
           Passed to ``MatrixCompletion``; after fitting it is replaced
           by the number of components actually held by the fit.
       min_sample_count : int
           Samples are kept when their total count is strictly greater
           than this value.
       min_feature_count : int
           Features are kept when their total count is strictly greater
           than this value.
       min_feature_frequency : float
           Percentage; a feature is kept when the fraction of samples in
           which it is non-zero (relative to the number of samples of
           the unfiltered table) is strictly greater than this value
           divided by 100.
       max_iterations : int
           Passed to ``MatrixCompletion``.

       Returns
       -------
       ord_res : skbio.OrdinationResults
           Sample and feature loadings, eigenvalues and proportion
           explained.
       dist_res : skbio.DistanceMatrix
           ``opt.distance`` labelled with the sample ids.
       robust_clr : pd.DataFrame
           The completed matrix after re-centering, samples as rows.

       Raises
       ------
       ValueError
           If the filtered table has duplicate row or column labels.
    """
    # get shape of table
    n_features, n_samples = table.shape

    # filter sample to min seq. depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter features by N samples presence
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # filter and import table for each filter above
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(frequency_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T
    # check the table after filtering
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')
    # Robust-clr (rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    # save the loadings
    robust_clr = pd.DataFrame(X, index=table.index, columns=table.columns)
    feature_loading = pd.DataFrame(v, index=table.columns, columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index, columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    # three values are returned, matching the annotated 3-tuple
    return ord_res, dist_res, robust_clr