def rpca(in_biom: str, output_dir: str,
         min_sample_depth: int, rank: int) -> None:
    """Run RPCA with an rclr preprocessing step and write the results.

    Parameters
    ----------
    in_biom : str
        Path handed to ``load_table`` to read the input table.
    output_dir : str
        Directory that receives ``rclr_OTUtable.txt``,
        ``RPCA_Ordination.txt`` and ``RPCA_distance.txt``.
    min_sample_depth : int
        Threshold used by the sample filter
        (``sum(val) > min_sample_depth``).
    rank : int
        Rank passed to ``OptSpace`` for the ordination; also the number of
        ``PC`` labels that are generated.

    Returns
    -------
    None
        Every result is written into ``output_dir``.
    """
    # import table
    table = load_table(in_biom)

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_depth

    table = table.filter(sample_filter, axis='sample')
    # transpose the dataframe and drop duplicated rows
    table = table.to_dataframe().T.drop_duplicates()

    # rclr for saving the transformed OTU table (RSC edited)
    # this first OptSpace run uses the default settings, not ``rank``
    tablefit = rclr().fit_transform(table.copy())
    U, s, V = OptSpace().fit_transform(tablefit)
    tablefit = np.dot(np.dot(U, s), V.T)
    tablefit = pd.DataFrame(tablefit.T,
                            index=table.columns,
                            columns=table.index)
    # to_csv opens and closes the target path on its own; wrapping it in a
    # second, unused ``open(..., 'w')`` handle on the same file is
    # unnecessary, so the path is passed directly.
    tablefit.to_csv(os.path.join(output_dir, 'rclr_OTUtable.txt'),
                    sep='\t', index_label='OTU_ID')

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank).fit(rclr().fit_transform(table.copy()))
    # maps the positional column labels 0..rank-1 to 'PC1'..'PC<rank>'
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(rename_cols.values()))
    eigvals = pd.Series(opt.eigenvalues,
                        index=list(rename_cols.values()))

    # save ordination results
    ord_res = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals.copy(),
        sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # write files to output folder
    ord_res.write(os.path.join(output_dir, 'RPCA_Ordination.txt'))

    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)
    dist_res.write(os.path.join(output_dir, 'RPCA_distance.txt'))
def rpca(table: biom.Table,
         rank: int = 3,
         min_sample_count: int = 500,
         min_feature_count: int = 10,
         iterations: int = 5) -> (
        skbio.OrdinationResults,
        skbio.DistanceMatrix):
    """Run RPCA (rclr transform followed by OptSpace) on a table.

    Parameters
    ----------
    table : biom.Table
        Input table; it is filtered per sample and converted to a
        dataframe before the analysis.
    rank : int
        Rank passed to ``OptSpace``; also the number of ``PC`` labels.
    min_sample_count : int
        Threshold of the sample filter (``sum(val) > min_sample_count``).
    min_feature_count : int
        Columns of the transposed dataframe are kept only when their sum
        is greater than this value.
    iterations : int
        Forwarded to ``OptSpace`` as ``iteration``.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination and the distance matrix built from
        ``opt.distance``.
    """
    # keep only the samples that pass the depth threshold
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T.drop_duplicates()
    # keep only the columns whose total passes the feature threshold
    passes_feature_count = table.sum() > min_feature_count
    table = table.T[passes_feature_count].T

    # rclr preprocessing and OptSpace (RPCA)
    solver = OptSpace(rank=rank, iteration=iterations)
    opt = solver.fit(rclr().fit_transform(table.copy()))

    # 'PC1'..'PC<rank>' and the positional -> label mapping for rename
    pc_names = ['PC' + str(k) for k in range(1, rank + 1)]
    positional_to_pc = dict(enumerate(pc_names))

    # feature loadings, ordered by their first component
    feature_loading = pd.DataFrame(
        opt.feature_weights,
        index=table.columns).rename(columns=positional_to_pc)
    feature_loading = feature_loading.sort_values('PC1', ascending=True)

    # sample loadings
    sample_loading = pd.DataFrame(
        opt.sample_weights,
        index=table.index).rename(columns=positional_to_pc)

    # proportion of variance explained and eigenvalues, labelled per PC
    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(pc_names))
    eigvals = pd.Series(opt.eigenvalues, index=list(pc_names))

    # with a rank of two, pad every result with an all-zero PC3
    if rank == 2:
        for loadings in (feature_loading, sample_loading):
            loadings['PC3'] = [0] * len(loadings.index)
        for per_axis in (eigvals, proportion_explained):
            per_axis.loc['PC3'] = 0

    # assemble the ordination results
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())

    # assemble the distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res
def tensor_rclr(T):
    """Apply the rclr transform to a three-way array.

    Each slice ``T[i, :, :]`` is transposed and the slices are laid one
    below the other in a single two-dimensional matrix, which is passed
    through ``rclr().fit_transform``.  The transformed matrix is then cut
    back into one block of ``T.shape[-1]`` rows per original slice and the
    blocks are stacked with ``np.dstack``.  Every NaN in the result is
    replaced by zero.

    Parameters
    ----------
    T : array
        Array that is indexed as ``T[i, :, :]`` and exposes ``shape``.

    Returns
    -------
    array
        The ``np.dstack`` of the per-slice rclr blocks, with NaN set to 0.
    """
    n_slices = T.shape[0]
    rows_per_block = T.shape[-1]

    # flatten: transposed slices, concatenated row-wise
    stacked = np.concatenate(
        [T[k, :, :].T for k in range(n_slices)], axis=0)

    # transform the flattened matrix in one call
    transformed = rclr().fit_transform(stacked)

    # reshape: consecutive row blocks, one per original slice
    blocks = [
        transformed[k * rows_per_block:(k + 1) * rows_per_block]
        for k in range(n_slices)
    ]
    result = np.dstack(blocks)

    result[np.isnan(result)] = 0
    return result
def setUp(self):
    """Create the fixtures used by the rclr tests."""
    # count data without zeros
    self.cdata1 = np.array([[2, 2, 6],
                            [4, 4, 2]])
    # count data with zeros, and the values expected for it
    self.cdata2 = [[3, 3, 0],
                   [0, 4, 2]]
    self.true2 = np.array([[0.0, 0.0, np.nan],
                           [np.nan, 0.34657359, -0.34657359]])
    # input containing a negative value
    self.bad1 = np.array([1, 2, -1])
    # transformer instances under test
    self._rclr = rclr()
    self._inv = inverse_rclr()
def rpca(adata):
    """Store a DEICODE-based embedding in ``adata.obsm['X_deicode']``.

    The function filters genes on ``adata``, applies ``rclr`` to
    ``adata.raw.X``, completes the matrix with ``MatrixCompletion``,
    centers the completed matrix along both axes and passes it to
    ``sc.tl.pca``.

    Parameters
    ----------
    adata
        Object exposing ``raw.X`` and ``obsm``; it is modified in place.

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    from deicode.matrix_completion import MatrixCompletion
    from deicode.preprocessing import rclr

    # NOTE(review): ``n_samples`` and ``n_comps`` are not bound anywhere in
    # this function; they have to come from an enclosing/module scope --
    # confirm that they exist where this function is defined.
    min_samples = max(3, np.floor(n_samples * 0.1))
    sc.pp.filter_genes(adata, min_cells=min_samples)

    # rclr transform followed by matrix completion
    transformed = rclr(adata.raw.X)
    completer = MatrixCompletion(n_components=n_comps, max_iterations=10)
    opt = completer.fit(transformed)
    # NOTE(review): assigned but not used below (``n_comps`` is passed to
    # the PCA call instead) -- confirm which one is intended.
    n_components = opt.s.shape[0]

    # rebuild the completed matrix, then center columns and rows
    completed = opt.sample_weights @ opt.s @ opt.feature_weights.T
    column_centered = completed - completed.mean(axis=0)
    row_means = column_centered.mean(axis=1).reshape(-1, 1)
    centered = column_centered - row_means

    adata.obsm['X_deicode'] = sc.tl.pca(
        centered, svd_solver='arpack', n_comps=n_comps)
    return adata
def create_test_table():
    """Build a simulated table and its rclr transform for the tests.

    Returns
    -------
    (numpy.ndarray, object)
        The table produced by ``build_block_model`` converted with
        ``np.array``, and the result of calling ``rclr`` on it.
    """
    model_settings = dict(rank=2,
                          hoced=20,
                          hsced=20,
                          spar=2e3,
                          C_=2e3,
                          num_samples=50,
                          num_features=500,
                          mapping_on=False)
    # only the second element of the returned pair is needed
    _, simulated = build_block_model(**model_settings)
    raw_counts = np.array(simulated)

    # rclr itself is covered by other tests; here its output only serves
    # as the input of the OptSpace tests
    transformed = rclr(raw_counts)
    return raw_counts, transformed
def get_distance_matrix(otutabledf, index, method, rank, logger_ins,
                        training_index):
    """Compute a sample-by-sample distance matrix from an OTU table.

    Parameters
    ----------
    otutabledf
        Table of counts with one row per sample (rows are summed to
        normalise in the Bray-Curtis branch).
    index
        Row labels given to the intermediate ``sample_loading`` dataframe.
        They are not passed to ``DistanceMatrix`` as ids.
    method : str
        ``"deicode"`` selects rclr + OptSpace followed by Euclidean
        distances between the sample loadings; any other value selects
        Bray-Curtis distances on the row-normalised table.
    rank : int
        Rank passed to ``OptSpace`` (deicode branch only).
    logger_ins
        Logger used for progress messages.  The calls below use
        ``%s`` placeholders with separate arguments -- assumes a
        ``logging.Logger``-compatible ``info``; confirm against the caller.
    training_index
        Not used by this function.

    Returns
    -------
    DistanceMatrix
        Distances between the rows of the table.
    """
    if method == "deicode":
        # Extra positional arguments are only rendered by the logging
        # module when the message contains matching placeholders.
        logger_ins.info("method is deicode, and the rank is %s", rank)
        rclr_obj = rclr()
        # deep copy so the transform cannot modify the caller's table
        table_norm = rclr_obj.fit_transform(copy.deepcopy(otutabledf))
        opt = OptSpace(rank=rank, iteration=10, tol=1e-5).fit(table_norm)
        sample_loading = pd.DataFrame(opt.sample_weights, index=index)
        rapca = DistanceMatrix(distance.cdist(sample_loading.values,
                                              sample_loading.values,
                                              'euclidean'))
    else:
        # Normalise every row by its own total.  Working on a plain
        # ndarray with keepdims avoids the label alignment pandas applies
        # when a DataFrame is divided by a Series (which matches the
        # Series index against the *columns*), and the broadcasting
        # failure a 1-D row-sum vector causes for a plain 2-D array.
        counts = np.asarray(otutabledf, dtype=float)
        table_norm = counts / counts.sum(axis=1, keepdims=True)
        row_totals = table_norm.sum(1)
        logger_ins.info(
            "Using the braycurtis way of calculating distances "
            "between samples")
        logger_ins.info("the shape of the normalized table is %s",
                        table_norm.shape)
        logger_ins.info(
            "The sum of columns of the table is %s and the size is %s",
            set(row_totals.squeeze().tolist()), row_totals.shape)
        sample_loading = pd.DataFrame(table_norm, index=index)
        # pdist returns the condensed (1-D) form of the distances
        rapca = DistanceMatrix(pdist(sample_loading.values, 'braycurtis'))
    return rapca
def test_rclr_nan_raises(self):
    """rclr must raise ValueError for the ``bad3`` (missing/nan) fixture."""
    # callable form of assertRaises: rclr(self.bad3) has to raise
    self.assertRaises(ValueError, rclr, self.bad3)
def test_rclr_inf_raises(self):
    """rclr must raise ValueError for the ``bad2`` (undefined) fixture."""
    # callable form of assertRaises: rclr(self.bad2) has to raise
    self.assertRaises(ValueError, rclr, self.bad2)
def test_rclr_negative_raises(self):
    """rclr must raise ValueError for the ``bad1`` (negative) fixture."""
    # callable form of assertRaises: rclr(self.bad1) has to raise
    self.assertRaises(ValueError, rclr, self.bad1)
def test_rclr_sparse(self):
    """rclr of the zero-containing fixture matches the stored values."""
    # cdata2 contains zeros; true2 holds the values expected for it
    observed = rclr(self.cdata2)
    expected = self.true2
    npt.assert_allclose(observed, expected)
def test_rclr_dense(self):
    """rclr and clr agree on the zero-free fixture."""
    # without zeros the two transforms are expected to coincide
    observed = rclr(self.cdata1)
    # clr receives a copy of the fixture
    reference = clr(self.cdata1.copy())
    npt.assert_allclose(observed, reference)
def rpca(table: biom.Table,
         n_components: int = DEFAULT_RANK,
         min_sample_count: int = DEFAULT_MSC,
         min_feature_count: int = DEFAULT_MFC,
         max_iterations: int = DEFAULT_ITERATIONS) -> (
        skbio.OrdinationResults,
        skbio.DistanceMatrix):
    """Run RPCA: rclr transform, matrix completion, then an SVD.

    This code will be run by both the standalone and QIIME 2 versions of
    DEICODE.

    Parameters
    ----------
    table : biom.Table
        Input table; filtered per observation and per sample first.
    n_components : int
        Passed to ``MatrixCompletion`` and used as the number of
        components kept from the SVD.
    min_sample_count : int
        Threshold of the sample filter (``sum(val) > min_sample_count``).
    min_feature_count : int
        Threshold of the observation filter
        (``sum(val) > min_feature_count``).
    max_iterations : int
        Passed to ``MatrixCompletion``.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination and the distance matrix built from
        ``opt.distance``.

    Raises
    ------
    ValueError
        If the filtered dataframe has repeated index or column labels.
    """
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # apply the filters in order, then import the table as a dataframe
    for keep, axis in ((observation_filter, 'observation'),
                       (sample_filter, 'sample')):
        table = table.filter(keep, axis=axis)
    table = table.to_dataframe().T

    # labels must be unique on both axes
    label_checks = (
        (table.index, 'Data-table contains duplicate indices'),
        (table.columns, 'Data-table contains duplicate columns'),
    )
    for labels, message in label_checks:
        if len(labels) != len(set(labels)):
            raise ValueError(message)

    # rclr preprocessing and OptSpace (RPCA)
    completer = MatrixCompletion(n_components=n_components,
                                 max_iterations=max_iterations)
    opt = completer.fit(rclr(table))
    pc_names = ['PC' + str(k) for k in range(1, n_components + 1)]

    # completed matrix, centered along columns and then along rows
    completed = opt.sample_weights @ opt.s @ opt.feature_weights.T
    completed = completed - completed.mean(axis=0)
    completed = completed - completed.mean(axis=1).reshape(-1, 1)

    # factor the centered matrix and keep the leading components
    left, singular, right_t = svd(completed)
    sample_axes = left[:, :n_components]
    feature_axes = right_t.T[:, :n_components]
    squared = singular ** 2
    explained = (squared / np.sum(squared))[:n_components]
    singular = singular[:n_components]

    feature_loading = pd.DataFrame(feature_axes,
                                   index=table.columns,
                                   columns=pc_names)
    sample_loading = pd.DataFrame(sample_axes,
                                  index=table.index,
                                  columns=pc_names)
    # % var explained
    proportion_explained = pd.Series(explained, index=pc_names)
    # values stored as eigenvalues in the ordination
    eigvals = pd.Series(singular, index=pc_names)

    # if n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        for loadings in (feature_loading, sample_loading):
            loadings['PC3'] = [0] * len(loadings.index)
        for per_axis in (eigvals, proportion_explained):
            per_axis.loc['PC3'] = 0

    # assemble the ordination results
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())

    # assemble the distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res
), :].copy().T #get the base truth data for that subset basetmp_sub = base_truth.loc[( rank_, power_, depth_, ), :].copy().T # sub sampled subtmp_sub = subtmp.copy() #meta on cluster meta = np.array([1] * int(subtmp.shape[0] / 2) + [2] * int(subtmp.shape[0] / 2)).T meta = pd.DataFrame(meta, index=subtmp.index, columns=['group']) # test KL with rcl X_sparse = rclr().fit_transform(subtmp_sub.copy()) U, s, V = OptSpace(iteration=1000).fit_transform(X_sparse) clr_res = clr_inv(np.dot(np.dot(U, s), V.T)) # use just kl_div here because already closed kl_clr = entropy(closure(basetmp_sub).T, clr_res.T).mean() results[(rank_, power_, depth_, 'rclr', 'KL-Div')] = [kl_clr] # test KL without rclr X_spn = np.array(subtmp_sub.copy()).astype(float) X_spn[X_spn == 0] = np.nan U_, s_, V_ = OptSpace(iteration=1000).fit_transform(X_spn) res_raw = np.dot(np.dot(U_, s_), V_.T) res_raw[res_raw <= 0] = 1 kl_raw = entropy(closure(basetmp_sub).T, closure(res_raw).T).mean() results[(rank_, power_, depth_, 'Raw Counts', 'KL-Div')] = [kl_raw]
# Extend the control libraries with every abund_df3 row label that
# contains "CDSBBR".
control_libs += list(abund_df3.index[abund_df3.index.str.contains("CDSBBR")])
# Columns (OTUs) with a positive value in any control library, collected
# separately for abund_df5 ("c") and abund_df3 ("nc").
otus_to_strip_c, otus_to_strip_nc = set(), set()
for c in control_libs:
    otus_to_strip_nc.update(abund_df3.columns[abund_df3.loc[c, :] > 0])
    otus_to_strip_c.update(abund_df5.columns[abund_df5.loc[c, :] > 0])

# NOTE(review): assumed to run once after the loop -- confirm against the
# original layout.
print("{}/{} are to be removed".format(len(otus_to_strip_nc), len(otus_to_strip_c)))
# Drop the collected OTU columns, then drop rows whose remaining total is 0.
abund_df_c = abund_df5.loc[:, ~abund_df5.columns.isin(otus_to_strip_c)]
abund_df_c2 = abund_df_c[abund_df_c.sum(1)!=0]
abund_df_nc = abund_df3.loc[:, ~abund_df3.columns.isin(otus_to_strip_nc)]
abund_df_nc2 = abund_df_nc[abund_df_nc.sum(1)!=0]
# Maps the positional column labels 0 and 1 to 'PC1' and 'PC2'.
rename_cols = {i - 1: 'PC' + str(i) for i in range(1, 3)}
# rclr transform of both filtered tables.
rclr_mat = rclr().fit_transform(abund_df_c2.values)
rclr_mat2 = rclr().fit_transform(abund_df_nc2.values)
# Rank-2 OptSpace fit of the table derived from abund_df5.
opt=OptSpace(rank=2).fit(rclr_mat)
# PCA of new mat
feature_loading = pd.DataFrame(opt.feature_weights, index=abund_df_c2.columns).rename(columns=rename_cols)
feature_loading.sort_values('PC1', inplace=True, ascending=True)
sample_loading = pd.DataFrame(opt.sample_weights, index=abund_df_c2.index).rename(columns=rename_cols)
# PCA of old mat (rank-2 OptSpace fit of the table derived from abund_df3)
opt2=OptSpace(rank=2).fit(rclr_mat2)
fl_clean = pd.DataFrame(opt2.feature_weights, index=abund_df_nc2.columns).rename(columns=rename_cols)
fl_clean.sort_values('PC1', inplace=True, ascending=True)
sl_clean = pd.DataFrame(opt2.sample_weights, index=abund_df_nc2.index).rename(columns=rename_cols)
# Metadata columns looked up for the rows kept in each filtered table.
sub_meta_c_st = meta_df.loc[abund_df_c2.index, 'station'].tolist()
sub_meta_c_date = meta_df.loc[abund_df_c2.index, 'date'].tolist()
sub_meta_c = meta_df.loc[abund_df_c2.index, 'run'].tolist()
sub_meta_nc = meta_df.loc[abund_df_nc2.index, 'run'].tolist()
def rpca(table: biom.Table,
         n_components: Union[int, str] = DEFAULT_RANK,
         min_sample_count: int = DEFAULT_MSC,
         min_feature_count: int = DEFAULT_MFC,
         min_feature_frequency: float = DEFAULT_MFF,
         max_iterations: int = DEFAULT_ITERATIONS) -> (
        skbio.OrdinationResults,
        skbio.DistanceMatrix):
    """Run RPCA: rclr transform, matrix completion, then an SVD.

    This code will be run by both the standalone and QIIME 2 versions of
    DEICODE.

    Parameters
    ----------
    table : biom.Table
        Input table; filtered per observation and per sample first.
    n_components : int or str
        Passed to ``MatrixCompletion``.  After the fit it is replaced by
        ``opt.s.shape[0]``, which is the number of components kept.
    min_sample_count : int
        Threshold of the sample filter (``sum(val) > min_sample_count``).
    min_feature_count : int
        Threshold of the observation filter
        (``sum(val) > min_feature_count``).
    min_feature_frequency : float
        Percentage; an observation is kept when the fraction of its
        positive entries, relative to the sample count of the unfiltered
        table, exceeds ``min_feature_frequency / 100``.
    max_iterations : int
        Passed to ``MatrixCompletion``.

    Returns
    -------
    tuple
        ``(ord_res, dist_res, robust_clr)``: the biplot ordination, the
        distance matrix built from ``opt.distance`` and the centered
        completed matrix as a dataframe.
        NOTE(review): the return annotation lists only the first two
        values -- confirm which of the two is intended before changing
        either, since the annotation may be read by plugin registration.

    Raises
    ------
    ValueError
        If the filtered dataframe has repeated index or column labels.
    """
    # number of samples of the table as received, before any filtering
    _, n_samples = table.shape

    # minimum sequencing depth per sample
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # minimum total count per feature
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # minimum share of samples in which a feature is present
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # apply the filters in order, then import the table as a dataframe
    filter_plan = (
        (observation_filter, 'observation'),
        (frequency_filter, 'observation'),
        (sample_filter, 'sample'),
    )
    for keep, axis in filter_plan:
        table = table.filter(keep, axis=axis)
    table = table.to_dataframe().T

    # labels must be unique on both axes after filtering
    label_checks = (
        (table.index, 'Data-table contains duplicate indices'),
        (table.columns, 'Data-table contains duplicate columns'),
    )
    for labels, message in label_checks:
        if len(labels) != len(set(labels)):
            raise ValueError(message)

    # robust-clr (rclr) preprocessing and OptSpace (RPCA)
    completer = MatrixCompletion(n_components=n_components,
                                 max_iterations=max_iterations)
    opt = completer.fit(rclr(table))
    # take the number of components from the fitted model
    n_components = opt.s.shape[0]
    # PC column labels for the skbio OrdinationResults
    pc_names = ['PC' + str(k) for k in range(1, n_components + 1)]

    # completed matrix, centered along columns and then along rows
    completed = opt.sample_weights @ opt.s @ opt.feature_weights.T
    completed = completed - completed.mean(axis=0)
    completed = completed - completed.mean(axis=1).reshape(-1, 1)

    # re-factor the centered matrix and keep the leading components
    left, singular, right_t = svd(completed)
    sample_axes = left[:, :n_components]
    feature_axes = right_t.T[:, :n_components]
    # share of the squared singular values taken by each kept component
    squared = singular ** 2
    explained = (squared / np.sum(squared))[:n_components]
    singular = singular[:n_components]

    # centered completed matrix and the loadings, as labelled dataframes
    robust_clr = pd.DataFrame(completed,
                              index=table.index,
                              columns=table.columns)
    feature_loading = pd.DataFrame(feature_axes,
                                   index=table.columns,
                                   columns=pc_names)
    sample_loading = pd.DataFrame(sample_axes,
                                  index=table.index,
                                  columns=pc_names)
    # % var explained
    proportion_explained = pd.Series(explained, index=pc_names)
    # values stored as eigenvalues in the ordination
    eigvals = pd.Series(singular, index=pc_names)

    # if n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        for loadings in (feature_loading, sample_loading):
            loadings['PC3'] = [0] * len(loadings.index)
        for per_axis in (eigvals, proportion_explained):
            per_axis.loc['PC3'] = 0

    # assemble the ordination results
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())

    # assemble the distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res, robust_clr