def center_log_ratio(exp: Experiment, method=lambda matrix: matrix + 1,
                     centralize=False, inplace=False):
    """Apply the centered log-ratio (clr) transform to the experiment data.

    Parameters
    ----------
    exp : Experiment
        The experiment whose ``data`` matrix is transformed.
    method : callable, optional
        Called on the data matrix before the transform to deal with zeros.
        The default adds a pseudocount of 1 to every entry.
    centralize : bool, optional
        If True, the pseudocounted matrix is passed through
        ``skbio.stats.composition.centralize`` before the clr transform.
    inplace : bool, optional
        False (default) to work on a deep copy of ``exp``, True to modify
        ``exp`` itself.

    Returns
    -------
    Experiment
        The transformed experiment (``exp`` itself when ``inplace`` is True).

    See Also
    --------
    skbio.stats.composition.clr
    skbio.stats.composition.centralize
    """
    from skbio.stats.composition import clr, centralize as skbio_centralize

    logger.debug('clr transforming the data')
    target = exp if inplace else deepcopy(exp)
    # The sparse flag is cleared before the data is read.
    # NOTE(review): presumably this densifies the matrix - confirm in Experiment.
    if target.sparse:
        target.sparse = False
    shifted = method(target.data)
    if centralize:
        shifted = skbio_centralize(shifted)
    target.data = clr(shifted)
    return target
def test_clr(self):
    """clr matches a hand-computed reference and rejects invalid input."""

    def reference_clr(parts):
        # log of every part over the geometric mean of the composition
        return np.log(parts / np.exp(np.log(parts).mean()))

    first = np.array([.2, .2, .6])
    second = np.array([.4, .4, .2])
    two_rows = [reference_clr(first), reference_clr(second)]

    npt.assert_allclose(clr(closure(self.data1)), two_rows)
    npt.assert_allclose(clr(closure(self.data2)), reference_clr(first))
    npt.assert_allclose(clr(closure(self.data5)), two_rows)

    for invalid in (self.bad1, self.bad2):
        with self.assertRaises(ValueError):
            clr(invalid)

    # the input must come back untouched after a call
    clr(self.data2)
    npt.assert_allclose(self.data2, np.array([2, 2, 6]))
def test_clr(self):
    """Compare clr against the closed-form log(x / geometric_mean(x))."""
    comp_a = np.array([.2, .2, .6])
    comp_b = np.array([.4, .4, .2])
    want_a = np.log(comp_a / np.exp(np.log(comp_a).mean()))
    want_b = np.log(comp_b / np.exp(np.log(comp_b).mean()))

    # two-row input
    result = clr(closure(self.cdata1))
    npt.assert_allclose(result, [want_a, want_b])

    # single composition
    result = clr(closure(self.cdata2))
    npt.assert_allclose(result, want_a)

    # second two-row fixture, same expected values
    result = clr(closure(self.cdata5))
    npt.assert_allclose(result, [want_a, want_b])

    with self.assertRaises(ValueError):
        clr(self.bad1)
    with self.assertRaises(ValueError):
        clr(self.bad2)

    # clr must not modify its argument
    clr(self.cdata2)
    npt.assert_allclose(self.cdata2, np.array([2, 2, 6]))
def test_convert_beta_coordinates():
    """convert_beta_coordinates turns stacked alr draws into clr draws."""
    # Total: (n draws x p covariates x d features)
    # Each draw: (p covariates x d features)
    draws = (
        np.array([[0.1, 0.2, 0.3, 0.4],
                  [0.3, 0.1, 0.1, 0.5],
                  [0.2, 0.2, 0.2, 0.3],
                  [0.5, 0.1, 0.2, 0.2]]),
        np.array([[0.2, 0.2, 0.3, 0.3],
                  [0.1, 0.6, 0.2, 0.1],
                  [0.4, 0.4, 0.1, 0.1],
                  [0.1, 0.1, 0.1, 0.7]]),
    )

    stacked_alr = np.stack([alr(draw) for draw in draws])  # 2 x 4 x 3
    converted = util.convert_beta_coordinates(stacked_alr)  # 2 x 4 x 4
    stacked_clr = np.stack([clr(draw) for draw in draws])
    np.testing.assert_array_almost_equal(converted, stacked_clr)

    # every converted coordinate vector must sum to zero over the features
    feature_sums = converted.sum(axis=2)
    np.testing.assert_array_almost_equal(np.zeros((2, 4)), feature_sums)
def normalize_transform(self, mode='clr'):
    """
    Some operations may require transformed data.
    This function performs normalization and a log-ratio transform
    on all OTU tables in a Batch object.
    It returns a deep copy of the original Batch object,
    so the original file is not modified.

    Any exception raised while transforming (including the ValueError for
    an unsupported ``mode``) is logged and swallowed; the copy is returned
    in whatever state it had reached at that point.

    :param mode: transformation mode; 'clr' (centered log-ratio) or 'ilr' (isometric log-ratio)
    :return: Transformed copy of Batch object.
    """
    batchcopy = copy.deepcopy(self)
    try:
        for x in list(self.otu):
            # normalizes the data by samples
            normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
            mat = csr_matrix.toarray(normbiom.matrix_data)
            # replaces all zeros with a small value
            # multiplicative replacement preserves ratios between values
            # NOTE(review): the skbio functions treat each row of `mat` as one
            # composition - confirm the orientation of matrix_data is as intended.
            mat = multiplicative_replacement(mat)
            # Compare by value: `is` on string literals only works when both
            # strings happen to be interned.
            if mode == 'clr':
                mat = clr(mat)
            elif mode == 'ilr':
                mat = ilr(mat)
            else:
                raise ValueError("Only CLR and ILR transformations are currently supported.")
            normbiom._data = csc_matrix(mat)
            batchcopy.otu[x] = normbiom
    except Exception:
        logger.error("Failed to normalize data", exc_info=True)
    return batchcopy
def clr_wrapper(state: PipelineState):
    """Return the pipeline state with its dataframe clr-transformed.

    A pseudocount of 0.5 is added to every value first, because clr fails
    on zeros. Index and column labels are carried over unchanged.
    """
    frame = state.df
    shifted = frame.to_numpy() + .5
    transformed = pd.DataFrame(
        data=clr(shifted),
        index=frame.index,
        columns=frame.columns,
    )
    return state.update_df(transformed)
def test_build(self):
    """Rebuild a tensor from a flattened table plus mapping and compare."""
    # flatten the reference tensor into a (9 x 2) matrix
    swapped = self.tensor_true.transpose([0, 2, 1])
    swapped_shape = swapped.shape
    flat_counts = swapped.reshape(9, 2)

    # mapping and table dataframes used to rebuild the tensor
    id_rows = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 2],
                        [0, 1, 2, 0, 1, 2, 0, 1, 2]])
    mapping = pd.DataFrame(id_rows.T, columns=['ID', 'conditional'])
    table = pd.DataFrame(flat_counts.T)

    # rebuild the tensor
    rebuilt = build()
    rebuilt.construct(table, mapping, 'ID', ['conditional'])

    # the rebuilt counts must equal the starting tensor
    npt.assert_allclose(rebuilt.counts, self.tensor_true.astype(float))

    # ordering must be correct in every dimension
    self.assertListEqual(rebuilt.subject_order, list(range(3)))
    self.assertListEqual(rebuilt.feature_order, list(range(2)))
    self.assertListEqual(rebuilt.condition_orders[0], list(range(3)))

    # clr of the flattened matrix, folded back into tensor layout,
    # must match tensor_rclr applied to the tensor
    expected = clr(flat_counts).reshape(swapped_shape).transpose([0, 2, 1])
    npt.assert_allclose(tensor_rclr(rebuilt.counts), expected)
def test_biplot(self):
    """The sample/feature factors of the biplot approximate the clr betas."""
    expected = clr(centralize(clr_inv(self.beta)))
    ordination = regression_biplot(self.beta)
    self.assertIsInstance(ordination, OrdinationResults)

    sample_scores = ordination.samples.values
    feature_loadings = ordination.features.values.T
    # loose tolerances: only an approximate reconstruction is required
    npt.assert_allclose(sample_scores @ feature_loadings, np.array(expected),
                        atol=0.5, rtol=0.5)
def clrdata(data):
    """Return a clr-transformed copy of ``data``, transforming each column.

    Zero entries of ``data`` are overwritten with 1 *in place* before the
    transform, so the caller's array is modified. Every column is then
    passed to ``clr`` on its own and written into a new float array of the
    same shape, which is returned.
    """
    logger.debug('clr transforming data')
    # pseudocount: clr cannot take the log of zero
    data[data == 0] = 1
    n_rows, n_cols = np.shape(data)[0], np.shape(data)[1]
    transformed = np.zeros(np.shape(data))
    for col in range(n_cols):
        transformed[:, col] = clr(data[:, col])
    return transformed
def pls_balances_cmd(table_file, metadata_file, category, output_file):
    """Fit a one-component PLS model and write the selected feature ids.

    Parameters
    ----------
    table_file : str
        Path handed to ``load_table``; the loaded table is converted to a
        dense samples x observations dataframe.
    metadata_file : str
        Tab-separated metadata file; its first column becomes the index.
    category : str
        Metadata column used as the response. If its dtype is not float,
        it is turned into a 0/1 indicator of the first unique value.
    output_file : str
        Destination file; receives the selected feature ids, comma-joined.
    """
    metadata = pd.read_table(metadata_file, index_col=0)
    table = load_table(table_file)
    table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                         index=table.ids(axis='sample'),
                         columns=table.ids(axis='observation'))

    # pseudocount of 1 so that the log-ratio transform is defined on zeros
    ctable = pd.DataFrame(clr(centralize(table + 1)),
                          index=table.index, columns=table.columns)

    rfc = PLSRegression(n_components=1)
    # `np.float` / `np.int` were aliases of the builtins and were removed
    # in NumPy 1.24; the builtins keep the exact same meaning.
    if metadata[category].dtype != float:
        cats = np.unique(metadata[category])
        groups = (metadata[category] == cats[0]).astype(int)
    else:
        groups = metadata[category]
    # NOTE(review): metadata rows are not re-ordered to match the table's
    # samples here - confirm the two files share the same sample order.
    rfc.fit(X=ctable.values, Y=groups)

    pls_df = pd.DataFrame(rfc.x_weights_, index=ctable.columns,
                          columns=['PLS1'])
    # round_balance returns the lower/upper thresholds on the PLS weights
    lower, upper = round_balance(pls_df.values,
                                 means_init=[[pls_df.PLS1.min()],
                                             [0],
                                             [pls_df.PLS1.max()]],
                                 n_init=100)
    num = pls_df.loc[pls_df.PLS1 > upper]
    denom = pls_df.loc[pls_df.PLS1 < lower]

    diff_features = list(num.index.values)
    diff_features += list(denom.index.values)
    with open(output_file, 'w') as f:
        f.write(','.join(diff_features))
def test_center_log(self):
    """center_log equals skbio clr on the pseudocounted reference matrix."""
    raw_counts = np.array([[10, 20, 1, 20, 5, 100, 844, 100],
                           [10, 20, 2, 19, 0, 100, 849, 200],
                           [10, 20, 3, 18, 5, 100, 844, 300],
                           [10, 20, 4, 17, 0, 100, 849, 400],
                           [10, 20, 5, 16, 4, 100, 845, 500],
                           [10, 20, 6, 15, 0, 100, 849, 600],
                           [10, 20, 7, 14, 3, 100, 846, 700],
                           [10, 20, 8, 13, 0, 100, 849, 800],
                           [10, 20, 9, 12, 7, 100, 842, 900]])
    # pseudocount of 1 on every entry
    shifted = raw_counts + 1

    plain = self.test2.center_log()
    assert_array_almost_equal(clr(shifted), plain.data)

    centered = self.test2.center_log(centralize=True)
    assert_array_almost_equal(clr(centralize(shifted)), centered.data)
def clrtransform(self, dataframe):
    """
    Impute zeros, then apply the centred-log-ratio (clr) transformation.

    Missing values are first set to 0, after which every 0 is replaced
    by 0.55. The input dataframe itself is not modified.

    Parameters
    ------------
    dataframe: pandas dataframe, microbiome count data

    Returns
    ------------
    X_clr: pandas dataframe, clr transformed values of the count data,
        with the original row/column labels and rows sorted by index.
    """
    filled = dataframe.fillna(0)

    ### impute zeroes with 0.55
    imputed = filled.replace(0, 0.55)

    ### clr transform data
    clr_values = composition.clr(imputed)
    result = pd.DataFrame(clr_values, columns=filled.columns, index=filled.index)
    return result.sort_index()
def globalCLRPermTest(otuDf, labels, statfunc=_sumRhoStat, nperms=999, seed=110820, binary=False):
    """Global permutation test on centered-log-ratio (CLR) transformed OTUs.

    The table is CLR transformed and a single summary statistic is
    computed against the labels. Its significance is assessed by
    permuting the labels.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels : pd.Series (float)
        Variable of interest, cast to bool when `binary` is True and to
        float otherwise. Must share index with otuDf.
    statfunc : function
        Takes a np.ndarray [n x k] and float index [n] as parameters and
        returns a float summarizing over k.
    nperms : int
        Number of iterations for the permutation test.
    seed : int
        Seed for random permutation generation.
    binary : bool
        Whether the labels are treated as boolean.

    Returns
    -------
    pvalue : float
        Global p-value based on the summary statistic.
    obs : float
        Statistic computed on the unpermuted labels."""
    # the shape must be two-dimensional; only the sample count is used below
    nSamples, _ = otuDf.shape
    labelValues = labels.values.astype(bool if binary else float)

    # Make proportions
    # NOTE(review): DataFrame.sum() totals each column, so every OTU column is
    # divided by its own total here - confirm per-sample scaling was not meant.
    otuDf = otuDf / otuDf.sum()

    # Apply multiplicative replacement for zero values, then the CLR
    replaced = multiplicative_replacement(otuDf.values)
    otuCLR = pd.DataFrame(clr(replaced), index=otuDf.index, columns=otuDf.columns)

    np.random.seed(seed)
    obs = statfunc(otuCLR.values, labelValues)

    permuted = []
    for _permi in range(nperms):
        shuffled = labelValues[np.random.permutation(nSamples)]
        permuted.append(statfunc(otuCLR.values, shuffled))
    samples = np.array(permuted)

    # Since test is based on the abs statistic it is inherently two-sided
    exceed = (np.abs(samples) >= np.abs(obs)).sum()
    pvalue = (exceed + 1) / (nperms + 1)
    return pvalue, obs
def test_ilr_inv_basis_one_dimension_error(self):
    """ilr_inv must raise ValueError for this basis/table combination."""
    basis = clr(np.array([[0.80442968, 0.19557032]]))

    # (numerator, denominator) pairs of the log-ratios, one per table row
    ratio_pairs = [(1, 10),
                   (1.14141414, 9.90909091),
                   (1.28282828, 9.81818182),
                   (1.42424242, 9.72727273),
                   (1.56565657, 9.63636364)]
    row = [np.log(top / bottom) * np.sqrt(1 / 2)
           for top, bottom in ratio_pairs]
    table = np.array([row]).T

    with self.assertRaises(ValueError):
        ilr_inv(table, basis=basis)
def test_OptSpace_illformatted_raises(self):
    """Fitting MatrixCompletion on clr(test_table) must raise ValueError."""
    # test inf
    rejected = False
    try:
        MatrixCompletion().fit(clr(self.test_table))
    except ValueError:
        rejected = True
    if not rejected:
        raise AssertionError("ValueError was not raised")
def test_center_log_ration(self):
    """center_log_ratio equals skbio clr on the pseudocounted reference."""
    from skbio.stats.composition import clr, centralize

    counts = np.array([[10, 20, 1, 20, 5, 100, 844, 100],
                       [10, 20, 2, 19, 0, 100, 849, 200],
                       [10, 20, 3, 18, 5, 100, 844, 300],
                       [10, 20, 4, 17, 0, 100, 849, 400],
                       [10, 20, 5, 16, 4, 100, 845, 500],
                       [10, 20, 6, 15, 0, 100, 849, 600],
                       [10, 20, 7, 14, 3, 100, 846, 700],
                       [10, 20, 8, 13, 0, 100, 849, 800],
                       [10, 20, 9, 12, 7, 100, 842, 900]])
    # pseudocount of 1 on every entry
    dat = counts + 1

    # without centralization
    observed = self.test2.center_log_ratio()
    assert_array_almost_equal(clr(dat), observed.data)

    # with centralization
    observed = self.test2.center_log_ratio(centralize=True)
    assert_array_almost_equal(clr(centralize(dat)), observed.data)
def dendrogram_heatmap(output_dir: str, table: pd.DataFrame,
                       tree: TreeNode, metadata: MetadataCategory,
                       ndim=10, method='clr', color_map='viridis'):
    """Render a dendrogram heatmap and an index page into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory receiving ``heatmap.svg``, ``heatmap.pdf`` and
        ``index.html``.
    table : pd.DataFrame
        Abundance table to transform and plot.
    tree : TreeNode
        Tree whose internal nodes (in level order) are highlighted.
    metadata : MetadataCategory
        Converted with ``to_series()`` and passed to ``heatmap``.
    ndim : int
        Maximum number of internal nodes to highlight.
    method : str
        'clr' (centralize followed by clr) or 'log' (plain logarithm).
    color_map : str
        Colormap name handed to ``heatmap``.

    Raises
    ------
    ValueError
        If ``method`` is neither 'clr' nor 'log'.
    """
    nodes = [n.name for n in tree.levelorder() if not n.is_tip()]
    nlen = min(ndim, len(nodes))
    numerator_color, denominator_color = '#fb9a99', '#e31a1c'
    highlights = pd.DataFrame([[numerator_color, denominator_color]] * nlen,
                              index=nodes[:nlen])

    if method == 'clr':
        mat = pd.DataFrame(clr(centralize(table)),
                           index=table.index,
                           columns=table.columns)
    elif method == 'log':
        mat = pd.DataFrame(np.log(table),
                           index=table.index,
                           columns=table.columns)
    else:
        # Without this branch `mat` stays unbound and the heatmap call
        # below fails with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unsupported method %r; expected 'clr' or 'log'." % (method,))

    # TODO: There are a few hard-coded constants here
    # will need to have some adaptive defaults set in the future
    fig = heatmap(mat, tree, metadata.to_series(), highlights,
                  cmap=color_map, highlight_width=0.01, figsize=(12, 8))
    fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    fig.savefig(os.path.join(output_dir, 'heatmap.pdf'))

    css = r""" .square { float: left; width: 100px; height: 20px; margin: 5px; border: 1px solid rgba(0, 0, 0, .2); } .numerator { background: %s; } .denominator { background: %s; } """ % (numerator_color, denominator_color)

    index_fp = os.path.join(output_dir, 'index.html')
    with open(index_fp, 'w') as index_f:
        index_f.write('<html><body>\n')
        index_f.write('<h1>Dendrogram heatmap</h1>\n')
        index_f.write('<img src="heatmap.svg" alt="heatmap">')
        index_f.write('<a href="heatmap.pdf">')
        index_f.write('Download as PDF</a><br>\n')
        index_f.write('<style>%s</style>' % css)
        index_f.write('<div class="square numerator">'
                      'Numerator<br/></div>')
        index_f.write('<div class="square denominator">'
                      'Denominator<br/></div>')
        index_f.write('</body></html>\n')
def balance_classify(table, cats, num_folds, **init_kwds):
    """Build a balance classifier and cross-validate it.

    If categorical, it is assumed that the classes are binary.

    Parameters
    ----------
    table : pd.DataFrame
        Abundance table; rows are matched to ``cats`` by position.
    cats : pd.Series
        Response values.
    num_folds : int
        Number of shuffled K-fold splits.
    **init_kwds
        Forwarded unchanged to ``round_balance``.

    Returns
    -------
    num, denom : pd.DataFrame
        PLS weights of the numerator / denominator features of the model
        fitted on the whole dataset.
    pls_balance : pd.Series
        Balance of every sample under that model.
    cv : pd.DataFrame
        Per-fold 'Q2' and 'AUROC'.
    """
    ctable = pd.DataFrame(clr(centralize(table)),
                          index=table.index, columns=table.columns)

    def split_features(model):
        # Threshold the single PLS component into numerator/denominator sets.
        weights = pd.DataFrame(model.x_weights_,
                               index=ctable.columns, columns=['PLS1'])
        lower, upper = round_balance(weights, **init_kwds)
        below = weights.loc[weights.PLS1 < lower]
        above = weights.loc[weights.PLS1 > upper]
        return above, below

    folds = KFold(n_splits=num_folds, shuffle=True)
    cv = pd.DataFrame(columns=['Q2', 'AUROC'], index=np.arange(num_folds))

    for fold, (train, test) in enumerate(folds.split(ctable.values,
                                                     cats.values)):
        Y_test = cats.iloc[test]

        fold_model = PLSRegression(n_components=1)
        fold_model.fit(X=ctable.iloc[train], Y=cats.iloc[train])
        num, denom = split_features(fold_model)

        # make the prediction and evaluate the accuracy
        held_out = table.index[test]
        log_num = np.log(table.loc[held_out, num.index] + 1).mean(axis=1)
        log_denom = np.log(table.loc[held_out, denom.index] + 1).mean(axis=1)
        pls_balance = log_num - log_denom

        # labels are inverted for roc_curve and the two curve arrays are
        # handed to auc() in (tpr, fpr) order
        group_fpr, group_tpr, _ = roc_curve(
            y_true=1 - (Y_test == 1).astype(int), y_score=pls_balance)
        auroc = auc(group_tpr, group_fpr)

        press = ((pls_balance - Y_test)**2).sum()
        tss = ((Y_test.mean() - Y_test)**2).sum()
        cv.loc[fold, 'Q2'] = 1 - (press / tss)
        cv.loc[fold, 'AUROC'] = auroc

    # build model on entire dataset
    # NOTE(review): this fit uses the raw table (not the clr table used in the
    # folds) and the balance below has no +1 pseudocount - confirm intended.
    full_model = PLSRegression(n_components=1)
    full_model.fit(X=table.values, Y=cats.values)
    num, denom = split_features(full_model)
    pls_balance = (np.log(table.loc[:, num.index]).mean(axis=1) -
                   np.log(table.loc[:, denom.index]).mean(axis=1))
    return num, denom, pls_balance, cv
def clr_transform_cags_via_mult_rep_method(self):
    """
    NOT GENERALIZABLE - DELETE

    Replaces zeros via multiplicative replacement, using half of the
    lowest non-zero relative abundance as delta, then performs the clr
    transformation on the pivoted CAG table.

    Assigns
    -------
    self.cags_dict["cags"] : dictionary with the following entries:
        1. cags_wide_df - relative abundances (result of self._pivot_cags())
        2. cags_wide_mr_clr_df - clr transformed abundances (uses multiplicative replacement)
        3. half_nzra - half of the lowest non-zero relative abundance (NZRA), the delta of the Mult Rep step

    Returns
    -------
    The clr transformed dataframe (same object as cags_wide_mr_clr_df).
    """
    cag_wide = self._pivot_cags()

    # lowest non-zero relative abundance (NZRA); NZRA/2 is the replacement value
    flat_values = cag_wide.values.flatten()
    positive_values = cag_wide.values.flatten()[flat_values > 0]
    half_nzra = np.min(positive_values) / 2

    # zero replacement followed by the clr transform
    replaced = multiplicative_replacement(cag_wide, delta=half_nzra)
    transformed = clr(replaced)

    # wrap the array in a data.frame labelled like the pivoted table
    cag_wide_mr_clr_df = pd.DataFrame(transformed)
    cag_wide_mr_clr_df.columns = cag_wide.columns
    cag_wide_mr_clr_df.index = cag_wide.index

    self.cags_dict["cags"] = {
        "cags_wide_df": cag_wide,
        "cags_wide_mr_clr_df": cag_wide_mr_clr_df,
        "half_nzra": half_nzra
    }
    return cag_wide_mr_clr_df


def fetch_metaphlan_result(self, clr=True, taxonomic_level="phylum"):
    """
    Getter for a stored metaphlan matrix.

    Returns the clr transformed matrix when `clr` is true and the
    untransformed one otherwise, for the given taxonomic level. When the
    entry is missing a message is printed and None is returned.
    """
    key = 'mp_wide_taxa_mr_clr_df' if clr else 'mp_wide_taxa_df'
    try:
        result = self.metaphlan_dict[taxonomic_level][key]
    except KeyError:
        print(
            "NO METAPHLAN MATRIX CREATED SEE clr_transform_metaphlan_via_mult_rep_method()"
        )
    else:
        return result
def test_alr_to_clr():
    """alr_to_clr reproduces skbio's clr from skbio's alr coordinates."""
    components = np.array([[0.1, 0.2, 0.3, 0.4, 0.3],
                           [0.3, 0.1, 0.1, 0.2, 0.5],
                           [0.4, 0.3, 0.5, 0.1, 0.1],
                           [0.2, 0.4, 0.1, 0.3, 0.1]])
    # skbio alr & clr take rows as compositions, columns as components
    compositions = components.T

    alr_coords = alr(compositions, 0)  # 5 x 3
    converted = util.alr_to_clr(alr_coords.T).T  # 5 x 4
    expected = clr(compositions)  # 5 x 4
    np.testing.assert_array_almost_equal(converted, expected)
def test_clr_to_alr():
    """clr_to_alr reproduces skbio's alr from skbio's clr coordinates."""
    components = np.array([[0.1, 0.2, 0.3, 0.4, 0.3],
                           [0.3, 0.1, 0.1, 0.2, 0.5],
                           [0.4, 0.3, 0.5, 0.1, 0.1],
                           [0.2, 0.4, 0.1, 0.3, 0.1]])
    # skbio alr & clr take rows as compositions, columns as components
    compositions = components.T

    clr_coords = clr(compositions)
    converted = util.clr_to_alr(clr_coords.T).T
    expected = alr(compositions)
    np.testing.assert_array_almost_equal(converted, expected)
def test_matrix_tensor_rclr(self):
    """Test matrix == tensor matrix_rclr."""
    # without zeros, tensor_rclr on the transposed data must agree with clr
    via_tensor = tensor_rclr(self.count_data_one.T).T
    via_clr = clr(self.count_data_one)
    npt.assert_allclose(via_tensor, via_clr)

    # data with zeros only has to run without raising
    tensor_rclr(self.count_data_two)

    # negative input must be rejected
    negated = self.tensor_true * -1
    with self.assertRaises(ValueError):
        tensor_rclr(negated)
def balance_regression(table, cats, num_folds, **init_kwds):
    """Build a balance regressor and cross-validate it.

    Parameters
    ----------
    table : pd.DataFrame
        Abundance table; rows are matched to ``cats`` by position.
    cats : pd.Series
        Response values. They are multiplied by -1 before any fitting.
    num_folds : int
        Number of shuffled K-fold splits.
    **init_kwds
        Forwarded unchanged to ``round_balance``.

    Returns
    -------
    num, denom : pd.DataFrame
        PLS weights of the numerator / denominator features of the model
        fitted on the whole dataset.
    pls_balance : pd.Series
        Balance of every sample under that model.
    cv : pd.DataFrame
        Per-fold 'Q2'.
    """
    folds = KFold(n_splits=num_folds, shuffle=True)
    # sign flip kept from the original implementation (reason not documented)
    cats = cats * -1
    ctable = pd.DataFrame(clr(centralize(table)),
                          index=table.index, columns=table.columns)

    def split_features(model):
        # Threshold the single PLS component into numerator/denominator sets.
        weights = pd.DataFrame(model.x_weights_,
                               index=ctable.columns, columns=['PLS1'])
        lower, upper = round_balance(weights, **init_kwds)
        below = weights.loc[weights.PLS1 < lower]
        above = weights.loc[weights.PLS1 > upper]
        return above, below

    def shifted_balance(rows, numerator, denominator):
        # Mean log(count + 1) of the numerator minus that of the denominator.
        top = np.log(table.loc[rows, numerator.index] + 1).mean(axis=1)
        bottom = np.log(table.loc[rows, denominator.index] + 1).mean(axis=1)
        return top - bottom

    cv = pd.DataFrame(columns=['Q2'], index=np.arange(num_folds))
    for fold, (train, test) in enumerate(folds.split(ctable.values,
                                                     cats.values)):
        Y_train, Y_test = cats.iloc[train], cats.iloc[test]

        fold_model = PLSRegression(n_components=1)
        fold_model.fit(X=ctable.iloc[train], Y=Y_train)
        num, denom = split_features(fold_model)

        # calibrate balance -> response on the training rows
        train_balance = shifted_balance(table.index[train], num, denom)
        slope, intercept = linregress(train_balance, Y_train)[:2]

        # predict the held-out rows and score them
        test_balance = shifted_balance(table.index[test], num, denom)
        pred = test_balance * slope + intercept
        press = ((pred - Y_test)**2).sum()
        tss = ((Y_test.mean() - Y_test)**2).sum()
        cv.loc[fold, 'Q2'] = 1 - (press / tss)

    # build model on entire dataset
    # NOTE(review): this fit uses the raw table (not the clr table used in the
    # folds) and the balance below has no +1 pseudocount - confirm intended.
    full_model = PLSRegression(n_components=1)
    full_model.fit(X=table.values, Y=cats.values)
    num, denom = split_features(full_model)
    pls_balance = (np.log(table.loc[:, num.index]).mean(axis=1) -
                   np.log(table.loc[:, denom.index]).mean(axis=1))
    return num, denom, pls_balance, cv
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame):
    """Fit a multinomial regression and return clr-transformed coefficients.

    The table and metadata are matched and filtered, split into training
    and test sets, and a ``MultRegression`` model is fitted inside a fresh
    TensorFlow graph/session. A zero column is prepended to the fitted
    coefficients ``model.B`` and the result is passed through ``clr_inv``
    followed by ``clr``.

    Returns
    -------
    pd.DataFrame
        Coefficients indexed by observation id, with one column per
        design-matrix column.
    """
    # load metadata and match it against the table
    metadata_df = metadata.to_dataframe()
    table, metadata_df, design = match_and_filter(
        table, metadata_df,
        formula, training_column, num_random_test_examples,
        min_sample_count, min_feature_count)

    # convert to dense representation
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata_df, design,
        training_column, num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)
        model.fit(epoch=epoch,
                  summary_interval=summary_interval,
                  checkpoint_interval=None)

        # The coefficients are read while the session scope is still open.
        covariate_ids = np.array(design.columns)
        feature_ids = table.ids(axis='observation')
        padded = np.hstack((np.zeros((model.p, 1)), model.B))
        coefficients = clr(clr_inv(padded))

    return pd.DataFrame(
        coefficients.T, columns=covariate_ids, index=feature_ids,
    )
def normalize_clr(data):
    """Replace zeros and apply clr, keeping the dataframe labels.

    The input is asserted to have fewer rows than columns, as a check that
    samples are the rows.
    """
    assert data.shape[0] < data.shape[1], "samples should be indexes, I don't think you have"

    zero_free = composition.multiplicative_replacement(data)
    clr_values = composition.clr(zero_free)
    return pd.DataFrame(clr_values, index=data.index, columns=data.columns)
def test_clr_inv(self):
    """clr_inv matches the reference, round-trips, and leaves input intact."""
    inverted = clr_inv(self.rdata1)
    npt.assert_allclose(inverted, self.ortho1)

    # clr must undo clr_inv
    round_trip = clr(clr_inv(self.rdata1))
    npt.assert_allclose(round_trip, self.rdata1)

    # make sure that inplace modification is not occurring
    clr_inv(self.rdata1)
    untouched = np.array([[0.70710678, -0.70710678, 0., 0.],
                          [0.40824829, 0.40824829, -0.81649658, 0.],
                          [0.28867513, 0.28867513, 0.28867513, -0.8660254]])
    npt.assert_allclose(self.rdata1, untouched)
def test_fit(self):
    """multinomial() recovers the clr-transformed reference coefficients."""
    tf.set_random_seed(0)

    # the fixture's `name` attribute is set on the fixture object itself
    # before it is wrapped in qiime2.Metadata
    sample_md = self.md
    sample_md.name = 'sampleid'
    q2_metadata = qiime2.Metadata(sample_md)

    # reference: zero column prepended, then clr of clr_inv
    padded = np.hstack((np.zeros((2, 1)), self.beta.T))
    expected = clr(clr_inv(padded))

    fitted = multinomial(table=self.table, metadata=q2_metadata,
                         formula="X", epoch=50000)
    # loose tolerances: only an approximate fit is required
    npt.assert_allclose(expected, fitted.T, atol=0.5, rtol=0.5)
def _clr_transform_via_mult_rep_method(self, df):
    """Zero-replace ``df`` and return its clr transform as a dataframe.

    Half of the smallest strictly positive value in ``df`` is used as the
    ``delta`` of the multiplicative replacement. The result carries the
    index and columns of ``df``.
    """
    flat_values = df.values.flatten()
    positive_values = df.values.flatten()[flat_values > 0]
    half_nzra = np.min(positive_values) / 2

    # zero replacement followed by the clr transform
    replaced = multiplicative_replacement(df, delta=half_nzra)
    transformed = clr(replaced)

    # wrap the array in a data.frame labelled like the input
    mr_clr_df = pd.DataFrame(transformed)
    mr_clr_df.columns = df.columns
    mr_clr_df.index = df.index
    return mr_clr_df
def test_rclr(self):
    """rclr equals clr without zeros, matches the fixture with zeros."""
    # without zeros the result must equal plain clr
    no_zero_result = self._rclr.fit_transform(self.cdata1)
    npt.assert_allclose(no_zero_result, clr(self.cdata1.copy()))

    # a case with zeros, compared against the stored expectation
    zero_result = self._rclr.fit_transform(self.cdata2)
    npt.assert_allclose(zero_result, self.true2)

    # invalid input must be rejected
    with self.assertRaises(ValueError):
        self._rclr.fit_transform(self.bad1)
def test_clr_inv(self):
    """Check clr_inv values, the clr round trip, and input immutability."""
    expected_rdata = np.array(
        [[0.70710678, -0.70710678, 0., 0.],
         [0.40824829, 0.40824829, -0.81649658, 0.],
         [0.28867513, 0.28867513, 0.28867513, -0.8660254]])

    npt.assert_allclose(clr_inv(self.rdata1), self.ortho1)
    # applying clr after clr_inv must give the input back
    npt.assert_allclose(clr(clr_inv(self.rdata1)), self.rdata1)

    # the fixture must still hold its original values after another call
    clr_inv(self.rdata1)
    npt.assert_allclose(self.rdata1, expected_rdata)
def preprocess_df(df, rep, state):
    """Return the clr-transformed intensities for one replicate/state.

    The columns chosen by ``select_rep_state_intensities(rep, state)`` are
    taken from ``df`` and passed through ``drop_zero_rows``. Zeros are then
    replaced multiplicatively and the clr transform is applied. The result
    keeps the remaining row index and the selected columns.
    """
    selected = df[select_rep_state_intensities(rep, state)]
    columns = selected.columns

    # index should be the same as protein/peptides
    kept = drop_zero_rows(selected)
    row_index = kept.index

    transformed = clr(multiplicative_replacement(kept))
    return pd.DataFrame(transformed, index=row_index, columns=columns)
def rhoMetric(npArray):
    """Return the pairwise rho matrix between the columns of ``npArray``.

    The input is clr transformed. For every pair of columns (i, j) the
    entry is ``1 - var(clr_i - clr_j) / (var(clr_i) + var(clr_j))``. The
    result is a square array with one row and column per input column.
    """
    nColumns = npArray.shape[-1]
    rho = np.zeros(shape=(nColumns, nColumns))
    clrVals = clr(npArray)

    # per-column variances are reused by every pair
    variances = [clrVals[:, k].var() for k in range(nColumns)]

    for i in range(nColumns):
        left = clrVals[:, i]
        for j in range(nColumns):
            difference = left - clrVals[:, j]
            rho[i, j] = 1 - difference.var() / (variances[i] + variances[j])
    return rho
def aitchison(x, y, **kwds):
    """Return the Euclidean distance between the clr transforms of x and y.

    Extra keyword arguments are accepted but not used.
    """
    x_clr = clr(x)
    y_clr = clr(y)
    return euclidean(x_clr, y_clr)
def CLRPermTest(otuDf, labels, statfunc=_rhoStat, nperms=999, adjMethod='fdr_bh', seed=110820, binary=False):
    """Per-OTU permutation test on centered-log-ratio (CLR) transformed data.

    The table is CLR transformed and a statistic is computed for every OTU
    against the labels. Its significance is assessed by permuting the
    labels.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels : pd.Series (float)
        Variable of interest, cast to bool when `binary` is True and to
        float otherwise. Must share index with otuDf.
    statfunc : function
        Takes a np.array [n x k] and float index [n] as parameters and
        returns a 1-D array of the statistic [k].
    nperms : int
        Number of iterations for the permutation test.
    adjMethod : string
        Passed to _pvalueAdjust for p-value multiplicity adjustment.
        If value is None (or the string 'none', any case) then no
        adjustment is made.
    seed : int
        Seed for random permutation generation.
    binary : bool
        Whether the labels are treated as boolean.

    Returns
    -------
    qvalues : pd.Series [index: OTU]
        Q/P-values for each OTU computed.
    observed : pd.Series [index: OTU]
        Statistic computed on the unpermuted labels."""
    nSamples, nOTUs = otuDf.shape
    labelValues = labels.values.astype(bool if binary else float)

    # Make proportions
    # NOTE(review): DataFrame.sum() totals each column, so every OTU column is
    # divided by its own total here - confirm per-sample scaling was not meant.
    otuDf = otuDf / otuDf.sum()

    # Apply multiplicative replacement for zero values, then the CLR
    replaced = multiplicative_replacement(otuDf.values)
    otuCLR = pd.DataFrame(clr(replaced), index=otuDf.index, columns=otuDf.columns)

    obs = statfunc(otuCLR.values, labelValues)

    # the seed is set after the observed statistic, right before permuting
    np.random.seed(seed)
    samples = np.zeros((nperms, nOTUs))
    for row in range(nperms):
        shuffled = labelValues[np.random.permutation(nSamples)]
        samples[row, :] = statfunc(otuCLR.values, shuffled)

    # two-sided: compare absolute values, with the usual +1 correction
    exceed = (np.abs(samples) >= np.abs(obs[None, :])).sum(axis=0)
    pvalues = (exceed + 1) / (nperms + 1)

    if adjMethod is not None and adjMethod.lower() != 'none':
        adjusted = _pvalueAdjust(pvalues, method=adjMethod)
    else:
        adjusted = pvalues

    qvalues = pd.Series(adjusted, index=otuDf.columns)
    observed = pd.Series(obs, index=otuDf.columns)
    return qvalues, observed