def test_multiplicative_replacement(self):
    """Check ``multiplicative_replacement`` against reference values.

    The fixtures ``cdata3``, ``cdata4`` and ``cdata6`` are closed first and
    the replaced result is compared with hard-coded expectations.  The
    ``bad1`` and ``bad2`` fixtures must raise ``ValueError``, and the final
    check confirms that ``cdata4`` is not modified in place.
    """
    expected_matrix = np.array([[0.087273, 0.174545, 0.261818, 0.04, 0.436364],
                                [0.092, 0.04, 0.04, 0.368, 0.46],
                                [0.066667, 0.133333, 0.2, 0.266667, 0.333333]])
    expected_vector = np.array([0.087273, 0.174545, 0.261818, 0.04, 0.436364])

    cases = ((self.cdata3, expected_matrix),
             (self.cdata4, expected_vector),
             (self.cdata6, expected_matrix))
    for fixture, expected in cases:
        replaced = multiplicative_replacement(closure(fixture))
        npt.assert_allclose(replaced, expected, rtol=1e-5, atol=1e-5)

    for invalid in (self.bad1, self.bad2):
        with self.assertRaises(ValueError):
            multiplicative_replacement(invalid)

    # make sure that inplace modification is not occurring
    multiplicative_replacement(self.cdata4)
    npt.assert_allclose(self.cdata4, np.array([1, 2, 3, 0, 5]))
def test_multiplicative_replacement(self):
    """Compare ``multiplicative_replacement`` output with reference values.

    ``data3``, ``data4`` and ``data6`` are passed through ``closure`` before
    replacement.  ``bad1`` and ``bad2`` are expected to raise ``ValueError``.
    The last assertion verifies that ``data4`` itself is left unchanged.
    """
    tolerances = dict(rtol=1e-5, atol=1e-5)
    reference_rows = [[0.087273, 0.174545, 0.261818, 0.04, 0.436364],
                      [0.092, 0.04, 0.04, 0.368, 0.46],
                      [0.066667, 0.133333, 0.2, 0.266667, 0.333333]]

    result = multiplicative_replacement(closure(self.data3))
    npt.assert_allclose(result, np.array(reference_rows), **tolerances)

    result = multiplicative_replacement(closure(self.data4))
    npt.assert_allclose(result, np.array(reference_rows[0]), **tolerances)

    result = multiplicative_replacement(closure(self.data6))
    npt.assert_allclose(result, np.array(reference_rows), **tolerances)

    with self.assertRaises(ValueError):
        multiplicative_replacement(self.bad1)
    with self.assertRaises(ValueError):
        multiplicative_replacement(self.bad2)

    # make sure that inplace modification is not occurring
    multiplicative_replacement(self.data4)
    npt.assert_allclose(self.data4, np.array([1, 2, 3, 0, 5]))
def mult_replace(df):
    """
    Wrapper around skbio's ``multiplicative_replacement``.

    Parameters
    ----------
    df : DataFrame
        Must be a ``pd.DataFrame``; anything else fails the assertion.

    Returns
    -------
    df_mr : DataFrame
        Output of ``multiplicative_replacement`` wrapped in a new DataFrame.
        No index or column labels are passed to the constructor.

    Notes
    -----
    The ``delta`` handed to ``multiplicative_replacement`` is half of the
    smallest strictly positive value found anywhere in ``df``.  The result
    is asserted to contain only strictly positive values.
    """
    assert isinstance(df, pd.DataFrame)

    flat_values = df.values.flatten()
    smallest_positive = np.min(flat_values[flat_values > 0])
    delta = smallest_positive / 2

    replaced = multiplicative_replacement(df, delta=delta)
    df_mr = pd.DataFrame(replaced)

    assert np.all(df_mr.values > 0)
    return df_mr
def normalize_transform(self, mode='clr'):
    """
    Some operations may require transformed data.
    This function performs normalization and a log-ratio transform
    on all OTU tables in a Batch object.
    It returns a deep copy of the original Batch object,
    so the original file is not modified.

    Any exception raised while processing (including the ``ValueError``
    for an unsupported ``mode``) is logged and not re-raised; the copy is
    returned in whatever state it reached.

    :param mode: transformation mode; 'clr' (centered log-ratio) or 'ilr' (isometric log-ratio)
    :return: Transformed copy of Batch object.
    """
    batchcopy = copy.deepcopy(self)
    try:
        for x in list(self.otu):
            # normalizes the data by samples
            normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
            mat = csr_matrix.toarray(normbiom.matrix_data)
            # replaces all zeros with a small value
            # multiplicative replacement preserves ratios between values
            mat = multiplicative_replacement(mat)
            # Compare by value: ``is`` tests object identity, which is not
            # guaranteed to hold for two equal strings.
            if mode == 'clr':
                mat = clr(mat)
            elif mode == 'ilr':
                mat = ilr(mat)
            else:
                raise ValueError("Only CLR and ILR transformations are currently supported.")
            normbiom._data = csc_matrix(mat)
            batchcopy.otu[x] = normbiom
    except Exception:
        logger.error("Failed to normalize data", exc_info=True)
    return batchcopy
def globalCLRPermTest(otuDf, labels, statfunc=_sumRhoStat, nperms=999, seed=110820, binary=False):
    """Global permutation test on centered-log-ratio (CLR) transformed OTUs.

    The table is rescaled, zeros are handled with multiplicative
    replacement, the CLR is taken, and ``statfunc`` is evaluated once on the
    real labels and ``nperms`` times on permuted labels.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Relative abundance [0-1] for all samples (rows) and OTUs (columns).
    labels : pd.Series
        Variable of interest. Must share index with otuDf.
    statfunc : function
        Takes a np.ndarray [n x k] and the label values [n] and returns a
        float summarizing over k.
    nperms : int
        Number of iterations for the permutation test.
    seed : int
        Seed passed to ``np.random.seed`` before permuting.
    binary : bool
        If True the label values are cast to bool, otherwise to float.

    Returns
    -------
    pvalue : float
        (number of permuted statistics with absolute value >= the observed
        absolute value, plus one) / (nperms + 1).
    obs : float
        Statistic computed on the unpermuted labels."""
    nSamples, _ = otuDf.shape
    labelValues = labels.values.astype(bool if binary else float)

    # Make proportions
    # NOTE(review): DataFrame.sum() defaults to column sums, so each OTU
    # column is scaled here rather than each sample row -- confirm intent.
    otuDf = otuDf / otuDf.sum()

    # Zero replacement followed by the CLR, wrapped back into a DataFrame
    zeroFree = multiplicative_replacement(otuDf.values)
    clrFrame = pd.DataFrame(clr(zeroFree), index=otuDf.index, columns=otuDf.columns)

    np.random.seed(seed)
    obs = statfunc(clrFrame.values, labelValues)

    permuted = []
    for _permi in range(nperms):
        shuffled = labelValues[np.random.permutation(nSamples)]
        permuted.append(statfunc(clrFrame.values, shuffled))
    samples = np.array(permuted)

    # Since the test is based on the abs statistic it is inherently two-sided
    nExtreme = (np.abs(samples) >= np.abs(obs)).sum()
    pvalue = (nExtreme + 1) / (nperms + 1)
    return pvalue, obs
def normalize(df: pd.DataFrame, method: str = "am_clr", out: str = None, force: bool = False) -> pd.DataFrame:
    """Normalize raw k-mer counts by center or isometric log-ratio transform.

    Parameters
    ----------
    df : pd.DataFrame
        k-mer counts dataframe.
        i.e. for 3-mers; Index='contig', columns=[AAA, AAT, ...]
    method : str, optional
        Transformation to apply, compared case-insensitively
        (the default is Autometa's CLR implementation).
        choices = ['ilr', 'clr', 'am_clr']
        'ilr' and 'clr' are preceded by NaN filling with 0 and
        multiplicative replacement.
    out : str, optional
        Path to write normalized k-mers.
    force : bool, optional
        Whether to overwrite existing `out` file path, by default False.

    Returns
    -------
    pd.DataFrame
        Normalized counts using provided `method`. When `out` already exists
        and `force` is False, the table is read back from `out` instead.

    Raises
    ------
    ValueError
        Provided `method` is not available.
    """
    method = method.lower()
    out_specified = out is not None
    out_exists = os.path.exists(out) if out else False

    # Re-use a previous result unless the caller asked to overwrite it.
    if out_specified and out_exists and not force:
        logger.debug(f"{out} already exists. Use force to overwrite. retrieving...")
        return pd.read_csv(out, sep="\t", index_col="contig")

    logger.debug(f"Transforming k-mer counts using {method}")
    choices = {"ilr", "clr", "am_clr"}
    if method not in choices:
        choices = ", ".join(choices)
        raise ValueError(f"Normalize Method not available! {method}. choices: {choices}")

    if method == "am_clr":
        norm_df = autometa_clr(df)
    else:
        transforms = {"ilr": ilr, "clr": clr}
        filled = df.fillna(0).to_numpy()
        zero_free = multiplicative_replacement(filled)
        norm_df = pd.DataFrame(transforms[method](zero_free), index=df.index)

    # Past the early return above, a specified path is either new or being
    # overwritten on request, so it is always written.
    if out_specified:
        norm_df.to_csv(out, sep="\t", index=True, header=True)
    return norm_df
def clr_transform_cags_via_mult_rep_method(self):
    """
    NOT GENERALIZABLE - DELETE

    Pivots the CAG table, applies multiplicative replacement with delta set
    to half of the lowest non-zero value in the table, then performs the clr
    transformation.

    Assigns
    -------
    self.cags_dict : dictionary
        the key 'cags' is set to a dictionary with:
        1. cags_wide_df - the pivoted table
        2. cags_wide_mr_clr_df - clr transformed table (uses multiplicative replacement)
        3. half_nzra - half of the lowest non-zero value, used as delta

    Returns
    -------
    DataFrame
        The clr transformed table, labelled like the pivoted table.
    """
    cag_wide = self._pivot_cags()

    # one solution is to use the lowest non-zero relative abundance (NZRA),
    # or more typically NZRA/2
    flat_values = cag_wide.values.flatten()
    half_nzra = np.min(flat_values[flat_values > 0]) / 2

    zero_free = multiplicative_replacement(cag_wide, delta=half_nzra)

    # wrap the clr result and restore the labels of the pivoted table
    cag_wide_mr_clr_df = pd.DataFrame(clr(zero_free))
    cag_wide_mr_clr_df.columns = cag_wide.columns
    cag_wide_mr_clr_df.index = cag_wide.index

    self.cags_dict["cags"] = {
        "cags_wide_df": cag_wide,
        "cags_wide_mr_clr_df": cag_wide_mr_clr_df,
        "half_nzra": half_nzra
    }
    return cag_wide_mr_clr_df


def fetch_metaphlan_result(self, clr=True, taxonomic_level="phylum"):
    """
    Look up a stored metaphlan table in ``self.metaphlan_dict``.

    Returns the clr transformed table when ``clr`` is true, otherwise the
    untransformed one.  When the lookup raises ``KeyError`` a message is
    printed and ``None`` is returned implicitly.
    """
    key = 'mp_wide_taxa_mr_clr_df' if clr else 'mp_wide_taxa_df'
    try:
        return self.metaphlan_dict[taxonomic_level][key]
    except KeyError:
        print("NO METAPHLAN MATRIX CREATED SEE clr_transform_metaphlan_via_mult_rep_method()")
def normalize_clr(data):
    """Replace zeros and apply clr, returning a DataFrame labelled like ``data``.

    Asserts that ``data`` has fewer rows than columns before transforming.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]
    assert n_rows < n_cols, "samples should be indexes, I don't think you have"

    zero_free = composition.multiplicative_replacement(data)
    transformed = composition.clr(zero_free)
    return pd.DataFrame(transformed, index=data.index, columns=data.columns)
def _clr_transform_via_mult_rep_method(self, df):
    """Apply multiplicative replacement and then clr to ``df``.

    The replacement delta is half of the smallest strictly positive value
    in ``df``.  The result is returned as a DataFrame carrying the index
    and columns of ``df``.
    """
    flat_values = df.values.flatten()
    half_nzra = np.min(flat_values[flat_values > 0]) / 2

    zero_free = multiplicative_replacement(df, delta=half_nzra)

    # wrap the clr array and restore the labels of the input
    mr_clr_df = pd.DataFrame(clr(zero_free))
    mr_clr_df.columns = df.columns
    mr_clr_df.index = df.index
    return mr_clr_df
def mult_replace(self, df):
    """
    Apply multiplicative replacement to ``df`` and return a new DataFrame.

    The delta passed to ``multiplicative_replacement`` is half of the
    smallest strictly positive value in the entire matrix.  No index or
    column labels are passed when the result is wrapped.
    """
    flat_values = df.values.flatten()
    smallest_positive = np.min(flat_values[flat_values > 0])
    delta = smallest_positive / 2
    replaced = multiplicative_replacement(df, delta=delta)
    return pd.DataFrame(replaced)
def aitchison_transform_part(df):
    """
    Aitchison transformation on df with all columns belonging to the same batch.

    df should consist of all samples tagged together in one channel
    (i.e. A549_S_rep1 etc.).  The output of ``multiplicative_replacement``
    is returned as a DataFrame with the index and columns of ``df``.
    """
    replaced = multiplicative_replacement(df)
    return pd.DataFrame(replaced, index=df.index, columns=df.columns)
def preprocess_df(df, rep, state):
    """
    Return the clr-transformed subset of ``df`` for one ``rep``/``state``.

    Columns are chosen with ``select_rep_state_intensities``, the subset is
    passed through ``drop_zero_rows``, and multiplicative replacement plus
    clr are applied.  The result keeps the remaining index and the selected
    columns.
    """
    selected = df[select_rep_state_intensities(rep, state)]
    cols = selected.columns

    kept = drop_zero_rows(selected)  # index should be the same as protein/peptides
    transformed = clr(multiplicative_replacement(kept))
    return pd.DataFrame(transformed, index=kept.index, columns=cols)
def clr(counts_data, log=np.log2):
    """Centered log-ratio of ``counts_data`` using ``log`` (log2 by default).

    Columns in which every value is <= 1 are dropped, zeros are handled with
    ``composition.multiplicative_replacement``, and each row is centred on
    the mean of its log values.
    """
    # TODO: check if count data
    keep_columns = ~(counts_data <= 1).all()
    kept = counts_data.loc[:, keep_columns]

    zero_free = pd.DataFrame(composition.multiplicative_replacement(kept),
                             columns=kept.columns,
                             index=kept.index)
    logged = log(zero_free)
    return (logged.T - logged.mean(1)).T
def clr_transform_metaphlan_via_mult_rep_method(self, taxonomic_level="phylum"):
    """
    NOT GENERALIZABLE - DELETE

    Pivots the metaphlan table at ``taxonomic_level``, applies
    multiplicative replacement with delta set to half of the lowest non-zero
    value in the table, then performs the clr transformation.

    Arguments
    ---------
    taxonomic_level : string
        forwarded to ``self._pivot_metaphlan``; "phylum" through "species"

    Assigns
    -------
    self.metaphlan_dict : dictionary
        the key ``taxonomic_level`` is set to a dictionary with:
        1. mp_wide_taxa_df - the pivoted table
        2. mp_wide_taxa_mr_clr_df - clr transformed table (uses multiplicative replacement)
        3. half_nzra - half of the lowest non-zero value, used as delta

    Returns
    -------
    DataFrame
        The clr transformed table, labelled like the pivoted table.
    """
    mp_wide_taxa = self._pivot_metaphlan(taxonomic_level=taxonomic_level)

    # one solution is to use the lowest non-zero relative abundance (NZRA),
    # or more typically NZRA/2
    flat_values = mp_wide_taxa.values.flatten()
    half_nzra = np.min(flat_values[flat_values > 0]) / 2

    zero_free = multiplicative_replacement(mp_wide_taxa, delta=half_nzra)

    # wrap the clr result and restore the labels of the pivoted table
    mp_wide_taxa_mr_clr_df = pd.DataFrame(clr(zero_free))
    mp_wide_taxa_mr_clr_df.columns = mp_wide_taxa.columns
    mp_wide_taxa_mr_clr_df.index = mp_wide_taxa.index

    self.metaphlan_dict[taxonomic_level] = {
        "mp_wide_taxa_df": mp_wide_taxa,
        "mp_wide_taxa_mr_clr_df": mp_wide_taxa_mr_clr_df,
        "half_nzra": half_nzra
    }
    return mp_wide_taxa_mr_clr_df
def aitchison_distance(self, rank=Rank.Auto):
    """Calculate the Aitchison distance between samples.

    Aitchison distance is the Euclidean distance between centre
    logratio-normalized samples (abundances). As this requires
    log-transforms, zeros are first replaced with small positive values via
    multiplicative replacement.

    Parameters
    ----------
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Forwarded to ``self.to_df`` to restrict the abundances to one
        taxonomic level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, labelled with the dataframe index.
    """
    import numpy as np
    from skbio.stats.composition import multiplicative_replacement, clr
    from sklearn.metrics.pairwise import euclidean_distances
    from skbio.stats.distance import DistanceMatrix

    # dataframe of abundances -> zero replacement -> clr
    df = self.to_df(rank=rank, normalize=self._guess_normalized())
    clr_values = clr(multiplicative_replacement(df))
    pairwise = euclidean_distances(clr_values, clr_values)

    # Due to rounding differences, we must force mirroring on the matrix:
    # keep the upper triangle (diagonal included) and reflect it.
    upper = np.triu_indices(pairwise.shape[0], k=0)
    mirrored = np.zeros(pairwise.shape)
    mirrored[upper] = pairwise[upper]
    mirrored = mirrored + mirrored.T - np.diag(np.diag(mirrored))

    return DistanceMatrix(mirrored, df.index)
def cluster_heatmap(self, working_samples, samples_list, tax_level):
    """ saves a cluster heatmap based on Aitchison distance and the y-axis labels

    The grouped absolute abundances of ``samples_list`` are transposed,
    zero-replaced, clr transformed and clustered with seaborn using the
    euclidean metric.  The figure is saved through
    ``self.save_high_resolution_figure`` and the reordered row labels are
    written next to it in ``<name>_yaxis_labels.txt``.  Nothing happens when
    ``groupAbsoluteSamples()`` returns None.

    ``working_samples`` and ``tax_level`` are not used by this method.
    """
    from skbio.stats.composition import clr
    from skbio.stats.composition import multiplicative_replacement
    import seaborn as sns

    # Query the grouped table once instead of once for the check and once
    # more for the data.
    grouped = self.abundance_df.groupAbsoluteSamples()
    if grouped is not None:
        data0 = grouped[samples_list].astype('int')
        ids = list(data0.columns)
        index0 = list(data0.index)

        # The previous clr on the raw (zero-containing) counts was never
        # used, so only the zero-replaced path is computed.
        mr_df = multiplicative_replacement(data0.T)
        mr_clr = clr(mr_df)
        mr_clr_df = pd.DataFrame(mr_clr.T, index=index0, columns=ids)

        g = sns.clustermap(mr_clr_df, metric="euclidean", cmap="mako", robust=True,
                           annot_kws={"size": 6}, yticklabels=False)
        filename = self.save_high_resolution_figure(
            g, 'Select file to save the cluster heatmap', 'cluster_heatmap',
            defaultextension='.png')
        # drop the last dot-separated component (the extension)
        filename = ('.').join(filename.split('.')[:-1])

        # save y-axis labels in the order chosen by the row dendrogram
        y_labels = list(data0.iloc[g.dendrogram_row.reordered_ind].index)
        with open(filename + '_yaxis_labels.txt', 'w') as f:
            f.write('\n'.join([x.strip('_') for x in y_labels]))

        import matplotlib.pyplot as plt
        plt.close("all")
def clr(counts_data, log=np.log2):
    """Convert counts data to centered log ratio (log2 by default).

    The data are cast to int, columns in which every value is <= 1 are
    removed, and zeros are replaced by ``multiplicative_replacement`` from
    scikit-bio before taking logs and centring each row on its mean.
    See wikipedia for centered log ratio.
    """
    from skbio.stats import composition

    # TODO: check if count data
    counts = counts_data.astype(int)
    kept = counts.loc[:, ~(counts <= 1).all()]

    zero_free = pd.DataFrame(composition.multiplicative_replacement(kept),
                             columns=kept.columns,
                             index=kept.index)
    logged = log(zero_free)
    return (logged.T - logged.mean(1)).T
def multiplicative_replacement_warning(self):
    """Expect ``ValueError`` from ``multiplicative_replacement([0, 1, 2], delta=1)``."""
    # NOTE(review): the name has no ``test_`` prefix, so default unittest
    # discovery would presumably skip it -- confirm whether that is intended.
    self.assertRaises(ValueError, multiplicative_replacement, [0, 1, 2], delta=1)
def loadAbundance(filename, compositionNorm=True, truncate=True):
    """Load OTU counts file (phylum, genus or species level)
    with OTUs along the rows and samples along the columns.

    Parameters
    ----------
    filename : str
        Excel file from QIIME pipeline.
        Contains OTUs along the rows and samples along the columns,
        with a few header rows.
    compositionNorm : bool
        Add delta count to zeros and normalize each sample by the
        total number of reads. (uses skbio.stats.composition.multiplicative_replacement)
    truncate : bool
        Discard taxa with less than 0.5% of total reads.
        Discard taxa that are not present in 25% of samples.

    Returns
    -------
    df : pd.DataFrame [index: samples, columns: OTUs]
    cols : list
        Column labels of df, excluding 'sid'."""
    def _cleanCountDf(df):
        """Drop extra columns/headers and transpose so that
        samples are along rows and OTUs along columns.

        Returns
        -------
        outDf : pd.DataFrame [index: samples, columns: OTUs]"""
        df = df.drop(['tax_id', 'rank'], axis=1)
        df = df.dropna(subset=['tax_name'], axis=0)
        # rename() is the label-mapping API; rename_axis() with a dict
        # mapper is rejected by current pandas.
        df = df.rename(columns={'tax_name': 'OTU'})
        df = df.set_index('OTU')
        df = df.drop(['specimen'], axis=0)
        df = df.T
        df = df.dropna(subset=['label'], axis=0)
        df['sid'] = df.label.str.replace('Sample-', 'S')
        df = df.set_index('sid')
        df = df.drop('label', axis=1)
        df = df.astype(float)
        return df

    def _discardLow(df, thresh=0.005):
        """Discard taxa/columns with less than 0.5% of reads"""
        totReads = df.values.sum()
        keepInd1 = (df.sum(axis=0) / totReads) > thresh
        # Also discard taxa that are not present in 25% of samples
        keepInd2 = (df > 0).sum(axis=0) / df.shape[0] > 0.25
        return df.loc[:, keepInd1 & keepInd2]

    df = pd.read_excel(filename)
    df = _cleanCountDf(df)

    if truncate:
        df = _discardLow(df)

    if compositionNorm:
        values = composition.multiplicative_replacement(df.values)
        df = pd.DataFrame(values, columns=df.columns, index=df.index)

    cols = [c for c in df.columns if c not in ['sid']]

    print('Abundance data: %s samples, %s taxa' % (df.shape[0], len(cols)))
    return df, cols
def clr_on_subset(df_subset):
    """Pass ``df_subset`` through ``drop_zero_rows``, zero replacement and clr."""
    kept_rows = drop_zero_rows(df_subset)
    zero_free = multiplicative_replacement(kept_rows)
    return clr(zero_free)
#data_corrected = pycombat(df_norm_prot,batch)
data_corrected = pycombat(df_norm.fillna(0), batch[0])

##################### THiNK ABOUT THIS... maybe aitchison before ComBat?
########################################
# Aitchison multiplicative_replacement #
########################################
# Interactive sanity checks; the results are not stored.
data_corrected.sum()
df_int.sum()

df_aitchison = multiplicative_replacement(df_int)
# Wrap the zero-replaced values; wrapping df_int again would silently
# discard the replacement computed on the previous line.
df_aitchison = pd.DataFrame(df_aitchison, index=df_int.index, columns=midx)


def aitchison_transform(df):
    """
    Aitchison transformation on df.

    df should consist of all samples tagged together in one channel
    (i.e. A549_S_rep1 etc.).  The output of ``multiplicative_replacement``
    is returned as a DataFrame with the index and columns of ``df``.
    """
    df_aitchison = multiplicative_replacement(df)
    #df_aitchison = closure(df)
    df_idx = df.index
    df_col = df.columns
    df_aitchison = pd.DataFrame(df_aitchison, index=df_idx, columns=df_col)
    return df_aitchison
labs = f.read().split("\t")
# Remove new-line characters that have numbers after them
regex = re.compile(r'\n.*')
labels = [regex.sub("", e) for e in labs]
# Remove first element "x"
labels.pop(0)

# Per-sample totals; also useful to check that this is not the rarefied
# ASV table
sample_counts = unscaled_tab.sum(axis=1)
# Perform total sum scaling normalization (TSS)
scaled = unscaled_tab.div(sample_counts, axis=0)
# scaled.sum(axis=1)  # check
# Substitute zeros with small pseudocounts so that logs can be taken
zeros_scaled = comp.multiplicative_replacement(scaled)  # numpy.ndarray
# Isometric log-ratio transform
ilr_transformed = comp.ilr(zeros_scaled)
# Convert ndarray back to dataframe. The ilr output has one column fewer
# than the input, so the original column labels cannot be reused; the
# coordinates get generated names instead.
ilr_columns = ["ilr_%d" % (i + 1) for i in range(ilr_transformed.shape[1])]
df_ilr_transformed = pd.DataFrame(ilr_transformed, index=scaled.index, columns=ilr_columns)

########################################################################################################
# Decision tree methods tended to perform well
# HFE OTU feature reduction method brought a substantial performance improvement for nearly all methods
# After feature reduction most methods performed similarly so need to do that
########################################################################################################
microbe_iv['group'] = microbe_iv['group'].map(catdict)
metabolite_iv['group'] = metabolite_iv['group'].map(catdict)

# highlight features with p-value <= 0.001; everything else goes to 'None'
max_pval = 0.001
microbe_iv.loc[microbe_iv.pval > max_pval, 'group'] = 'None'
print('Number of significant microbes: %d' %
      len(microbe_iv[microbe_iv['group'] != 'None']))
metabolite_iv.loc[metabolite_iv.pval > max_pval, 'group'] = 'None'
print('Number of significant metabolites: %d' %
      len(metabolite_iv[metabolite_iv['group'] != 'None']))


def standardize(A):
    """Subtract the column means of ``A`` and divide by the column standard deviations."""
    return (A - np.mean(A, axis=0)) / np.std(A, axis=0)


# Both tables get the same preprocessing before the PLS-SVD fit:
# zero replacement, centralize, clr.
plssvd = PLSSVD(n_components=3)
plssvd.fit(X=clr(centralize(multiplicative_replacement(microbes))),
           Y=clr(centralize(multiplicative_replacement(metabolites))))

pls_microbes = pd.DataFrame(standardize(plssvd.x_weights_),
                            columns=['PCA1', 'PCA2', 'PCA3'],
                            index=microbes.columns)
pls_metabolites = pd.DataFrame(standardize(plssvd.y_weights_),
                               columns=['PCA1', 'PCA2', 'PCA3'],
                               index=metabolites.columns)
shortest = dijkstra(dm.values)
shortest = pd.DataFrame(shortest, columns=dm.index, index=dm.columns)
# reindex() replaces reindex_axis(), which newer pandas no longer provides
shortest = shortest.reindex(sorted(otu_table.columns), axis=0)
shortest = shortest.reindex(sorted(otu_table.columns), axis=1)
# otu_table = table.T
otu_table = otu_table.reindex(sorted(otu_table.columns), axis=1)

# Uses an idea similar to simrank
graph_dm = (otu_table > 0).dot(cosine).dot((otu_table > 0).T)
# sep is passed by keyword; relying on its position is not portable
# across pandas versions
graph_dm.to_csv('../results/simrank.txt', sep='\t')

# Uses Aitchison distance
# samples = ['CF31_A', u'CF31_B', u'CF141_A', u'CF141_B', u'Tuni', u'Bry']
dm = cosine.values
# NOTE(review): dm may share memory with `cosine`, in which case this also
# zeroes the infinities in `cosine` itself -- confirm that is acceptable.
dm[dm == np.inf] = 0
mat = otu_table.values
mat = multiplicative_replacement(mat)
graph_dm = connected_dm(mat, dm)
graph_dm += graph_dm.T
samples = otu_table.index
graph_dm = pd.DataFrame(graph_dm, index=samples, columns=samples)
graph_dm.to_csv('../results/aitchison.txt', sep='\t')

# Read in graph_dm
graph_dm = pd.read_csv('../results/unconnected_aitchison.txt',
                       sep='\t', index_col=0)
# table = pd.read_table('../data/skinmap_chemiFrac_test.txt',
#                       sep='\t', index_col=0)
graph_dm.index = table.columns
graph_dm.columns = table.columns
# _dm = pw_distances('braycurtis', table.values, table.index.values)
shortest = pd.DataFrame(shortest, columns=dm.index, index=dm.columns)
# reindex() replaces reindex_axis(), which newer pandas no longer provides
shortest = shortest.reindex(sorted(otu_table.columns), axis=0)
shortest = shortest.reindex(sorted(otu_table.columns), axis=1)
# otu_table = table.T
otu_table = otu_table.reindex(sorted(otu_table.columns), axis=1)

# Uses an idea similar to simrank
graph_dm = (otu_table > 0).dot(cosine).dot((otu_table > 0).T)
# sep is passed by keyword; relying on its position is not portable
# across pandas versions
graph_dm.to_csv('../results/simrank.txt', sep='\t')

# Uses Aitchison distance
# samples = ['CF31_A', u'CF31_B', u'CF141_A', u'CF141_B', u'Tuni', u'Bry']
dm = cosine.values
# NOTE(review): dm may share memory with `cosine`, in which case this also
# zeroes the infinities in `cosine` itself -- confirm that is acceptable.
dm[dm == np.inf] = 0
mat = otu_table.values
mat = multiplicative_replacement(mat)
graph_dm = connected_dm(mat, dm)
graph_dm += graph_dm.T
samples = otu_table.index
graph_dm = pd.DataFrame(graph_dm, index=samples, columns=samples)
graph_dm.to_csv('../results/aitchison.txt', sep='\t')

# Read in graph_dm
graph_dm = pd.read_csv('../results/unconnected_aitchison.txt',
                       sep='\t', index_col=0)
# table = pd.read_table('../data/skinmap_chemiFrac_test.txt',
#                       sep='\t', index_col=0)
graph_dm.index = table.columns
graph_dm.columns = table.columns
# which is much faster than R package ancom.R::ANCOM
from ancomP.stats.ancom import ancom
import pandas as pd
import numpy as np
from skbio.stats.composition import multiplicative_replacement

p = 20
# Row/column labels do not depend on the file, so build them once.
ind = np.arange(1, p + 1, 1)
sam = np.arange(1, 101, 1)
for j in range(50):
    # The '+' between the prefix and str(p) was missing, which made the
    # whole script a syntax error.
    dir1 = 'H:/Tree/tree_base/p=' + str(p) + '/otu_table.' + str(j + 1) + '.txt'
    # Each line holds space-separated integer counts; the context manager
    # closes the file even if parsing fails.
    with open(dir1, 'r') as data:
        tmp = [[int(n) for n in line.strip().split(' ')] for line in data]
    dat = multiplicative_replacement(tmp)
    table = pd.DataFrame(dat, index=sam, columns=ind)
    grouping = pd.Series(sorted([0, 1] * 50), index=sam)
    results = ancom(table, grouping)  # default parameters
    resultsT = results.T
    # Appended, so one CSV accumulates the results of every table.
    resultsT.to_csv('H:/Tree/tree_base/p=' + str(p) + '/ANCOM.csv', mode='a', header=False)
def loadAbundance(filename, compositionNorm=True, truncate=True):
    """Load OTU counts file (phylum, genus or species level)
    with OTUs along the rows and samples along the columns.

    Parameters
    ----------
    filename : str
        Excel file from QIIME pipeline.
        Contains OTUs along the rows and samples along the columns,
        with a few header rows.
    compositionNorm : bool
        Add delta count to zeros and normalize each sample by the
        total number of reads. (uses skbio.stats.composition.multiplicative_replacement)
    truncate : bool
        Discard taxa with less than 0.5% of total reads.
        Discard taxa that are not present in 25% of samples.

    Returns
    -------
    df : pd.DataFrame [index: samples, columns: OTUs]
    cols : list
        Column labels of df, excluding 'sid'."""
    def _cleanCountDf(df):
        """Drop extra columns/headers and transpose so that
        samples are along rows and OTUs along columns.

        Returns
        -------
        outDf : pd.DataFrame [index: samples, columns: OTUs]"""
        df = df.drop(['tax_id', 'rank'], axis=1)
        df = df.dropna(subset=['tax_name'], axis=0)
        # rename() is the label-mapping API; rename_axis() with a dict
        # mapper is rejected by current pandas.
        df = df.rename(columns={'tax_name': 'OTU'})
        df = df.set_index('OTU')
        df = df.drop(['specimen'], axis=0)
        df = df.T
        df = df.dropna(subset=['label'], axis=0)
        df['sid'] = df.label.str.replace('Sample-', 'S')
        df = df.set_index('sid')
        df = df.drop('label', axis=1)
        df = df.astype(float)
        return df

    def _discardLow(df, thresh=0.005):
        """Discard taxa/columns with less than 0.5% of reads"""
        totReads = df.values.sum()
        keepInd1 = (df.sum(axis=0) / totReads) > thresh
        # Also discard taxa that are not present in 25% of samples
        keepInd2 = (df > 0).sum(axis=0) / df.shape[0] > 0.25
        return df.loc[:, keepInd1 & keepInd2]

    df = pd.read_excel(filename)
    df = _cleanCountDf(df)

    if truncate:
        df = _discardLow(df)

    if compositionNorm:
        values = composition.multiplicative_replacement(df.values)
        df = pd.DataFrame(values, columns=df.columns, index=df.index)

    cols = [c for c in df.columns if c not in ['sid']]

    print('Abundance data: %s samples, %s taxa' % (df.shape[0], len(cols)))
    return df, cols
def CLRPermTest(otuDf, labels, statfunc=_rhoStat, nperms=999, adjMethod='fdr_bh', seed=110820, binary=False):
    """Per-OTU permutation test on centered-log-ratio (CLR) transformed data.

    The table is rescaled, zeros are handled with multiplicative
    replacement, the CLR is taken, and ``statfunc`` is evaluated once on the
    real labels and ``nperms`` times on permuted labels.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Relative abundance [0-1] for all samples (rows) and OTUs (columns).
    labels : pd.Series
        Variable of interest. Must share index with otuDf.
    statfunc : function
        Takes a np.array [n x k] and the label values [n] and returns a
        1-D array of the statistic [k].
    nperms : int
        Number of iterations for the permutation test.
    adjMethod : string
        Passed to ``_pvalueAdjust`` for p-value multiplicity adjustment.
        If the value is None (or the string 'none', any case) no adjustment
        is made.
    seed : int
        Seed passed to ``np.random.seed`` before permuting.
    binary : bool
        If True the label values are cast to bool, otherwise to float.

    Returns
    -------
    qvalues : pd.Series [index: OTU]
        Q/P-values for each OTU computed.
    observed : pd.Series [index: OTU]
        Statistic computed on the unpermuted labels."""
    nSamples, nOTUs = otuDf.shape
    labelValues = labels.values.astype(bool if binary else float)

    # Make proportions
    # NOTE(review): DataFrame.sum() defaults to column sums, so each OTU
    # column is scaled here rather than each sample row -- confirm intent.
    otuDf = otuDf / otuDf.sum()

    # Zero replacement followed by the CLR, wrapped back into a DataFrame
    zeroFree = multiplicative_replacement(otuDf.values)
    otuCLR = pd.DataFrame(clr(zeroFree), index=otuDf.index, columns=otuDf.columns)

    obs = statfunc(otuCLR.values, labelValues)

    np.random.seed(seed)
    samples = np.zeros((nperms, nOTUs))
    for permi in range(nperms):
        shuffled = labelValues[np.random.permutation(nSamples)]
        samples[permi, :] = statfunc(otuCLR.values, shuffled)

    # Two-sided: compare absolute values, with the usual +1 correction
    nExtreme = (np.abs(samples) >= np.abs(obs[None, :])).sum(axis=0)
    pvalues = (nExtreme + 1) / (nperms + 1)

    skipAdjust = adjMethod is None or adjMethod.lower() == 'none'
    qvalues = pvalues if skipAdjust else _pvalueAdjust(pvalues, method=adjMethod)

    qvalues = pd.Series(qvalues, index=otuDf.columns)
    observed = pd.Series(obs, index=otuDf.columns)
    return qvalues, observed
def CLRPermTest(otuDf, labels, statfunc=_rhoStat, nperms=999, adjMethod='fdr_bh', seed=110820, binary=False):
    """Per-OTU permutation test on centered-log-ratio (CLR) transformed data.

    The table is rescaled, zeros are handled with multiplicative
    replacement, the CLR is taken, and ``statfunc`` is evaluated once on the
    real labels and ``nperms`` times on permuted labels.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Relative abundance [0-1] for all samples (rows) and OTUs (columns).
    labels : pd.Series
        Variable of interest. Must share index with otuDf.
    statfunc : function
        Takes a np.array [n x k] and the label values [n] and returns a
        1-D array of the statistic [k].
    nperms : int
        Number of iterations for the permutation test.
    adjMethod : string
        Passed to ``_pvalueAdjust`` for p-value multiplicity adjustment.
        If the value is None (or the string 'none', any case) no adjustment
        is made.
    seed : int
        Seed passed to ``np.random.seed`` before permuting.
    binary : bool
        If True the label values are cast to bool, otherwise to float.

    Returns
    -------
    qvalues : pd.Series [index: OTU]
        Q/P-values for each OTU computed.
    observed : pd.Series [index: OTU]
        Statistic computed on the unpermuted labels."""
    nSamples, nOTUs = otuDf.shape
    if binary:
        labelDtype = bool
    else:
        labelDtype = float
    labelValues = labels.values.astype(labelDtype)

    # Make proportions
    # NOTE(review): DataFrame.sum() defaults to column sums, so each OTU
    # column is scaled here rather than each sample row -- confirm intent.
    otuDf = otuDf / otuDf.sum()

    # Zero replacement, then CLR, then back into a labelled DataFrame
    otuCLR = pd.DataFrame(clr(multiplicative_replacement(otuDf.values)),
                          index=otuDf.index,
                          columns=otuDf.columns)

    obs = statfunc(otuCLR.values, labelValues)

    np.random.seed(seed)
    samples = np.zeros((nperms, nOTUs))
    for permi in range(nperms):
        permutedLabels = labelValues[np.random.permutation(nSamples)]
        samples[permi, :] = statfunc(otuCLR.values, permutedLabels)

    # Two-sided: compare absolute values, with the usual +1 correction
    atLeastAsExtreme = np.abs(samples) >= np.abs(obs[None, :])
    pvalues = (atLeastAsExtreme.sum(axis=0) + 1) / (nperms + 1)

    if adjMethod is None or adjMethod.lower() == 'none':
        qvalues = pvalues
    else:
        qvalues = _pvalueAdjust(pvalues, method=adjMethod)

    return (pd.Series(qvalues, index=otuDf.columns),
            pd.Series(obs, index=otuDf.columns))