def test_match_immutable(self): # tests to make sure that the original tables don't change. table = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]], index=['s1', 's2', 's3', 's4'], columns=['o1', 'o2', 'o3', 'o4']) metadata = pd.DataFrame([['a', 'control'], ['c', 'diseased'], ['b', 'control']], index=['s1', 's3', 's2'], columns=['Barcode', 'Treatment']) exp_table = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]], index=['s1', 's2', 's3', 's4'], columns=['o1', 'o2', 'o3', 'o4']) exp_metadata = pd.DataFrame([['a', 'control'], ['c', 'diseased'], ['b', 'control']], index=['s1', 's3', 's2'], columns=['Barcode', 'Treatment']) match(table, metadata) pdt.assert_frame_equal(table, exp_table) pdt.assert_frame_equal(metadata, exp_metadata)
def test_match_duplicate(self): table1 = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]], index=['s2', 's2', 's3', 's4'], columns=['o1', 'o2', 'o3', 'o4']) metadata1 = pd.DataFrame([['a', 'control'], ['b', 'control'], ['c', 'diseased'], ['d', 'diseased']], index=['s1', 's2', 's3', 's4'], columns=['Barcode', 'Treatment']) table2 = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]], index=['s1', 's2', 's3', 's4'], columns=['o1', 'o2', 'o3', 'o4']) metadata2 = pd.DataFrame([['a', 'control'], ['b', 'control'], ['c', 'diseased'], ['d', 'diseased']], index=['s1', 's1', 's3', 's4'], columns=['Barcode', 'Treatment']) with self.assertRaises(ValueError): match(table1, metadata1) with self.assertRaises(ValueError): match(table2, metadata2)
def test_biom_match_no_common_ids(self): table = Table( np.array([[0, 0, 1, 1], [2, 3, 4, 4], [5, 5, 3, 3]]).T, ['a', 'b', 'c', 'd'], ['y2', 'y3', 'y4']) md = pd.DataFrame({ 'x1': [1, 3, 2], 'x2': [1, 1, 0] }, columns=['s2', 's2', 's3']).T with self.assertRaises(ValueError): match(table, md)
def test_match_empty(self): table = pd.DataFrame( [[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]], index=['s1', 's2', 's3', 's4'], columns=['o1', 'o2', 'o3', 'o4']) metadata = pd.DataFrame([['a', 'control'], ['b', 'control'], ['c', 'diseased'], ['d', 'diseased']], index=['a1', 'a2', 'a3', 'a4'], columns=['Barcode', 'Treatment']) with self.assertRaises(ValueError): match(table, metadata)
def test_match_intersect(self): table = pd.DataFrame( [[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]], index=['s1', 's2', 's3', 's4'], columns=['o1', 'o2', 'o3', 'o4']) metadata = pd.DataFrame( [['a', 'control'], ['c', 'diseased'], ['b', 'control']], index=['s1', 's3', 's2'], columns=['Barcode', 'Treatment']) exp_table = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3]], index=['s1', 's2', 's3'], columns=['o1', 'o2', 'o3', 'o4']) exp_metadata = pd.DataFrame( [['a', 'control'], ['b', 'control'], ['c', 'diseased']], index=['s1', 's2', 's3'], columns=['Barcode', 'Treatment']) res_table, res_metadata = match(table, metadata) # sort for comparison, since the match function # scrambles the names due to hashing. res_table = res_table.sort_index() res_metadata = res_metadata.sort_index() pdt.assert_frame_equal(exp_table, res_table) pdt.assert_frame_equal(exp_metadata, res_metadata)
def test_match_scrambled(self):
    table = pd.DataFrame(
        [[0, 0, 1, 1],
         [2, 2, 4, 4],
         [5, 5, 3, 3],
         [0, 0, 0, 1]],
        index=['s1', 's2', 's3', 's4'],
        columns=['o1', 'o2', 'o3', 'o4'])
    metadata = pd.DataFrame([['a', 'control'],
                             ['c', 'diseased'],
                             ['b', 'control'],
                             ['d', 'diseased']],
                            index=['s1', 's3', 's2', 's4'],
                            columns=['Barcode', 'Treatment'])
    exp_table = table
    exp_metadata = pd.DataFrame([['a', 'control'],
                                 ['b', 'control'],
                                 ['c', 'diseased'],
                                 ['d', 'diseased']],
                                index=['s1', 's2', 's3', 's4'],
                                columns=['Barcode', 'Treatment'])
    res_table, res_metadata = match(table, metadata)
    # make sure that the metadata and table indices match
    pdt.assert_index_equal(res_table.index, res_metadata.index)
    res_table = res_table.sort_index()
    exp_table = exp_table.sort_index()
    res_metadata = res_metadata.sort_index()
    exp_metadata = exp_metadata.sort_index()
    pdt.assert_frame_equal(exp_table, res_table)
    pdt.assert_frame_equal(exp_metadata, res_metadata)
def test_match(self):
    table = pd.DataFrame([[0, 0, 1, 1],
                          [2, 2, 4, 4],
                          [5, 5, 3, 3],
                          [0, 0, 0, 1]],
                         index=['s1', 's2', 's3', 's4'],
                         columns=['o1', 'o2', 'o3', 'o4'])
    metadata = pd.DataFrame([['a', 'control'],
                             ['b', 'control'],
                             ['c', 'diseased'],
                             ['d', 'diseased']],
                            index=['s1', 's2', 's3', 's4'],
                            columns=['Barcode', 'Treatment'])
    exp_table, exp_metadata = table, metadata
    res_table, res_metadata = match(table, metadata)
    # make sure that the metadata and table indices match
    pdt.assert_index_equal(res_table.index, res_metadata.index)
    res_table = res_table.sort_index()
    exp_table = exp_table.sort_index()
    res_metadata = res_metadata.sort_index()
    exp_metadata = exp_metadata.sort_index()
    pdt.assert_frame_equal(exp_table, res_table)
    pdt.assert_frame_equal(exp_metadata, res_metadata)
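# A minimal sketch of the DataFrame behaviour the tests above exercise,
# assuming that gneiss.util.match aligns on sample ids: intersect the two
# indices, reindex both frames to the common ids, and raise on duplicate ids
# or an empty intersection. Illustration only, not the gneiss implementation.
import pandas as pd


def match_sketch(table, metadata):
    if table.index.has_duplicates or metadata.index.has_duplicates:
        raise ValueError('Duplicate sample ids detected.')
    common = table.index.intersection(metadata.index)
    if len(common) == 0:
        raise ValueError('No common sample ids between table and metadata.')
    return table.loc[common], metadata.loc[common]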
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
        Continuous vector of measurements corresponding to samples.
    weighted : bool
        Specifies if abundance or presence/absence information
        should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
        Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not weighted:
        # binarize counts to presence/absence
        table = (table > 0).astype(float)
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t
def test_biom_match(self):
    table = Table(
        np.array([[0, 0, 1, 1],
                  [2, 3, 4, 4],
                  [5, 5, 3, 3]]).T,
        ['a', 'b', 'c', 'd'],
        ['s2', 's3', 's4'])
    md = pd.DataFrame({'x1': [1, 3, 2],
                       'x2': [1, 1, 0]},
                      index=['s1', 's2', 's3'])
    exp_table = Table(
        np.array([[0, 0, 1, 1],
                  [2, 3, 4, 4]]).T,
        ['a', 'b', 'c', 'd'],
        ['s2', 's3'])
    exp_md = pd.DataFrame({'x1': [3, 2],
                           'x2': [1, 0]},
                          index=['s2', 's3'])
    res_table, res_md = match(table, md)
    exp_df = pd.DataFrame(exp_table.to_dataframe())
    res_df = pd.DataFrame(res_table.to_dataframe())
    exp_df = exp_df.reindex(sorted(exp_df.columns), axis=1)
    res_df = res_df.reindex(sorted(res_df.columns), axis=1)
    pdt.assert_frame_equal(exp_df, res_df)
    exp_md = exp_md.reindex(sorted(exp_md.index), axis=0)
    res_md = res_md.reindex(sorted(res_md.index), axis=0)
    pdt.assert_frame_equal(res_md, exp_md)
def gradient_linkage(X, y, method='average'):
    r""" Principal balance analysis using hierarchical clustering
    on a known gradient.

    The hierarchy is built based on the values of the samples located
    along a gradient. Given a feature :math:`x`, the mean gradient value
    that :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    where :math:`N` is the number of samples, :math:`x_i` is the proportion
    of feature :math:`x` in sample :math:`i`, and :math:`g_i` is the
    gradient value at sample :math:`i`.

    The distance between two features :math:`x` and :math:`y` can be
    defined as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method. (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    See Also
    --------
    mean_niche_estimator
    """
    _X, _y = match(X, y)
    mean_X = mean_niche_estimator(_X, gradient=_y)
    dm = DistanceMatrix.from_iterable(mean_X, euclidean)
    lm = linkage(dm.condensed_form(), method)
    return TreeNode.from_linkage_matrix(lm, X.columns)
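# Worked example of the niche estimate f(g, x) described in the docstring
# above: the gradient values are averaged, weighted by the feature's relative
# abundance in each sample. Toy numbers only; this is not gneiss's
# mean_niche_estimator itself.
import numpy as np

g = np.array([1.0, 2.0, 3.0, 4.0])   # gradient value of each sample
x = np.array([0.0, 2.0, 6.0, 2.0])   # counts of one feature in each sample
f_gx = np.sum(g * x / x.sum())       # weighted mean position along the gradient
print(f_gx)                          # 3.0, so the feature peaks around g = 3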
def niche_sort(table, gradient, niche_estimator=mean_niche_estimator): """ Sort the table according to estimated niches. Sorts the table by samples along the gradient and otus by their estimated niche along the gradient. Parameters ---------- table : pd.DataFrame Contingency table where samples are rows and features (i.e. OTUs) are columns. gradient : pd.Series Vector of numerical gradient values. niche_estimator : function, optional A function that takes in two pandas series and returns an ordered object. The ability for the object to be ordered is critical, since this will allow the table to be sorted according to this ordering. By default, `mean_niche_estimator` will be used. Returns ------- pd.DataFrame : Sorted table according to the gradient of the samples, and the niches of the organisms along that gradient. Raises ------ ValueError : Raised if `niche_estimator` is not a function. """ if not callable(niche_estimator): raise ValueError("`niche_estimator` is not a function.") table, gradient = match(table, gradient) niche_estimator = partial(niche_estimator, gradient=gradient) # normalizes feature abundances to sum to 1, for each sample. # (i.e. scales values in each row to sum to 1). normtable = table.apply(lambda x: x/x.sum(), axis=1) # calculates estimated niche for each feature est_niche = normtable.apply(niche_estimator, axis=0) gradient = gradient.sort_values() est_niche = est_niche.sort_values() table = table.reindex(index=gradient.index, columns=est_niche.index) return table
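# Small usage sketch for niche_sort, assuming the helpers above are
# importable; samples come back ordered by the gradient and features by their
# estimated niche along it.
import pandas as pd

table = pd.DataFrame([[4, 1, 0],
                      [2, 2, 1],
                      [0, 1, 5]],
                     index=['s1', 's2', 's3'],
                     columns=['o1', 'o2', 'o3'])
gradient = pd.Series([1.0, 2.0, 3.0], index=['s1', 's2', 's3'])
sorted_table = niche_sort(table, gradient)
print(sorted_table.columns)  # features ordered by their niche along the gradient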
def _intersect_of_table_metadata_tree(table, metadata, tree):
    """ Matches tips, features and samples between the table, metadata
    and tree.

    This function returns the features and samples that are contained in
    all three objects.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object. Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.

    Returns
    -------
    pd.DataFrame
        Subset of `table` with common row names as `metadata`
        and common columns as `tree.tips()`
    pd.DataFrame
        Subset of `metadata` with common row names as `table`
    skbio.TreeNode
        Subtree of `tree` with common tips as `table`
    """
    if np.any(table <= 0):
        raise ValueError('Cannot handle zeros or negative values in `table`. '
                         'Use pseudocounts or ``multiplicative_replacement``.')
    _table, _metadata = match(table, metadata)
    _table, _tree = match_tips(_table, tree)
    non_tips_no_name = [(n.name is None) for n in _tree.levelorder()
                        if not n.is_tip()]
    if len(non_tips_no_name) == 0:
        raise ValueError('There are no internal nodes in `tree` after '
                         'intersection with `table`.')
    if len(_table.index) == 0:
        raise ValueError('There are no samples in `table` after '
                         'intersection with `metadata`.')
    if any(non_tips_no_name):
        _tree = rename_internal_nodes(_tree)
    return _table, _metadata, _tree
def test_biom_match_intersect(self): table = Table( np.array([[0, 0, 1, 1], [2, 3, 4, 4], [5, 5, 3, 3]]).T, ['a', 'b', 'c', 'd'], ['s1', 's2', 'y4']) md = pd.DataFrame([[0, 1], [1, 0], [1, 1]], index=['s2', 's1', 's3'], columns=['x1', 'x2']) exp_table = Table( np.array([[0, 0, 1, 1], [2, 3, 4, 4]]).T, ['a', 'b', 'c', 'd'], ['s1', 's2']) exp_md = pd.DataFrame([[1, 0], [0, 1]], columns=['x1', 'x2'], index=['s1', 's2']) res_table, res_md = match(table, md) pdt.assert_frame_equal(res_md, exp_md) exp_df = pd.DataFrame(exp_table.to_dataframe()) res_df = pd.DataFrame(res_table.to_dataframe()) pdt.assert_frame_equal(res_df, exp_df)
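# Sketch of the biom-side behaviour the tests above rely on: keep only the
# sample ids shared by the Table and the metadata, in a common order. This is
# an illustration of the idea, not gneiss's match() itself.
import pandas as pd
from biom import Table


def biom_match_sketch(table: Table, metadata: pd.DataFrame):
    common = set(table.ids(axis='sample')) & set(metadata.index)
    if not common:
        raise ValueError('No common sample ids between table and metadata.')
    out_table = table.filter(common, axis='sample', inplace=False)
    out_md = metadata.loc[list(out_table.ids(axis='sample'))]
    return out_table, out_md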
def gradient_clustering(table: pd.DataFrame, gradient: NumericMetadataColumn, ignore_missing_samples: bool = False, weighted: bool = True) -> skbio.TreeNode: """ Builds a tree for features based on a gradient. Parameters ---------- table : pd.DataFrame Contingency table where rows are samples and columns are features. gradient : qiime2.NumericMetadataColumn Continuous vector of measurements corresponding to samples. ignore_missing_samples: bool Whether to except or ignore when there are samples present in the table that are not present in the gradient metadata. weighted : bool Specifies if abundance or presence/absence information should be used to perform the clustering. Returns ------- skbio.TreeNode Represents the partitioning of features with respect to the gradient. """ c = gradient.to_series() if not ignore_missing_samples: difference = set(table.index) - set(c.index) if difference: raise KeyError("There are samples present in the table not " "present in the gradient metadata column. Override " "this error by using the `ignore_missing_samples` " "argument. Offending samples: %r" % ', '.join(sorted([str(i) for i in difference]))) if not weighted: table = (table > 0).astype(float) table, c = match(table, c) t = gradient_linkage(table, c, method='average') mean_g = mean_niche_estimator(table, c) mean_g = pd.Series(mean_g, index=table.columns) mean_g = mean_g.sort_values() t = gradient_sort(t, mean_g) return t
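# Example of driving gradient_clustering from plain pandas objects, assuming
# QIIME 2 is installed and the helpers above are importable; the gradient is
# wrapped in a qiime2 Metadata column to satisfy the NumericMetadataColumn
# annotation.
import pandas as pd
import qiime2

table = pd.DataFrame([[4, 1, 0],
                      [2, 2, 1],
                      [0, 1, 5]],
                     index=['s1', 's2', 's3'],
                     columns=['o1', 'o2', 'o3'])
md = pd.DataFrame({'depth': [1.0, 2.0, 3.0]},
                  index=pd.Index(['s1', 's2', 's3'], name='id'))
gradient = qiime2.Metadata(md).get_column('depth')
tree = gradient_clustering(table, gradient)
print(tree.ascii_art())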
# Get OTU to taxa match
taxonomy = table.metadata_to_dataframe('observation')
taxonomy.columns = ['kingdom', 'phylum', 'class', 'order',
                    'family', 'genus', 'species']
taxonomy['taxonomy'] = taxonomy[taxonomy.columns].apply(
    lambda x: ';'.join(x), axis=1)

# import metadata mapping file
map_file = 'cluster_models/keyboard.txt'
mappingdf = pd.read_table(map_file, index_col=0, low_memory=False)
mappingdf = mappingdf.replace(np.nan, 'Unknown', regex=True)
mappingdf.index = list(map(str, mappingdf.index))
mappingdf = mappingdf.astype(str)
mappingdf = mappingdf[~mappingdf.index.duplicated(keep='first')]

# match the tables
otutabledf, mappingdf = match(
    otutabledf,
    mappingdf[mappingdf['host_subject_id'].isin(['M2', 'M3', 'M9'])])
otutabledf = otutabledf.T[otutabledf.sum() > 0].T
otutabledf = otutabledf[otutabledf.T.sum() > 0]
otutabledf.columns = [str(x) for x in otutabledf.columns]
sorting_map = {'M9': 2, 'M2': 3, 'M3': 1}
mappingdf['host_num'] = [int(sorting_map[x])
                         for x in mappingdf['host_subject_id']]
mappingdf = mappingdf.apply(pd.to_numeric, errors='ignore')

# sort by niche
observed_table = niche_sort(otutabledf, mappingdf['host_num'])
mappingdf = mappingdf.T[observed_table.index].T
otutabledf = observed_table.copy()
def convert_biom_to_pandas(table): otu_table = pd.DataFrame(np.array(table.matrix_data.todense()).T, index=table.ids(axis='sample'), columns=table.ids(axis='observation')) return otu_table table = load_table('../data/dibd.biom') otu_table = convert_biom_to_pandas(table) mapping = pd.read_table("../data/dibd.map.txt", sep='\t', header=0, index_col=0) mapping = mapping.loc[mapping['disease_stat'].isin(['IBD', 'healthy'])] mapping, otu_table = match(mapping, otu_table) labels = np.array((mapping['disease_stat'] == 'IBD').astype(int)) dat = np.transpose(np.array(otu_table)) # normalization sample_reads = np.sum(dat, axis=0) # colSum: total reads in each sample norm_length = 10000 dat_norm = dat / sample_reads * norm_length # one group of the data same = dat_norm[:, labels == 0] def filtering(data, filterLev): otu_sum = np.sum(data, axis=1)
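# The normalization above rescales every sample to the same sequencing depth
# (norm_length = 10000) so libraries of different sizes are comparable; a
# compact restatement of that step with toy counts:
import numpy as np

counts = np.array([[10, 30, 60],    # rows are features,
                   [5, 15, 30]])    # columns are samples
depths = counts.sum(axis=0)         # total reads per sample
norm = counts / depths * 10000      # every column now sums to 10,000
print(norm.sum(axis=0))             # [10000. 10000. 10000.]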
def dendrogram_heatmap(output_dir: str, table: pd.DataFrame, tree: TreeNode, metadata: qiime2.CategoricalMetadataColumn, pseudocount: float = 0.5, ndim: int = 10, method: str = 'clr', color_map: str = 'viridis'): table, tree = match_tips(add_pseudocount(table, pseudocount), tree) nodes = [n.name for n in tree.levelorder() if not n.is_tip()] nlen = min(ndim, len(nodes)) numerator_color, denominator_color = '#fb9a99', '#e31a1c' highlights = pd.DataFrame([[numerator_color, denominator_color]] * nlen, index=nodes[:nlen]) if method == 'clr': mat = pd.DataFrame(clr(centralize(table)), index=table.index, columns=table.columns) elif method == 'log': mat = pd.DataFrame(np.log(table), index=table.index, columns=table.columns) c = metadata.to_series() table, c = match(table, c) # TODO: There are a few hard-coded constants here # will need to have some adaptive defaults set in the future fig = heatmap(mat, tree, c, highlights, cmap=color_map, highlight_width=0.01, figsize=(12, 8)) fig.savefig(os.path.join(output_dir, 'heatmap.svg')) fig.savefig(os.path.join(output_dir, 'heatmap.pdf')) css = r""" .square { float: left; width: 100px; height: 20px; margin: 5px; border: 1px solid rgba(0, 0, 0, .2); } .numerator { background: %s; } .denominator { background: %s; } """ % (numerator_color, denominator_color) index_fp = os.path.join(output_dir, 'index.html') with open(index_fp, 'w') as index_f: index_f.write('<html><body>\n') index_f.write('<h1>Dendrogram heatmap</h1>\n') index_f.write('<img src="heatmap.svg" alt="heatmap">') index_f.write('<a href="heatmap.pdf">') index_f.write('Download as PDF</a><br>\n') index_f.write('<style>%s</style>' % css) index_f.write('<div class="square numerator">' 'Numerator<br/></div>') index_f.write('<div class="square denominator">' 'Denominator<br/></div>') index_f.write('</body></html>\n')
def balance_taxonomy(output_dir: str, table: pd.DataFrame, tree: TreeNode, taxonomy: pd.DataFrame, balance_name: str, pseudocount: float = 0.5, taxa_level: int = 0, n_features: int = 10, threshold: float = None, metadata: qiime2.MetadataColumn = None) -> None: if threshold is not None and isinstance(metadata, qiime2.CategoricalMetadataColumn): raise ValueError('Categorical metadata column detected. Only specify ' 'a threshold when using a numerical metadata column.') # make sure that the table and tree match up table, tree = match_tips(add_pseudocount(table, pseudocount), tree) # parse out headers for taxonomy taxa_data = list(taxonomy['Taxon'].apply(lambda x: x.split(';')).values) taxa_df = pd.DataFrame(taxa_data, index=taxonomy.index) # fill in NAs def f(x): y = np.array(list(map(lambda k: k is not None, x))) i = max(0, np.where(y)[0][-1]) x[np.logical_not(y)] = [x[i]] * np.sum(np.logical_not(y)) return x taxa_df = taxa_df.apply(f, axis=1) num_clade = tree.find(balance_name).children[NUMERATOR] denom_clade = tree.find(balance_name).children[DENOMINATOR] if num_clade.is_tip(): num_features = pd.DataFrame( {num_clade.name: taxa_df.loc[num_clade.name]} ).T r = 1 else: num_features = taxa_df.loc[num_clade.subset()] r = len(list(num_clade.tips())) if denom_clade.is_tip(): denom_features = pd.DataFrame( {denom_clade.name: taxa_df.loc[denom_clade.name]} ).T s = 1 else: denom_features = taxa_df.loc[denom_clade.subset()] s = len(list(denom_clade.tips())) b = (np.log(table.loc[:, num_features.index]).mean(axis=1) - np.log(table.loc[:, denom_features.index]).mean(axis=1)) b = b * np.sqrt(r * s / (r + s)) balances = pd.DataFrame(b, index=table.index, columns=[balance_name]) # the actual colors for the numerator and denominator num_color = sns.color_palette("Paired")[0] denom_color = sns.color_palette("Paired")[1] fig, (ax_num, ax_denom) = plt.subplots(2) balance_barplots(tree, balance_name, taxa_level, taxa_df, denom_color=denom_color, num_color=num_color, axes=(ax_num, ax_denom)) ax_num.set_title( r'$%s_{numerator} \; taxa \; (%d \; taxa)$' % ( balance_name, len(num_features))) ax_denom.set_title( r'$%s_{denominator} \; taxa \; (%d \; taxa)$' % ( balance_name, len(denom_features))) ax_denom.set_xlabel('Number of unique taxa') plt.tight_layout() fig.savefig(os.path.join(output_dir, 'barplots.svg')) fig.savefig(os.path.join(output_dir, 'barplots.pdf')) dcat = None multiple_cats = False if metadata is not None: fig2, ax = plt.subplots() c = metadata.to_series() data, c = match(balances, c) data[c.name] = c y = data[balance_name] # check if continuous if isinstance(metadata, qiime2.NumericMetadataColumn): ax.scatter(c.values, y) ax.set_xlabel(c.name) if threshold is None: threshold = c.mean() dcat = c.apply( lambda x: '%s < %f' % (c.name, threshold) if x < threshold else '%s > %f' % (c.name, threshold) ) sample_palette = pd.Series(sns.color_palette("Set2", 2), index=dcat.value_counts().index) elif isinstance(metadata, qiime2.CategoricalMetadataColumn): sample_palette = pd.Series( sns.color_palette("Set2", len(c.value_counts())), index=c.value_counts().index) try: pd.to_numeric(metadata.to_series()) except ValueError: pass else: raise ValueError('Categorical metadata column ' f'{metadata.name!r} contains only numerical ' 'values. At least one value must be ' 'non-numerical.') balance_boxplot(balance_name, data, y=c.name, ax=ax, palette=sample_palette) if len(c.value_counts()) > 2: warnings.warn( 'More than 2 categories detected in categorical metadata ' 'column. 
Proportion plots will not be displayed', stacklevel=2) multiple_cats = True else: dcat = c else: # Some other type of MetadataColumn raise NotImplementedError() ylabel = (r"$%s = \ln \frac{%s_{numerator}}" "{%s_{denominator}}$") % (balance_name, balance_name, balance_name) ax.set_title(ylabel, rotation=0) ax.set_ylabel('log ratio') fig2.savefig(os.path.join(output_dir, 'balance_metadata.svg')) fig2.savefig(os.path.join(output_dir, 'balance_metadata.pdf')) if not multiple_cats: # Proportion plots # first sort by clr values and calculate average fold change ctable = pd.DataFrame(clr(centralize(table)), index=table.index, columns=table.columns) left_group = dcat.value_counts().index[0] right_group = dcat.value_counts().index[1] lidx, ridx = (dcat == left_group), (dcat == right_group) if b.loc[lidx].mean() > b.loc[ridx].mean(): # double check ordering and switch if necessary # careful - the left group is also commonly associated with # the denominator. left_group = dcat.value_counts().index[1] right_group = dcat.value_counts().index[0] lidx, ridx = (dcat == left_group), (dcat == right_group) # we are not performing a statistical test here # we're just trying to figure out a way to sort the data. num_fold_change = ctable.loc[:, num_features.index].apply( lambda x: ttest_ind(x[ridx], x[lidx])[0]) num_fold_change = num_fold_change.sort_values( ascending=False ) denom_fold_change = ctable.loc[:, denom_features.index].apply( lambda x: ttest_ind(x[ridx], x[lidx])[0]) denom_fold_change = denom_fold_change.sort_values( ascending=True ) metadata = pd.DataFrame({dcat.name: dcat}) top_num_features = num_fold_change.index[:n_features] top_denom_features = denom_fold_change.index[:n_features] fig3, (ax_denom, ax_num) = plt.subplots(1, 2) proportion_plot( table, metadata, category=metadata.columns[0], left_group=left_group, right_group=right_group, feature_metadata=taxa_df, label_col=taxa_level, num_features=top_num_features, denom_features=top_denom_features, # Note that the syntax is funky and counter # intuitive. 
This will need to be properly # fixed here # https://github.com/biocore/gneiss/issues/244 num_color=sample_palette.loc[right_group], denom_color=sample_palette.loc[left_group], axes=(ax_num, ax_denom)) # The below is overriding the default colors in the # numerator / denominator this will also need to be fixed in # https://github.com/biocore/gneiss/issues/244 max_ylim, min_ylim = ax_denom.get_ylim() num_h, denom_h = n_features, n_features space = (max_ylim - min_ylim) / (num_h + denom_h) ymid = (max_ylim - min_ylim) * num_h ymid = ymid / (num_h + denom_h) - 0.5 * space ax_denom.axhspan(min_ylim, ymid, facecolor=num_color, zorder=0) ax_denom.axhspan(ymid, max_ylim, facecolor=denom_color, zorder=0) ax_num.axhspan(min_ylim, ymid, facecolor=num_color, zorder=0) ax_num.axhspan(ymid, max_ylim, facecolor=denom_color, zorder=0) fig3.subplots_adjust( # the left side of the subplots of the figure left=0.3, # the right side of the subplots of the figure right=0.9, # the bottom of the subplots of the figure bottom=0.1, # the top of the subplots of the figure top=0.9, # the amount of width reserved for blank space # between subplots wspace=0, # the amount of height reserved for white space # between subplots hspace=0.2, ) fig3.savefig(os.path.join(output_dir, 'proportion_plot.svg')) fig3.savefig(os.path.join(output_dir, 'proportion_plot.pdf')) index_fp = os.path.join(output_dir, 'index.html') with open(index_fp, 'w') as index_f: index_f.write('<html><body>\n') if metadata is not None: index_f.write('<h1>Balance vs %s </h1>\n' % c.name) index_f.write(('<img src="balance_metadata.svg" ' 'alt="barplots">\n\n' '<a href="balance_metadata.pdf">' 'Download as PDF</a><br>\n')) if not multiple_cats: index_f.write('<h1>Proportion Plot </h1>\n') index_f.write(('<img src="proportion_plot.svg" ' 'alt="proportions">\n\n' '<a href="proportion_plot.pdf">' 'Download as PDF</a><br>\n')) index_f.write(('<h1>Balance Taxonomy</h1>\n' '<img src="barplots.svg" alt="barplots">\n\n' '<a href="barplots.pdf">' 'Download as PDF</a><br>\n' '<h3>Numerator taxa</h3>\n' '<a href="numerator.csv">\n' 'Download as CSV</a><br>\n' '<h3>Denominator taxa</h3>\n' '<a href="denominator.csv">\n' 'Download as CSV</a><br>\n')) num_features.to_csv(os.path.join(output_dir, 'numerator.csv'), header=True, index=True) denom_features.to_csv(os.path.join(output_dir, 'denominator.csv'), header=True, index=True) index_f.write('</body></html>\n')
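# Worked sketch of the balance computed in balance_taxonomy above: for r
# numerator features and s denominator features, the balance is the scaled
# difference of mean log abundances. Toy numbers only.
import numpy as np

num = np.array([4.0, 8.0])         # numerator features in one sample
denom = np.array([1.0, 2.0, 4.0])  # denominator features in the same sample
r, s = len(num), len(denom)
b = np.sqrt(r * s / (r + s)) * (np.log(num).mean() - np.log(denom).mean())
print(b)  # positive, so the numerator clade is relatively more abundant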
def main(_): opts = Options(save_path=FLAGS.save_path, train_biom=FLAGS.train_biom, test_biom=FLAGS.test_biom, train_metadata=FLAGS.train_metadata, test_metadata=FLAGS.test_metadata, formula=FLAGS.formula, learning_rate=FLAGS.learning_rate, clipping_size=FLAGS.clipping_size, beta_mean=FLAGS.beta_mean, beta_scale=FLAGS.beta_scale, gamma_mean=FLAGS.gamma_mean, gamma_scale=FLAGS.gamma_scale, epochs_to_train=FLAGS.epochs_to_train, num_neg_samples=FLAGS.num_neg_samples, batch_size=FLAGS.batch_size, min_sample_count=FLAGS.min_sample_count, min_feature_count=FLAGS.min_feature_count, statistics_interval=FLAGS.statistics_interval, summary_interval=FLAGS.summary_interval, checkpoint_interval=FLAGS.checkpoint_interval) # preprocessing train_table, train_metadata = opts.train_table, opts.train_metadata sample_filter = lambda val, id_, md: ( (id_ in train_metadata.index) and np.sum(val) > opts.min_sample_count) read_filter = lambda val, id_, md: np.sum(val) > opts.min_feature_count train_table = train_table.filter(sample_filter, axis='sample') train_table = train_table.filter(read_filter, axis='observation') train_metadata = dmatrix(opts.formula, train_metadata, return_type='dataframe') train_table, train_metadata = match(train_table, train_metadata) # hold out data preprocessing test_table, test_metadata = opts.test_table, opts.test_metadata metadata_filter = lambda val, id_, md: id_ in test_metadata.index obs_lookup = set(train_table.ids(axis='observation')) feat_filter = lambda val, id_, md: id_ in obs_lookup test_table = test_table.filter(metadata_filter, axis='sample') test_table = test_table.filter(feat_filter, axis='observation') test_metadata = dmatrix(opts.formula, test_metadata, return_type='dataframe') test_table, test_metadata = match(test_table, test_metadata) # pad extra columns with zeros, so that we can still make predictions extra_columns = list( set(train_metadata.columns) - set(test_metadata.columns)) df = pd.DataFrame( {C: np.zeros(test_metadata.shape[0]) for C in extra_columns}, index=test_metadata.index) test_metadata = pd.concat((test_metadata, df), axis=1) p = train_metadata.shape[1] # number of covariates G_data = train_metadata.values y_data = train_table.matrix_data.tocoo().T y_test = np.array(test_table.matrix_data.todense()).T N, D = y_data.shape save_path = opts.save_path learning_rate = opts.learning_rate batch_size = opts.batch_size gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale beta_mean, beta_scale = opts.beta_mean, opts.beta_scale num_neg = opts.num_neg_samples clipping_size = opts.clipping_size epoch = y_data.nnz // batch_size num_iter = int(opts.epochs_to_train * epoch) holdout_size = test_metadata.shape[0] checkpoint_interval = opts.checkpoint_interval # Model code with tf.Graph().as_default(), tf.Session() as session: Gpos_ph = tf.placeholder(tf.float32, [batch_size, p], name='G_pos') Gneg_ph = tf.placeholder(tf.float32, [num_neg, p], name='G_neg') G_holdout = tf.placeholder(tf.float32, [holdout_size, p], name='G_holdout') Y_holdout = tf.placeholder(tf.float32, [holdout_size, D], name='Y_holdout') Y_ph = tf.placeholder(tf.float32, [batch_size], name='Y_ph') pos_row = tf.placeholder(tf.int32, shape=[batch_size], name='pos_row') pos_col = tf.placeholder(tf.int32, shape=[batch_size], name='pos_col') neg_row = tf.placeholder(tf.int32, shape=[num_neg], name='neg_row') neg_col = tf.placeholder(tf.int32, shape=[num_neg], name='neg_col') neg_data = tf.zeros(shape=[num_neg], name='neg_data', dtype=tf.float32) total_zero = tf.constant(y_data.shape[0] * 
y_data.shape[1] - y_data.nnz, dtype=tf.float32) total_nonzero = tf.constant(y_data.nnz, dtype=tf.float32) qgamma = tf.Variable(tf.random_normal([1, D]), name='qgamma') # sample bias (for overdispersion) # theta = tf.Variable(tf.random_normal([N, 1]), name='theta') theta = tf.constant(np.log(train_table.sum(axis='sample')), dtype=tf.float32) qbeta = tf.Variable(tf.random_normal([p, D]), name='qB') # species bias gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean, scale=tf.ones([1, D]) * gamma_scale, name='gamma') # regression coefficents distribution beta = Normal(loc=tf.zeros([p, D]) + beta_mean, scale=tf.ones([p, D]) * beta_scale, name='B') V = tf.concat([qgamma, qbeta], axis=0) # add bias terms for samples Gpos = tf.concat([tf.ones([batch_size, 1]), Gpos_ph], axis=1) Gneg = tf.concat([tf.ones([num_neg, 1]), Gneg_ph], axis=1) # sparse matrix multiplication for positive samples pos_prime = tf.reduce_sum(tf.multiply( Gpos, tf.transpose(tf.gather(V, pos_col, axis=1))), axis=1) pos_phi = tf.reshape(tf.gather(theta, pos_row), shape=[batch_size ]) + pos_prime Y = Poisson(log_rate=pos_phi, name='Y') # sparse matrix multiplication for negative samples neg_prime = tf.reduce_sum(tf.multiply( Gneg, tf.transpose(tf.gather(V, neg_col, axis=1))), axis=1) neg_phi = tf.reshape(tf.gather(theta, neg_row), shape=[num_neg ]) + neg_prime neg_poisson = Poisson(log_rate=neg_phi, name='neg_counts') loss = -( tf.reduce_sum(gamma.log_prob(qgamma)) + \ tf.reduce_sum(beta.log_prob(qbeta)) + \ tf.reduce_sum(Y.log_prob(Y_ph)) * (total_nonzero / batch_size) + \ tf.reduce_sum(neg_poisson.log_prob(neg_data)) * (total_zero / num_neg) ) optimizer = tf.train.AdamOptimizer(learning_rate) gradients, variables = zip(*optimizer.compute_gradients(loss)) gradients, _ = tf.clip_by_global_norm(gradients, clipping_size) train = optimizer.apply_gradients(zip(gradients, variables)) with tf.name_scope('accuracy'): holdout_count = tf.reduce_sum(Y_holdout, axis=1) pred = tf.reshape( holdout_count, [-1, 1]) * tf.nn.softmax(tf.matmul(G_holdout, qbeta) + qgamma) mse = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_holdout))) tf.summary.scalar('mean_absolute_error', mse) tf.summary.scalar('loss', loss) tf.summary.histogram('qbeta', qbeta) tf.summary.histogram('qgamma', qgamma) tf.summary.histogram('theta', theta) merged = tf.summary.merge_all() tf.global_variables_initializer().run() writer = tf.summary.FileWriter(save_path, session.graph) run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() losses = np.array([0.] 
* num_iter) idx = np.arange(train_metadata.shape[0]) log_handle = open(os.path.join(save_path, 'run.log'), 'w') gen = subsampler(y_data, num_pos=batch_size, num_neg=num_neg) start_time = time.time() last_checkpoint_time = 0 saver = tf.train.Saver() for i in range(num_iter): batch_idx = np.random.choice(idx, size=batch_size) batch = next(gen) (positive_row, positive_col, positive_data, negative_row, negative_col) = batch feed_dict = { Y_ph: positive_data, Y_holdout: y_test.astype(np.float32), G_holdout: test_metadata.values.astype(np.float32), Gpos_ph: G_data[positive_row, :], Gneg_ph: G_data[negative_row, :], pos_row: positive_row, pos_col: positive_col, neg_row: negative_row, neg_col: negative_col } if i % 1000 == 0: _, summary, train_loss, grads = session.run( [train, merged, loss, gradients], feed_dict=feed_dict, options=run_options, run_metadata=run_metadata) elif i % 5000 == 0: _, summary, err, train_loss, grads = session.run( [train, mse, merged, loss, gradients], feed_dict=feed_dict) writer.add_summary(summary, i) else: _, summary, train_loss, grads = session.run( [train, merged, loss, gradients], feed_dict=feed_dict) writer.add_summary(summary, i) now = time.time() if now - last_checkpoint_time > checkpoint_interval: saver.save(session, os.path.join(opts.save_path, "model.ckpt"), global_step=i) last_checkpoint_time = now losses[i] = train_loss elapsed_time = time.time() - start_time print('Elapsed Time: %f seconds' % elapsed_time) # Cross validation pred_beta = qbeta.eval() pred_gamma = qgamma.eval() mse, mrc = cross_validation(test_metadata.values, pred_beta, pred_gamma, y_test) print("MSE: %f, MRC: %f" % (mse, mrc))
def preprocess(formula, train_table, train_metadata, test_table, test_metadata, min_sample_count=10, min_feature_count=10): """ Performs data preprocessing. Parameters ---------- formula : str Statistical formula specifying the design matrix of covariates in the study design. train_table : biom.Table Biom table containing the feature counts within the training dataset. train_metadata : pd.DataFrame Sample metadata table containing all of the measured covariates in the training dataset. test_table : biom.Table Biom table containing the feature counts within the holdout dataset. test_metadata : pd.DataFrame Sample metadata table containing all of the measured covariates in the holdout test dataset. min_sample_counts : int Minimum number of total counts within a sample to be kept. min_feature_counts : int Minimum number of total counts within a feature to be kept. Returns ------- train_table : biom.Table Biom table containing the feature counts within the training dataset. train_metadata : pd.DataFrame Sample metadata table containing all of the measured covariates in the training dataset. test_table : biom.Table Biom table containing the feature counts within the holdout dataset. test_metadata : pd.DataFrame Sample metadata table containing all of the measured covariates in the holdout test dataset. Notes ----- This assumes that the biom tables can fit into memory - will require some extra consideration when this is no longer the case. """ # preprocessing train_table, train_metadata = train_table, train_metadata sample_filter = lambda val, id_, md: ( (id_ in train_metadata.index) and np.sum(val) > min_sample_count) read_filter = lambda val, id_, md: np.sum(val) > min_feature_count train_table = train_table.filter(sample_filter, axis='sample') train_table = train_table.filter(read_filter, axis='observation') train_metadata = dmatrix(formula, train_metadata, return_type='dataframe') train_table, train_metadata = match(train_table, train_metadata) # hold out data preprocessing test_table, test_metadata = test_table, test_metadata metadata_filter = lambda val, id_, md: id_ in test_metadata.index obs_lookup = set(train_table.ids(axis='observation')) feat_filter = lambda val, id_, md: id_ in obs_lookup test_table = test_table.filter(metadata_filter, axis='sample') test_table = test_table.filter(feat_filter, axis='observation') test_metadata = dmatrix(formula, test_metadata, return_type='dataframe') test_table, test_metadata = match(test_table, test_metadata) # pad extra columns with zeros, so that we can still make predictions extra_columns = list( set(train_metadata.columns) - set(test_metadata.columns)) df = pd.DataFrame( {C: np.zeros(test_metadata.shape[0]) for C in extra_columns}, index=test_metadata.index) test_metadata = pd.concat((test_metadata, df), axis=1) return train_table, test_table, train_metadata, test_metadata
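# Sketch of calling preprocess with small in-memory objects, assuming biom and
# patsy are installed and that match() handles biom Tables as in the code
# above; the formula string is handed to patsy to build the design matrices.
import numpy as np
import pandas as pd
from biom import Table

train_table = Table(np.array([[20, 30], [40, 10], [0, 25]]),
                    ['o1', 'o2', 'o3'], ['s1', 's2'])
test_table = Table(np.array([[15, 5], [25, 35], [10, 0]]),
                   ['o1', 'o2', 'o3'], ['s3', 's4'])
train_md = pd.DataFrame({'ph': [6.8, 7.2]}, index=['s1', 's2'])
test_md = pd.DataFrame({'ph': [7.0, 6.5]}, index=['s3', 's4'])

train_table, test_table, train_md, test_md = preprocess(
    'ph', train_table, train_md, test_table, test_md,
    min_sample_count=10, min_feature_count=10)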
from skbio import TreeNode from gneiss.util import match from sklearn.cross_decomposition import PLSSVD from skbio.stats.composition import clr, centralize, multiplicative_replacement from biplot import make_biplot plt.rcParams['svg.fonttype'] = 'none' args = sys.argv[1:] mapping = pd.read_table(args[0], index_col=0, sep='\t') microbes = qiime2.Artifact.load(args[1]).view(pd.DataFrame) metabolites = qiime2.Artifact.load(args[2]).view(pd.DataFrame) # do this match thing twice to make sure they are all matched mapping, microbes = match(mapping, microbes) mapping, metabolites = match(mapping, metabolites) microbes, metabolites = match(microbes, metabolites) mapping, microbes = match(mapping, microbes) mapping, metabolites = match(mapping, metabolites) microbes, metabolites = match(microbes, metabolites) catdict = { i + 1: val for i, val in enumerate(sorted(mapping['category'].unique().tolist())) } n = mapping.shape[0] print('Number of samples: %d' % n) print('Number of microbes: %d' % microbes.shape[1])
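# The repeated pairwise match() calls above converge on the common sample set,
# but the same result can be had in one pass by intersecting the three indices
# directly; a sketch of that alternative:
common = mapping.index.intersection(microbes.index).intersection(metabolites.index)
mapping = mapping.loc[common]
microbes = microbes.loc[common]
metabolites = metabolites.loc[common]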
def balance_taxonomy(output_dir: str, balances: pd.DataFrame, tree: TreeNode, taxonomy: pd.DataFrame, balance_name: Str, taxa_level: Int = 0, metadata: MetadataCategory = None) -> None: # parse out headers for taxonomy taxa_data = list(taxonomy['Taxon'].apply(lambda x: x.split(';')).values) taxa_df = pd.DataFrame(taxa_data, index=taxonomy.index) # fill in NAs def f(x): y = np.array(list(map(lambda k: k is not None, x))) i = max(0, np.where(y)[0][-1]) x[np.logical_not(y)] = [x[i]] * np.sum(np.logical_not(y)) return x taxa_df = taxa_df.apply(f, axis=1) num_clade = tree.find(balance_name).children[NUMERATOR] denom_clade = tree.find(balance_name).children[DENOMINATOR] if num_clade.is_tip(): num_features = pd.DataFrame( {num_clade.name: taxa_df.loc[num_clade.name]} ).T else: num_features = taxa_df.loc[num_clade.subset()] if denom_clade.is_tip(): denom_features = pd.DataFrame( {denom_clade.name: taxa_df.loc[denom_clade.name]} ).T else: denom_features = taxa_df.loc[denom_clade.subset()] num_color, denom_color = '#4c72b0', '#4c72b0' fig, (ax_num, ax_denom) = plt.subplots(2) balance_barplots(tree, balance_name, taxa_level, taxa_df, denom_color=denom_color, num_color=num_color, axes=(ax_num, ax_denom)) ax_num.set_title( r'$%s_{numerator} \; taxa \; (%d \; taxa)$' % (balance_name, len(num_features))) ax_denom.set_title( r'$%s_{denominator} \; taxa \; (%d \; taxa)$' % (balance_name, len(denom_features))) ax_denom.set_xlabel('Number of unique taxa') plt.tight_layout() fig.savefig(os.path.join(output_dir, 'barplots.svg')) fig.savefig(os.path.join(output_dir, 'barplots.pdf')) if metadata is not None: fig2, ax = plt.subplots() c = metadata.to_series() data, c = match(balances, c) data[c.name] = c y = data[balance_name] # check if continuous try: c = c.astype(np.float64) ax.scatter(c.values, y) ax.set_xlabel(c.name) except: balance_boxplot(balance_name, data, y=c.name, ax=ax) ylabel = (r"$%s = \ln \frac{%s_{numerator}}" "{%s_{denominator}}$") % (balance_name, balance_name, balance_name) ax.set_title(ylabel, rotation=0) ax.set_ylabel('log ratio') fig2.savefig(os.path.join(output_dir, 'balance_metadata.svg')) fig2.savefig(os.path.join(output_dir, 'balance_metadata.pdf')) index_fp = os.path.join(output_dir, 'index.html') with open(index_fp, 'w') as index_f: index_f.write('<html><body>\n') if metadata is not None: index_f.write('<h1>Balance vs %s </h1>\n' % c.name) index_f.write(('<img src="balance_metadata.svg" ' 'alt="barplots">\n\n' '<a href="balance_metadata.pdf">' 'Download as PDF</a><br>\n')) index_f.write(('<h1>Balance Taxonomy</h1>\n' '<img src="barplots.svg" alt="barplots">\n\n' '<a href="barplots.pdf">' 'Download as PDF</a><br>\n' '<h3>Numerator taxa</h3>\n' '<a href="numerator.csv">\n' 'Download as CSV</a><br>\n' '<h3>Denominator taxa</h3>\n' '<a href="denominator.csv">\n' 'Download as CSV</a><br>\n')) num_features.to_csv(os.path.join(output_dir, 'numerator.csv'), header=True, index=True) denom_features.to_csv(os.path.join(output_dir, 'denominator.csv'), header=True, index=True) index_f.write('</body></html>\n')
def gradient_linkage(X, y, method='average'): r""" Hierarchical Clustering on known gradient. The hierarchy is built based on the values of the samples located along a gradient. Given a feature :math:`x`, the mean gradient values that :math:`x` was observed in is calculated by .. math:: f(g , x) = \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j} Where :math:`N` is the number of samples, :math:`x_i` is the proportion of feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value at sample `i`. The distance between two features :math:`x` and :math:`y` can be defined as .. math:: d(x, y) = (f(g, x) - f(g, y))^2 If :math:`d(x, y)` is very small, then :math:`x` and :math:`y` are expected to live in very similar positions across the gradient. A hierarchical clustering is then performed using :math:`d(x, y)` as the distance metric. This can be useful for constructing principal balances. Parameters ---------- X : pd.DataFrame Contingency table where the samples are rows and the features are columns. y : pd.Series Continuous vector representing some ordering of the samples in X. method : str Clustering method. (default='average') Returns ------- skbio.TreeNode Tree for constructing principal balances. See Also -------- mean_niche_estimator Examples -------- >>> import pandas as pd >>> from gneiss.cluster import gradient_linkage >>> table = pd.DataFrame([[1, 1, 0, 0, 0], ... [0, 1, 1, 0, 0], ... [0, 0, 1, 1, 0], ... [0, 0, 0, 1, 1]], ... columns=['s1', 's2', 's3', 's4', 's5'], ... index=['o1', 'o2', 'o3', 'o4']).T >>> gradient = pd.Series([1, 2, 3, 4, 5], ... index=['s1', 's2', 's3', 's4', 's5']) >>> tree = gradient_linkage(table, gradient) >>> print(tree.ascii_art()) /-o1 /y1------| | \-o2 -y0------| | /-o3 \y2------| \-o4 """ _X, _y = match(X, y) mean_X = mean_niche_estimator(_X, gradient=_y) t = rank_linkage(mean_X) return t