def test_input_types(self):
    """Counts given as a NumPy array or as nested lists must agree."""
    sample_ids = ['a', 'b']
    raw_counts = [[1, 5], [2, 3]]
    from_array = beta_diversity('euclidean', np.array(raw_counts),
                                ids=sample_ids)
    from_list = beta_diversity('euclidean', raw_counts, ids=sample_ids)
    self.assertEqual(from_array, from_list)
def test_euclidean(self):
    """Check Euclidean distances for the small and the large fixture table."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    small_dm = beta_diversity('euclidean', self.table1, self.sids1)
    self.assertEqual(small_dm.shape, (3, 3))
    expected_small = [
        ('A', 'A', 0.0),
        ('B', 'B', 0.0),
        ('C', 'C', 0.0),
        ('A', 'B', 2.23606798),
        ('B', 'A', 2.23606798),
        ('A', 'C', 4.12310563),
        ('C', 'A', 4.12310563),
        ('B', 'C', 2.82842712),
        ('C', 'B', 2.82842712),
    ]
    for row_id, col_id, distance in expected_small:
        npt.assert_almost_equal(small_dm[row_id, col_id], distance)

    large_dm = beta_diversity('euclidean', self.table2, self.sids2)
    expected_rows = [
        [0., 80.8455317, 84.0297566, 36.3042697, 86.0116271, 78.9176786],
        [80.8455317, 0., 71.0844568, 74.4714710, 69.3397433, 14.422205],
        [84.0297566, 71.0844568, 0., 77.2851861, 8.3066238, 60.7536007],
        [36.3042697, 74.4714710, 77.2851861, 0., 78.7908624, 70.7389567],
        [86.0116271, 69.3397433, 8.3066238, 78.7908624, 0., 58.4807660],
        [78.9176786, 14.422205, 60.7536007, 70.7389567, 58.4807660, 0.],
    ]
    expected_dm = DistanceMatrix(expected_rows, self.sids2)
    # compare every ordered pair of sample ids to six decimal places
    for row_id in self.sids2:
        for col_id in self.sids2:
            npt.assert_almost_equal(large_dm[row_id, col_id],
                                    expected_dm[row_id, col_id], 6)
def test_qualitative_bug_issue_1549(self):
    """Jaccard on raw counts must equal Jaccard on presence/absence data."""
    counts = np.array([[42, 0, 37, 99, 1],
                       [12, 1, 22, 88, 0],
                       [25, 3, 23, 86, 0],
                       [0, 0, 87, 12, 0]])
    presence_absence = counts > 0
    from_counts = beta_diversity('jaccard', counts)
    from_presence = beta_diversity('jaccard', presence_absence)
    self.assertEqual(from_counts, from_presence)
def test_alt_pairwise_func(self):
    """A user-supplied pairwise_func must be the one producing the result."""
    # confirm that pairwise_func is actually being used
    def not_a_real_pdist(counts, metric):
        return [[0.0, 42.0], [42.0, 0.0]]

    # the phylogenetic metrics, given by name and as a callable
    phylo_kwargs = dict(otu_ids=self.oids1, tree=self.tree1,
                        pairwise_func=not_a_real_pdist)
    for metric in ('unweighted_unifrac', 'weighted_unifrac',
                   unweighted_unifrac):
        observed = beta_diversity(metric, self.table1, **phylo_kwargs)
        expected = DistanceMatrix([[0.0, 42.0], [42.0, 0.0]])
        self.assertEqual(observed, expected)

    # a non-phylogenetic metric
    observed = beta_diversity("euclidean", self.table1,
                              pairwise_func=not_a_real_pdist)
    expected = DistanceMatrix([[0.0, 42.0], [42.0, 0.0]])
    self.assertEqual(observed, expected)
def test_euclidean(self):
    """Check Euclidean distances for the small and the large fixture table."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    small_dm = beta_diversity('euclidean', self.table1, self.sids1)
    self.assertEqual(small_dm.shape, (3, 3))
    expected_small = [
        ('A', 'A', 0.0),
        ('B', 'B', 0.0),
        ('C', 'C', 0.0),
        ('A', 'B', 2.23606798),
        ('B', 'A', 2.23606798),
        ('A', 'C', 4.12310563),
        ('C', 'A', 4.12310563),
        ('B', 'C', 2.82842712),
        ('C', 'B', 2.82842712),
    ]
    for row_id, col_id, distance in expected_small:
        npt.assert_almost_equal(small_dm[row_id, col_id], distance)

    large_dm = beta_diversity('euclidean', self.table2, self.sids2)
    expected_rows = [
        [0., 80.8455317, 84.0297566, 36.3042697, 86.0116271, 78.9176786],
        [80.8455317, 0., 71.0844568, 74.4714710, 69.3397433, 14.422205],
        [84.0297566, 71.0844568, 0., 77.2851861, 8.3066238, 60.7536007],
        [36.3042697, 74.4714710, 77.2851861, 0., 78.7908624, 70.7389567],
        [86.0116271, 69.3397433, 8.3066238, 78.7908624, 0., 58.4807660],
        [78.9176786, 14.422205, 60.7536007, 70.7389567, 58.4807660, 0.],
    ]
    expected_dm = DistanceMatrix(expected_rows, self.sids2)
    # compare every ordered pair of sample ids to six decimal places
    for row_id in self.sids2:
        for col_id in self.sids2:
            npt.assert_almost_equal(large_dm[row_id, col_id],
                                    expected_dm[row_id, col_id], 6)
def test_braycurtis(self):
    """Check Bray-Curtis distances for the small and the large fixture table."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    small_dm = beta_diversity('braycurtis', self.table1, self.sids1)
    self.assertEqual(small_dm.shape, (3, 3))
    expected_small = [
        ('A', 'A', 0.0),
        ('B', 'B', 0.0),
        ('C', 'C', 0.0),
        ('A', 'B', 0.27272727),
        ('B', 'A', 0.27272727),
        ('A', 'C', 0.71428571),
        ('C', 'A', 0.71428571),
        ('B', 'C', 0.66666667),
        ('C', 'B', 0.66666667),
    ]
    for row_id, col_id, distance in expected_small:
        npt.assert_almost_equal(small_dm[row_id, col_id], distance)

    large_dm = beta_diversity('braycurtis', self.table2, self.sids2)
    expected_rows = [
        [0., 0.78787879, 0.86666667, 0.30927835, 0.85714286, 0.81521739],
        [0.78787879, 0., 0.78142077, 0.86813187, 0.75, 0.1627907],
        [0.86666667, 0.78142077, 0., 0.87709497, 0.09392265, 0.71597633],
        [0.30927835, 0.86813187, 0.87709497, 0., 0.87777778, 0.89285714],
        [0.85714286, 0.75, 0.09392265, 0.87777778, 0., 0.68235294],
        [0.81521739, 0.1627907, 0.71597633, 0.89285714, 0.68235294, 0.],
    ]
    expected_dm = DistanceMatrix(expected_rows, self.sids2)
    # compare every ordered pair of sample ids to six decimal places
    for row_id in self.sids2:
        for col_id in self.sids2:
            npt.assert_almost_equal(large_dm[row_id, col_id],
                                    expected_dm[row_id, col_id], 6)
def test_braycurtis(self):
    """Check Bray-Curtis distances for the small and the large fixture table."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    small_dm = beta_diversity('braycurtis', self.table1, self.sids1)
    self.assertEqual(small_dm.shape, (3, 3))
    expected_small = [
        ('A', 'A', 0.0),
        ('B', 'B', 0.0),
        ('C', 'C', 0.0),
        ('A', 'B', 0.27272727),
        ('B', 'A', 0.27272727),
        ('A', 'C', 0.71428571),
        ('C', 'A', 0.71428571),
        ('B', 'C', 0.66666667),
        ('C', 'B', 0.66666667),
    ]
    for row_id, col_id, distance in expected_small:
        npt.assert_almost_equal(small_dm[row_id, col_id], distance)

    large_dm = beta_diversity('braycurtis', self.table2, self.sids2)
    expected_rows = [
        [0., 0.78787879, 0.86666667, 0.30927835, 0.85714286, 0.81521739],
        [0.78787879, 0., 0.78142077, 0.86813187, 0.75, 0.1627907],
        [0.86666667, 0.78142077, 0., 0.87709497, 0.09392265, 0.71597633],
        [0.30927835, 0.86813187, 0.87709497, 0., 0.87777778, 0.89285714],
        [0.85714286, 0.75, 0.09392265, 0.87777778, 0., 0.68235294],
        [0.81521739, 0.1627907, 0.71597633, 0.89285714, 0.68235294, 0.],
    ]
    expected_dm = DistanceMatrix(expected_rows, self.sids2)
    # compare every ordered pair of sample ids to six decimal places
    for row_id in self.sids2:
        for col_id in self.sids2:
            npt.assert_almost_equal(large_dm[row_id, col_id],
                                    expected_dm[row_id, col_id], 6)
def test_weighted_unifrac_normalized(self):
    """Normalized weighted UniFrac: metric by name and by callable agree."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    # expected values calculated by hand
    shared_kwargs = dict(otu_ids=self.oids1, tree=self.tree1,
                         normalized=True)
    by_name = beta_diversity('weighted_unifrac', self.table1, self.sids1,
                             **shared_kwargs)
    by_callable = beta_diversity(weighted_unifrac, self.table1, self.sids1,
                                 **shared_kwargs)
    self.assertEqual(by_name.shape, (3, 3))
    self.assertEqual(by_name, by_callable)

    expected_rows = [[0.0, 0.128834, 0.085714],
                     [0.128834, 0.0, 0.2142857],
                     [0.085714, 0.2142857, 0.0]]
    expected_dm = DistanceMatrix(expected_rows, ids=self.sids1)
    for row_id in self.sids1:
        for col_id in self.sids1:
            npt.assert_almost_equal(by_name[row_id, col_id],
                                    expected_dm[row_id, col_id], 6)
def test_scipy_kwargs(self):
    """Extra keyword arguments must reach the SciPy metric."""
    # confirm that p can be passed to SciPy's minkowski, and that it
    # gives a different result than not passing it (the off-diagonal
    # entries are not equal).
    default_dm = beta_diversity('minkowski', self.table1, self.sids1)
    custom_p_dm = beta_diversity('minkowski', self.table1, self.sids1,
                                 p=42.0)
    for row_id in self.sids1:
        for col_id in self.sids1:
            if row_id == col_id:
                # diagonal entries are not compared
                continue
            self.assertNotEqual(default_dm[row_id, col_id],
                                custom_p_dm[row_id, col_id])
def __dapply__(self, experiment):
    """Compute a beta-diversity distance matrix for ``experiment``.

    ``experiment.data_df`` is transposed before being handed to
    ``beta_diversity``; the original index is passed as ``otu_ids`` and the
    original columns label both axes of the result.

    Parameters
    ----------
    experiment
        Object exposing a pandas ``data_df`` attribute.

    Returns
    -------
    pandas.DataFrame
        Square frame of distances, indexed and labelled by the columns of
        ``experiment.data_df``.

    Raises
    ------
    TypeError
        Re-raised unchanged unless its message contains
        ``'takes no keyword arguments'``.
    """
    otu_ids = experiment.data_df.index
    df = experiment.data_df.transpose()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and exists in every pandas release.
    counts = df.values
    try:
        dm = beta_diversity(self.distance_metric, counts=counts,
                            otu_ids=otu_ids, **self.kwargs)
    except TypeError as e:
        if 'takes no keyword arguments' not in str(e):
            # unrelated failure: propagate with its original traceback
            raise
        # the metric rejected keyword arguments, so retry without otu_ids
        dm = beta_diversity(self.distance_metric, counts=counts,
                            **self.kwargs)
    return pd.DataFrame(dm.data, index=df.index, columns=df.index)
def test_empty(self):
    """Samples with zero observations yield an all-zero distance matrix."""
    # array of empty vectors
    sample_ids = ['a', 'b']
    all_zero = [[0.0, 0.0], [0.0, 0.0]]

    observed = beta_diversity('euclidean',
                              np.array([[], []], dtype=np.int64),
                              ids=sample_ids)
    npt.assert_array_equal(observed, DistanceMatrix(all_zero, sample_ids))

    observed = beta_diversity('unweighted_unifrac',
                              np.array([[], []], dtype=np.int64),
                              ids=sample_ids, tree=self.tree1, otu_ids=[])
    self.assertEqual(observed, DistanceMatrix(all_zero, sample_ids))
def compute_distance_matrices(
        biom, tree=None,
        metrics=('weighted_unifrac', 'unweighted_unifrac', 'braycurtis',
                 'jaccard')):
    """Compute one beta-diversity distance matrix per requested metric.

    Parameters
    ----------
    biom
        Table whose transpose is used as the counts matrix; ``biom.columns``
        supplies the sample ids and ``biom.index`` the OTU ids.
    tree
        Passed as ``tree`` for the two UniFrac metrics only.
    metrics
        Iterable of metric names. The default is an immutable tuple so it
        cannot be shared and mutated across calls.

    Returns
    -------
    dict
        Maps each metric name to the result of ``beta_diversity``.
    """
    # the counts matrix does not depend on the metric, so build it once
    counts = np.asarray(biom.T)
    dms = {}
    for metric in metrics:
        extra = {}
        if metric in ('unweighted_unifrac', 'weighted_unifrac'):
            # only the phylogenetic metrics receive OTU ids and the tree
            extra = {'otu_ids': biom.index, 'tree': tree}
        dms[metric] = beta_diversity(metric, counts=counts,
                                     ids=biom.columns, **extra)
    return dms
def compute_beta(self, metric="unweighted_unifrac"):
    """Return a beta-diversity matrix for ``metric``.

    Metric names containing "unifrac" are delegated to the private UniFrac
    helper; any other metric is computed from ``self.otu_df`` and
    ``self.sample_ids`` and wrapped in a DataFrame.
    """
    if "unifrac" in metric:
        return self.__beta_unifrac(metric)
    distances = beta_diversity(metric, self.otu_df, self.sample_ids)
    return pd.DataFrame(distances.data)
def diversity(df_sv_list):
    """Compute richness, Shannon and Bray-Curtis results with skbio.

    For each table in ``df_sv_list`` the first column is skipped and the
    remaining columns are treated as samples. Richness and Shannon values
    are left-merged onto frames indexed by the module-level ``allsamples``
    and labelled with the matching entry of ``df_sv_list_names``.

    Returns a tuple ``(richness, shannon, bc_dm_list)`` where
    ``bc_dm_list`` holds one Bray-Curtis DataFrame per input table.
    """
    richness = pd.DataFrame(index=allsamples)
    shannon = pd.DataFrame(index=allsamples)
    bc_dm_list = []
    for position, table in enumerate(df_sv_list):
        # rows are samples, columns are the SVs
        counts = table.iloc[:, 1:].T.values
        # same order as the rows of ``counts``
        sample_ids = table.columns[1:]

        # richness
        observed = pd.DataFrame(
            alpha_diversity("observed_otus", counts, sample_ids))
        richness = richness.merge(observed, how="left",
                                  left_index=True, right_index=True)
        richness = richness.rename(
            columns={0: df_sv_list_names[position]})

        # shannon
        entropy = pd.DataFrame(
            alpha_diversity("shannon", counts, sample_ids))
        shannon = shannon.merge(entropy, how="left",
                                left_index=True, right_index=True)
        shannon = shannon.rename(
            columns={0: df_sv_list_names[position]})

        # bray-curtis distance matrix, copied into a labelled frame
        bray_curtis = beta_diversity("braycurtis", counts, sample_ids)
        labelled = pd.DataFrame(index=bray_curtis.ids,
                                columns=bray_curtis.ids)
        labelled.iloc[:, :] = bray_curtis.data
        bc_dm_list.append(labelled)
    return richness, shannon, bc_dm_list
def beta_diversity_heatmap(self, working_samples, samples_list, tax_level):
    """Draw and save a clustered Bray-Curtis heatmap for ``samples_list``.

    Nothing is done when ``self.abundance_df.groupAbsoluteSamples()``
    returns None. ``working_samples`` and ``tax_level`` are not used here.
    """
    from skbio.diversity import beta_diversity
    import seaborn as sns
    if self.abundance_df.groupAbsoluteSamples() is not None:
        grouped = self.abundance_df.groupAbsoluteSamples()
        selected = grouped[samples_list].astype('int')
        ids = list(selected.columns)
        # one row of counts per sample
        counts = selected.transpose().values.tolist()
        bc_dm = beta_diversity("braycurtis", counts, ids)
        distance_frame = pd.DataFrame(bc_dm.data, index=ids, columns=ids)
        g = sns.clustermap(distance_frame, metric='braycurtis',
                           annot_kws={"size": 8})
        self.save_high_resolution_figure(
            g, 'Select file to save the beta diversity heatmap',
            'beta_diversity_heatmap', defaultextension='.png')
        import matplotlib.pyplot as plt
        plt.close("all")
def test_weighted_unifrac_partial_full(self):
    """partial_beta_diversity over all pairs must equal beta_diversity."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    # expected values calculated by hand
    all_pairs = [('A', 'B'), ('A', 'C'), ('B', 'C')]
    partial_dm = partial_beta_diversity('weighted_unifrac', self.table1,
                                        self.sids1, otu_ids=self.oids1,
                                        tree=self.tree1, id_pairs=all_pairs)
    full_dm = beta_diversity('weighted_unifrac', self.table1, self.sids1,
                             otu_ids=self.oids1, tree=self.tree1)
    self.assertEqual(partial_dm.shape, (3, 3))
    self.assertEqual(partial_dm, full_dm)

    expected_rows = [[0.0, 0.1750000, 0.12499999],
                     [0.1750000, 0.0, 0.3000000],
                     [0.12499999, 0.3000000, 0.0]]
    expected_dm = DistanceMatrix(expected_rows, ids=self.sids1)
    for row_id in self.sids1:
        for col_id in self.sids1:
            npt.assert_almost_equal(partial_dm[row_id, col_id],
                                    expected_dm[row_id, col_id], 6)
def test_block_beta_diversity(self):
    """block_beta_diversity must reproduce beta_diversity's data and ids."""
    phylo_kwargs = dict(tree=self.tree1, otu_ids=self.oids1)
    expected = beta_diversity('unweighted_unifrac', self.table1,
                              self.sids1, **phylo_kwargs)
    observed = block_beta_diversity('unweighted_unifrac', self.table1,
                                    self.sids1, k=2, **phylo_kwargs)
    npt.assert_equal(observed.data, expected.data)
    self.assertEqual(observed.ids, expected.ids)
def data_diversity(data):
    """Return per-row mean Bray-Curtis distances and their overall mean.

    Rows of ``data`` are labelled 1..n and the distances are computed on
    the element-wise absolute values.

    Returns a tuple ``(column_means, avg_div_score)``.
    """
    n_rows = data.shape[0]
    labels = np.linspace(1, n_rows, num=n_rows).astype('int')
    ids = list(labels)
    distances = beta_diversity("braycurtis", np.absolute(data), ids)
    square = distances.to_data_frame().to_numpy()
    column_means = np.mean(square, axis=0)
    return column_means, np.mean(column_means)
def test_unweighted_unifrac(self):
    """Unweighted UniFrac: metric by name and by callable agree."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    # expected values calculated by hand
    phylo_kwargs = dict(otu_ids=self.oids1, tree=self.tree1)
    by_name = beta_diversity('unweighted_unifrac', self.table1,
                             self.sids1, **phylo_kwargs)
    by_callable = beta_diversity(unweighted_unifrac, self.table1,
                                 self.sids1, **phylo_kwargs)
    self.assertEqual(by_name.shape, (3, 3))
    self.assertEqual(by_name, by_callable)

    expected_rows = [[0.0, 0.0, 0.25],
                     [0.0, 0.0, 0.25],
                     [0.25, 0.25, 0.0]]
    expected_dm = DistanceMatrix(expected_rows, ids=self.sids1)
    for row_id in self.sids1:
        for col_id in self.sids1:
            npt.assert_almost_equal(by_name[row_id, col_id],
                                    expected_dm[row_id, col_id], 6)
def compute_distance_matrices(
        biom, tree=None,
        metrics=('weighted_unifrac', 'unweighted_unifrac', 'braycurtis',
                 'jaccard')):
    """Compute one beta-diversity distance matrix per requested metric.

    Parameters
    ----------
    biom
        Table whose transpose is used as the counts matrix; ``biom.columns``
        supplies the sample ids and ``biom.index`` the OTU ids.
    tree
        Passed as ``tree`` for the two UniFrac metrics only.
    metrics
        Iterable of metric names. The default is an immutable tuple so it
        cannot be shared and mutated across calls.

    Returns
    -------
    dict
        Maps each metric name to the result of ``beta_diversity``.
    """
    # the counts matrix does not depend on the metric, so build it once
    counts = np.asarray(biom.T)
    dms = {}
    for metric in metrics:
        extra = {}
        if metric in ('unweighted_unifrac', 'weighted_unifrac'):
            # only the phylogenetic metrics receive OTU ids and the tree
            extra = {'otu_ids': biom.index, 'tree': tree}
        dms[metric] = beta_diversity(metric, counts=counts,
                                     ids=biom.columns, **extra)
    return dms
def __dapply__(self, experiment):
    """Compute a beta-diversity distance matrix for ``experiment``.

    ``experiment.data_df`` is transposed before being handed to
    ``beta_diversity``; the original index is passed as ``otu_ids`` and the
    original columns label both axes of the result.

    Parameters
    ----------
    experiment
        Object exposing a pandas ``data_df`` attribute.

    Returns
    -------
    pandas.DataFrame
        Square frame of distances, indexed and labelled by the columns of
        ``experiment.data_df``.

    Raises
    ------
    TypeError
        Re-raised unchanged unless its message contains
        ``'takes no keyword arguments'``.
    """
    otu_ids = experiment.data_df.index
    df = experiment.data_df.transpose()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and exists in every pandas release.
    counts = df.values
    try:
        dm = beta_diversity(self.distance_metric, counts=counts,
                            otu_ids=otu_ids, **self.kwargs)
    except TypeError as e:
        if 'takes no keyword arguments' not in str(e):
            # unrelated failure: propagate with its original traceback
            raise
        # the metric rejected keyword arguments, so retry without otu_ids
        dm = beta_diversity(self.distance_metric, counts=counts,
                            **self.kwargs)
    return pd.DataFrame(dm.data, index=df.index, columns=df.index)
def compute_beta_braycurtis(df):
    """Return the Bray-Curtis distance matrix of ``df`` as nested lists.

    Each column of ``df`` is treated as one sample, its values are cast to
    int, and the column labels become the sample ids.
    """
    from skbio.diversity import beta_diversity
    n_columns = len(df.columns)
    column_series = [df.iloc[:, pos] for pos in range(0, n_columns)]
    ids = [df.columns[pos] for pos in range(0, n_columns)]
    counts = np.array(column_series).astype(int)
    result = beta_diversity(metric="braycurtis", counts=counts, ids=ids)
    result_df = result.to_data_frame()
    # one plain list per row of the distance matrix
    return [list(result_df.iloc[row, :]) for row in range(0, len(result_df))]
def make_distance_matrix(biom_fp, method="braycurtis"):
    '''Load a biom table and return (distance matrix, sample metadata).

    The metadata of every sample id is collected into a DataFrame with one
    row per sample. Table values are multiplied by 100000 and truncated to
    int before ``beta_diversity`` is called with ``method``.
    '''
    table = load_table(biom_fp)

    # extract sample metadata from table, put in df
    metadata_by_sample = {}
    for s_id in table.ids():
        metadata_by_sample[s_id] = dict(table.metadata(s_id))
    s_md = pd.DataFrame.from_dict(metadata_by_sample, orient='index')

    # extract data from table and multiply, assuming that table contains
    # relative abundances (which cause beta_diversity to fail)
    scaled_rows = []
    for s_id in table.ids():
        scaled_rows.append([int(num * 100000) for num in table.data(s_id)])

    # beta diversity
    return beta_diversity(method, scaled_rows, table.ids()), s_md
def get_dist(metric, mtx):
    """Compute a beta-diversity distance matrix for ``mtx``.

    'canberra', 'jaccard', 'kulczynski' and 'correlation' are passed to
    ``beta_diversity`` under the same name. Every other value, including
    'bray_curtis', 'morisita_horn' and 'yue-clayton', is computed as
    "braycurtis".
    """
    # metrics forwarded unchanged; anything else falls back to braycurtis
    skbio_metric = "braycurtis"
    for passthrough in ("canberra", "jaccard", "kulczynski", "correlation"):
        if metric == passthrough:
            skbio_metric = passthrough
            break
    ## http://stackoverflow.com/questions/38384329/how-to-get-skbio-pcoa-principal-coordinate-analysis-results
    return beta_diversity(skbio_metric, mtx)
def beta(self, table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1):
    """Compute a beta-diversity matrix between the samples of ``table``.

    The dense, transposed table is the counts matrix and pairwise distances
    are delegated to ``sklearn.metrics.pairwise_distances`` with ``n_jobs``.
    ``pseudocount`` is accepted but not used in this body.

    Raises ValueError when ``table.is_empty()`` is true.
    """
    counts = table.matrix_data.toarray().T
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    options = dict(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
    return beta_diversity(**options)
def compute_beta_diversity(self):
    """Compute and cache beta diversity values

    This method calculates a beta diversity distance matrix and saves it to
    a folder for re-use. The matrices are calculated based on the full
    dataset so that any subsample drawn from the full dataset can be
    fetched from these precomputed matrices.

    One matrix per entry of ``self._beta_metrics`` is stored in
    ``self._beta_diversity_matrices``. A matrix is read from
    ``roc-curves/<name>/cached-matrices/<metric>.full.txt`` when that file
    exists; otherwise it is computed and written there.

    See Also
    --------
    Sculptor.compute_alpha_diversity
    """
    dir_fp = 'roc-curves/%s/cached-matrices/' % self.name
    os.makedirs(dir_fp, exist_ok=True)

    # np.int was only an alias of the builtin int and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every version.
    X = self._original_bt.matrix_data.toarray().astype(int).T

    self._beta_diversity_matrices = {}
    for metric in self._beta_metrics:
        fp = os.path.join(dir_fp, metric + '.full.txt')

        if os.path.exists(fp):
            distance_matrix = DistanceMatrix.read(fp)
        else:
            if metric in {'unweighted_unifrac', 'weighted_unifrac'}:
                # the phylogenetic metrics need the tree and the OTU ids
                kws = {
                    'tree': self.tree,
                    'otu_ids': self._original_bt.ids('observation')
                }
            else:
                kws = {}
            distance_matrix = beta_diversity(metric, X,
                                             self._original_bt.ids(), **kws)
            distance_matrix.write(fp)

        self._beta_diversity_matrices[metric] = distance_matrix
def test_weighted_unifrac_partial_full(self):
    """partial_beta_diversity over all pairs must equal beta_diversity."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    # expected values calculated by hand
    all_pairs = [('A', 'B'), ('A', 'C'), ('B', 'C')]
    partial_dm = partial_beta_diversity('weighted_unifrac', self.table1,
                                        self.sids1, otu_ids=self.oids1,
                                        tree=self.tree1, id_pairs=all_pairs)
    full_dm = beta_diversity('weighted_unifrac', self.table1, self.sids1,
                             otu_ids=self.oids1, tree=self.tree1)
    self.assertEqual(partial_dm.shape, (3, 3))
    self.assertEqual(partial_dm, full_dm)

    expected_rows = [[0.0, 0.1750000, 0.12499999],
                     [0.1750000, 0.0, 0.3000000],
                     [0.12499999, 0.3000000, 0.0]]
    expected_dm = DistanceMatrix(expected_rows, ids=self.sids1)
    for row_id in self.sids1:
        for col_id in self.sids1:
            npt.assert_almost_equal(partial_dm[row_id, col_id],
                                    expected_dm[row_id, col_id], 6)
def get_pairwise_dist_mat(deblur_biom, dist_type):
    """Compute pairwise sample distances for a table of deblurred seqs

    Parameters
    ----------
    deblur_biom: biom.Table
        Sequences we want pairwise distances (by sample) for
    dist_type: str
        Distance metric we want. Usually "jaccard" or "braycurtis"

    Returns
    -------
    The object returned by ``beta_diversity`` for the table's samples
    """
    if dist_type == "jaccard":
        # jaccard is computed on a presence/absence copy of the table
        deblur_biom = deblur_biom.pa(inplace=False)

    print("starting beta_diversity")
    dense_counts = (deblur_biom.transpose()
                    .matrix_data.astype("int64").todense())
    sample_ids = deblur_biom.ids(axis="sample")
    dist_mat = beta_diversity(dist_type, dense_counts, ids=sample_ids)
    print("end beta_diversity")
    return dist_mat
#####################
# Alpha diversity, weighted UniFrac and PCoA for the `datas` / `idss` sample
# set, followed by loading the first three principal coordinates.
adiv_obs_otuss = alpha_diversity('observed_otus', datas, idss)
adiv_faith_pds = alpha_diversity('faith_pd', datas, ids=idss,
                                 otu_ids=dfs1.columns, tree=tree,
                                 validate=False)
#bc_dm = beta_diversity("braycurtis", data, ids, validate=False)
wu_dms = beta_diversity("weighted_unifrac", datas, idss, tree=tree,
                        otu_ids=dfs1.columns, validate=False)
print(wu_dms)
# Write the matrix inside a context manager so the handle is flushed and
# closed straight away instead of being left open for the rest of the run.
with open("beta.tsv", "w") as o:
    o.write(str(wu_dms))
wu_pcs = pcoa(wu_dms)
print(wu_pcs)
# NOTE(review): the commands that would create eigen_input.tsv are commented
# out below, so the file read afterwards presumably has to exist already -
# confirm.
#wu_pc.write("eigen.tsv", format='ordination')
#subprocess.call("sed -n '2p' eigen.tsv > eigen_input.tsv", shell=True)
#subprocess.call("sed -1 '1 i /PC1\tPC2\tPC3' eigen_input.tsv", shell=True)
eigen = pd.read_csv("eigen_input.tsv", sep="\t", header=None)
eigen.columns = ['PC1', 'PC2', 'PC3']
eigen = eigen.round(3)
def betadiv_clustering(TaXon_table_xlsx, height, width, threshold,
                       betadiv_linkage, taxonomic_level, path_to_outdirs,
                       template, font_size, diss_metric):
    """Cluster samples by beta diversity and write a dendrogram.

    Reads the TaXon table, sums the reads of every sample per taxon at
    ``taxonomic_level``, computes ``diss_metric`` dissimilarities between
    the samples and draws a left-oriented dendrogram. The figure is written
    as pdf and html and the distance matrix as xlsx into
    ``<path_to_outdirs>/Beta_diversity/``. A popup then offers to open the
    html file and a log entry is written.

    Columns from position 10 onward of the table are treated as samples.
    """
    from scipy.cluster.hierarchy import dendrogram, linkage
    import plotly.figure_factory as ff
    import numpy as np
    import pandas as pd
    from skbio.diversity import beta_diversity
    from pathlib import Path
    import PySimpleGUI as sg
    import webbrowser

    ## import table
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(
        TaXon_table_xlsx, header=0).fillna("unidentified")

    ## create a y axis title text
    taxon_title = taxonomic_level.lower()

    ## OTU-like levels keep their label but are read from the "ID" column
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    ## collect samples for plot and keep only the relevant columns
    samples = TaXon_table_df.columns.tolist()[10:]
    TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]

    ## combine multiple hits of one taxonomic level:
    ## reads are summed per sample, the taxon label keeps its first value
    aggregation_functions = {sample: 'sum' for sample in samples}
    aggregation_functions[taxonomic_level] = 'first'

    ## create condensed dataframe
    df_new = TaXon_table_df.groupby(
        TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
    if 'unidentified' in df_new.index:
        df_new = df_new.drop('unidentified')

    ## collect reads, one row per sample
    data = df_new[samples].transpose().values.tolist()

    ## calculate dissimilarities between samples
    dissimilarity_dm = beta_diversity(diss_metric, data, samples)

    ## square matrix, also kept as a labelled frame for the xlsx export
    X1 = dissimilarity_dm.data
    matrix_df = pd.DataFrame(X1, index=samples, columns=samples)

    ## condensed form handed to the linkage function
    X2 = dissimilarity_dm.condensed_form()

    ## cluster dendrogram
    fig = ff.create_dendrogram(
        X1,
        labels=samples,
        color_threshold=float(threshold),
        orientation="left",
        linkagefun=lambda x: linkage(X2, betadiv_linkage, metric=diss_metric))
    fig.update_yaxes(ticks="")
    fig.update_xaxes(title="A")
    title = str(diss_metric) + " distance"
    fig.update_layout(xaxis_title=title, height=int(height),
                      width=int(width), template=template,
                      font_size=font_size, title_font_size=font_size)

    # finish script: the three outputs share one path stem
    output_stem = (str(path_to_outdirs) + "/" + "Beta_diversity" + "/"
                   + TaXon_table_xlsx.stem + "_" + taxon_title
                   + "_dendrogram_" + diss_metric)
    output_pdf = Path(output_stem + ".pdf")
    output_html = Path(output_stem + ".html")
    output_xlsx = Path(output_stem + ".xlsx")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))
    matrix_df.to_excel(output_xlsx)

    ## ask to show plot
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## write to log file
    sg.Popup(diss_metric + " clustering dendrograms are found in",
             path_to_outdirs, "/Beta_diversity/", title="Finished",
             keep_on_top=True)
    from taxontabletools.create_log import ttt_log
    ttt_log(diss_metric + " clustering", "analysis", TaXon_table_xlsx.name,
            output_pdf.name, "", path_to_outdirs)
def beta_diversity(TaXon_table_xlsx, width, heigth, cmap, meta_data_to_test, taxonomic_level, path_to_outdirs, template, font_size, diss_metric):
    """Plot a beta-diversity heatmap and run ANOSIM for one metadata column.

    Reads the TaXon table and its metadata table, drops samples whose value
    in ``meta_data_to_test`` is empty, sums the reads of every sample per
    taxon at ``taxonomic_level``, computes ``diss_metric`` dissimilarities
    between samples and tests them against the metadata with ANOSIM
    (999 permutations). The heatmap is written as pdf and html and the
    matrix as xlsx into ``<path_to_outdirs>/Beta_diversity/``.

    Parameters
    ----------
    TaXon_table_xlsx : path to the TaXon table; columns from position 10
        onward are treated as samples.
    width, heigth : figure size, converted with ``int``.
    cmap : continuous colour scale passed to ``px.imshow``.
    meta_data_to_test : name of the metadata column to test.
    taxonomic_level : column to aggregate on; "ASVs", "ESVs", "OTUs" and
        "zOTUs" are mapped to the "ID" column.
    path_to_outdirs : base directory for the metadata table and outputs.
    template, font_size : forwarded to ``fig.update_layout``.
    diss_metric : metric name passed to skbio's ``beta_diversity``.

    Raises
    ------
    RuntimeError
        After an error popup, when the metadata value is unique for every
        sample or identical for all samples.
    """
    import pandas as pd
    import numpy as np
    # NOTE: inside this function the name `beta_diversity` refers to skbio's
    # function, not to this wrapper.
    from skbio.diversity import beta_diversity
    from skbio.stats.distance import anosim
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import webbrowser

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    # the metadata table is looked up next to the outputs, named after the TaXon table
    Meta_data_table_xlsx = Path(str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0).fillna("unidentified")
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    # empty metadata cells become the string "nan" so they can be detected below
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)

    ## drop samples with metadata called nan (= empty)
    # i[0] is the first column of each metadata row, used here as the sample name
    drop_samples = [i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan"]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove empty OTUs
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            # keep the row unless every remaining sample has zero reads
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        # remove the dropped samples from the metadata table as well
        Meta_data_table_df = pd.DataFrame([i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples], columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
        metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()

    ## create a y axis title text
    taxon_title = taxonomic_level

    ## adjust taxonomic level if necessary
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    # abort when every sample has its own metadata value
    if len(set(Meta_data_table_df[meta_data_to_test])) == len(Meta_data_table_df['Samples'].tolist()):
        sg.Popup("The meta data is unique for all samples. Please adjust the meta data table!", title=("Error"))
        raise RuntimeError

    # abort when all samples share one metadata value
    if len(set(Meta_data_table_df[meta_data_to_test])) == 1:
        sg.Popup("The meta data is similar for all samples. Please adjust the meta data table!", title=("Error"))
        raise RuntimeError

    # both tables must list exactly the same samples (order does not matter)
    if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):

        ## collect samples for plot
        samples = Meta_data_table_samples

        ## extract the relevant data
        TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
        ## define an aggregation function to combine multiple hits of one taxonomic level
        aggregation_functions = {}
        ## define samples functions
        for sample in samples:
            ## 'sum' will calculate the sum of p/a data
            aggregation_functions[sample] = 'sum'
        ## define taxon level function
        aggregation_functions[taxonomic_level] = 'first'
        ## create condensed dataframe
        df_new = TaXon_table_df.groupby(TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
        if 'unidentified' in df_new.index:
            df_new = df_new.drop('unidentified')

        ## collect reads
        # one row of counts per sample, in the order of `samples`
        data = df_new[samples].transpose().values.tolist()

        ## calculate dissimilarity distances
        dissimilarity_dm = beta_diversity(diss_metric, data, samples)

        # ANOSIM of the dissimilarities against the metadata grouping
        anosim_results = anosim(dissimilarity_dm, metadata_list, permutations=999)
        anosim_r = round(anosim_results['test statistic'], 5)
        anosim_p = anosim_results['p-value']
        # figure title; <br> is used as the line break
        textbox = "Anosim (" + meta_data_to_test + ", " + taxon_title + ")<br>" + "R = " + str(anosim_r) + "<br>" + "p = " + str(anosim_p)

        # labelled copy of the matrix for the xlsx export
        matrix = dissimilarity_dm.data
        matrix_df = pd.DataFrame(matrix)
        matrix_df.columns = samples
        matrix_df.index = samples

        # create plot
        color_label = diss_metric + " distance"
        fig = px.imshow(matrix, x=samples, y=samples, color_continuous_scale=cmap, labels=dict(color=color_label))
        fig.update_layout(height=int(heigth), width=int(width), template=template, showlegend=True, title=textbox, font_size=font_size, title_font_size=font_size)

        # finish script
        output_pdf = Path(str(path_to_outdirs) + "/" + "Beta_diversity" + "/" + TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" + taxon_title + "_" + diss_metric + ".pdf")
        output_html = Path(str(path_to_outdirs) + "/" + "Beta_diversity" + "/" + TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" + taxon_title + "_" + diss_metric + ".html")
        output_xlsx = Path(str(path_to_outdirs) + "/" + "Beta_diversity" + "/" + TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" + taxon_title + "_" + diss_metric + ".xlsx")
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))
        matrix_df.to_excel(output_xlsx)

        ## ask to show plot
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))

        ## write to log file
        sg.Popup("Beta diversity estimate are found in", path_to_outdirs, "/Beta_diversity/", title="Finished", keep_on_top=True)
        from taxontabletools.create_log import ttt_log
        ttt_log("beta diversity", "analysis", TaXon_table_xlsx.name, output_pdf.name, meta_data_to_test, path_to_outdirs)

    else:
        # sample names differ between the two tables: report and do nothing
        sg.PopupError("Error: The samples between the taxon table and meta table do not match!", keep_on_top=True)
def __main__():
    """Command-line entry point: tabular abundance file -> distance matrix.

    Parses the options, reads the selected sample columns of the abundance
    file as integer counts (one list per sample), optionally loads a Newick
    tree, and writes the ``beta_diversity`` result to ``--output``.

    Lines with too few fields are reported on stderr and skipped. With
    ``--version`` the version is printed to stderr and the program exits.

    Raises
    ------
    AssertionError
        If the metric is listed in ``NEEDS_TREE`` and no ``--tree`` is given.
    """
    parser = optparse.OptionParser(usage="%prog [options]")
    parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
    parser.add_option('-i', '--input', dest='input', action='store', type="string", default=None, help='Input abundance Filename')
    parser.add_option('', '--otu_column', dest='otu_column', action='store', type="int", default=None, help='OTU ID Column (1 based)')
    parser.add_option('', '--sample_columns', dest='sample_columns', action='store', type="string", default=None, help='Comma separated list of sample columns, unset to use all.')
    parser.add_option('', '--header', dest='header', action='store_true', default=False, help='Abundance file has a header line')
    parser.add_option('', '--distance_metric', dest='distance_metric', action='store', type="string", default=None, help='Distance metric to use')
    parser.add_option('', '--tree', dest='tree', action='store', type="string", default=None, help='Newick Tree Filename')
    parser.add_option('-o', '--output', dest='output', action='store', type="string", default=None, help='Output Filename')
    (options, args) = parser.parse_args()
    if options.version:
        sys.stderr.write("scikit-bio betadiversity from tabular file %s\n" % (__VERSION__,))
        sys.exit()

    # command-line columns are 1-based, internal indices are 0-based
    if options.otu_column is not None:
        otu_column = options.otu_column - 1
    else:
        otu_column = None

    if options.sample_columns is None:
        # no explicit selection: every column except the OTU id column
        # NOTE(review): the file is opened in binary mode but split with
        # DELIMITER; under Python 3 that requires a bytes DELIMITER - confirm.
        with open(options.input, 'rb') as fh:
            line = fh.readline()
            # a real list is needed because an entry may be removed below
            columns = list(range(len(line.split(DELIMITER))))
            if otu_column in columns:
                columns.remove(otu_column)
    else:
        # a real list is needed because it is concatenated below
        columns = [int(x) - 1 for x in options.sample_columns.split(",")]

    # highest column index any data line must provide
    max_col = max(columns + [otu_column])
    counts = [[] for x in columns]
    sample_names = []
    otu_names = []
    with open(options.input, 'rb') as fh:
        if options.header:
            header = fh.readline().rstrip('\n\r').split(DELIMITER)
            sample_names = [header[i] for i in columns]
        else:
            sample_names = ["SAMPLE_%i" % x for x in range(len(columns))]
        for i, line in enumerate(fh):
            fields = line.rstrip('\n\r').split(DELIMITER)
            if len(fields) <= max_col:
                # the original referenced the misspelled `sys.stederr`,
                # which raised AttributeError instead of reporting the line
                sys.stderr.write("Bad data line:  %s\n" % (fields,))
                continue
            if otu_column is not None:
                otu_names.append(fields[otu_column])
            else:
                otu_names.append("OTU_%i" % i)
            for j, col in enumerate(columns):
                counts[j].append(int(fields[col]))

    extra_kwds = {}
    if options.distance_metric in NEEDS_OTU_NAMES:
        extra_kwds['otu_ids'] = otu_names
    if options.distance_metric in NEEDS_TREE:
        assert options.tree, Exception("You must provide a newick tree when using '%s'" % options.distance_metric)
        # NB: TreeNode apparently needs unicode files
        with codecs.open(options.tree, 'rb', 'utf-8') as fh:
            extra_kwds['tree'] = TreeNode.read(fh)

    bd_dm = beta_diversity(options.distance_metric, counts, ids=sample_names, **extra_kwds)
    bd_dm.write(options.output)
lambda x: pd.Series( subsample_counts(x.astype("int"), depth), index=counts.columns ), axis=1, ) return rare log.info("Reading genus-level data.") genera = pd.read_csv( path.join("..", "data", "american_gut_genus.csv"), dtype={"id": str} ) libsize = genera.groupby("id")["count"].sum() mat = pd.pivot_table( genera, columns="Genus", index="id", values="count", fill_value=0, aggfunc="sum", ) mat = rarefy_counts(mat, 1000) log.info("Calculating beta diversity and PCoA.") D = beta_diversity("braycurtis", mat.values, mat.index, validate=True) red = pcoa(D, number_of_dimensions=2) log.info("Saving results to `pcoa.csv`.") red.samples.to_csv("pcoa.csv")
def analyse(self, user_request, base, headers, sample_labels, metadata_vals, phylogenetic_tree):
    """Compute a two-dimensional NMDS ordination of the samples in ``base``.

    The distance metric is read from the request's ``"type"`` attribute.
    UniFrac metrics need a Newick tree string and an integer matrix;
    ``"euclidean"`` uses ``euclidean_distances``; every other metric is
    passed to ``beta_diversity`` on the integer-cast matrix.

    Returns ``{"no_tree": True}`` when a UniFrac metric is requested without
    a tree, ``{"has_float": True}`` when the project matrix type is
    ``"float"`` for a UniFrac metric, and otherwise a dict with the per-sample
    coordinates (``"nmds"``) and padded axis bounds.
    """
    logger.info("Starting NMDS analysis")

    metric = user_request.get_custom_attr("type")
    if metric in ("weighted_unifrac", "unweighted_unifrac"):
        if phylogenetic_tree == "":
            return {"no_tree": True}

        project_map = Map(user_request.user_id, user_request.pid)
        if project_map.matrix_type == "float":
            return {"has_float": True}

        base = base.astype(int)
        tree = TreeNode.read(StringIO(phylogenetic_tree))
        if len(tree.root().children) > 2:
            # Ensure that the tree is rooted if it is not already rooted
            tree = tree.root_at_midpoint()
        dist_matrix = beta_diversity(metric, base, ids=sample_labels, otu_ids=headers, tree=tree)
    elif metric == "euclidean":
        dist_matrix = euclidean_distances(base)
    else:
        base = base.astype(int)
        dist_matrix = beta_diversity(metric, base)

    # Copy the square distance matrix into a plain list of rows.
    size = dist_matrix.shape[0]
    similarities = [[dist_matrix[i][j] for j in range(size)] for i in range(size)]

    # Use traditional MDS to determine the initial position
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                       dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(similarities).embedding_

    # Use NMDS to adjust the original positions to optimize for stress
    nmds = manifold.MDS(n_components=2, metric=False, dissimilarity="precomputed",
                        max_iter=3000, eps=1e-12)
    npos = nmds.fit_transform(similarities, init=pos)

    ret_table = []
    for i, point in enumerate(npos):
        meta = ""
        # Fall back to an empty label when metadata is absent or shorter than
        # the sample list, instead of raising IndexError.
        if metadata_vals and i < len(metadata_vals):
            meta = metadata_vals[i]
        ret_table.append({
            "s": sample_labels[i],
            "m": meta,
            "nmds1": point[0],
            "nmds2": point[1],
        })

    logger.info("After NMDS plotting")

    # Axis bounds are the coordinate extremes scaled by this factor.
    buffer = 1.5
    abundancesObj = {
        "nmds": ret_table,
        "nmds1Max": np.max(npos[:, 0]) * buffer,
        "nmds1Min": np.min(npos[:, 0]) * buffer,
        "nmds2Max": np.max(npos[:, 1]) * buffer,
        "nmds2Min": np.min(npos[:, 1]) * buffer
    }
    return abundancesObj
otu_ids = X.columns.tolist() X = X.reset_index().melt(id_vars=['index'], value_vars=X.columns, var_name='taxonomy', value_name='abundance') taxa = pd.DataFrame(X.taxonomy.apply(lambda x: dict(map(lambda y: y.split('__'), filter(lambda x: not x.endswith('__'), x.split(';'))))).tolist()) X = pd.concat([X.drop(columns=['taxonomy']), taxa], axis=1) X = X.melt(id_vars=['index','abundance'], value_vars=taxa.columns, var_name='rank', value_name='taxonomy') X = X.groupby(by=['index', 'taxonomy'], as_index=False).sum().pivot_table(columns='taxonomy', index='index', values='abundance') if use_phylogeny: X = X.loc[:, X.columns.to_series().isin(names)] ids = X.index.tolist() otu_ids = X.columns.tolist() try: print('Trying calculating {} beta_diversity using scikit-bio & scikit-learn package...'.format(args.metric)) print('This could be time-consuming.') if use_phylogeny: mat = beta_diversity(args.metric, X, ids, tree=tree, otu_ids=otu_ids, validate=False).data else: mat = beta_diversity(args.metric, X, ids, otu_ids=otu_ids, validate=False).data except ValueError: print('Failed, the metric you selected is not supported by neither scikit-bio nor scikit-learn.') print('Trying using SciPy...') mat = squareform(pdist(X, metric=args.metric)) print('Succeeded!') pcs = pd.DataFrame(pcoa(mat, number_of_dimensions=2).samples.values.tolist(), index=X.index, columns=['PC1', 'PC2']) pcs = pd.concat([pcs, Y], axis=1) print('Visualizing the data using plotnine package...') p = (ggplot(pcs, aes(x='PC1', y='PC2', color='Env')) + geom_point(size=0.2) + scale_color_manual(['#E64B35FF','#4DBBD5FF','#00A087FF','#3C5488FF','#F39B7FFF','#8491B4FF','#91D1C2FF']) + theme(panel_grid_major = element_blank(), panel_grid_minor = element_blank(), panel_background = element_blank()) + theme(axis_line = element_line(color="gray", size = 1))
def PCoA_analysis(TaXon_table_xlsx, meta_data_to_test, taxonomic_level, width, height, pcoa_s, path_to_outdirs, template, font_size, color_discrete_sequence, pcoa_dissimilarity):
    """Run a PCoA on a TaXon table and plot the axes chosen in a dialog.

    The TaXon table (sample columns start at column index 10) and its
    ``<stem>_metadata.xlsx`` companion under ``path_to_outdirs/Meta_data_table``
    are loaded, samples without a value for ``meta_data_to_test`` are dropped,
    reads are summed per ``taxonomic_level`` and a distance matrix is computed
    with ``beta_diversity(pcoa_dissimilarity, ...)``.  An ANOSIM test on the
    metadata is reported in the plot title.

    A window then lets the user pick two or three PCoA axes; the matching
    2-D or 3-D scatter plot is written as pdf/html (plus an xlsx with the
    plotted coordinates) to ``path_to_outdirs/PCoA_plots/<stem>/``.

    ``width``, ``height`` and ``pcoa_s`` are cast with ``int``; ``template``,
    ``font_size`` and ``color_discrete_sequence`` are passed through to plotly.

    Raises:
        RuntimeError: if the metadata is unique per sample or identical for
            all samples (a popup is shown first).
    """
    import pandas as pd
    import numpy as np
    from skbio.diversity import beta_diversity
    from skbio.stats.ordination import pcoa
    from skbio.stats.distance import anosim
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import os, webbrowser
    from itertools import combinations

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path(
        str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0).fillna("unidentified")
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)

    ## drop samples with metadata called nan (= empty)
    drop_samples = [
        i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan"
    ]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove empty OTUs
        row_filter_list = [
            row for row in TaXon_table_df.values.tolist() if set(row[10:]) != {0}
        ]
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        Meta_data_table_df = pd.DataFrame(
            [i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples],
            columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    ## create a y axis title text
    taxon_title = taxonomic_level.lower()

    ## adjust taxonomic level if neccessary
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    # a grouping variable that is unique per sample cannot be tested
    if len(set(Meta_data_table_df[meta_data_to_test])) == len(
            Meta_data_table_df['Samples'].tolist()):
        sg.Popup(
            "The meta data is unique for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    # neither can a grouping variable with a single value
    if len(set(Meta_data_table_df[meta_data_to_test])) == 1:
        sg.Popup(
            "The meta data is similar for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    if sorted(TaXon_table_samples) != sorted(Meta_data_table_samples):
        sg.PopupError(
            "The sample of both the TaXon table and the metadata table have to match!"
        )
        return

    samples = Meta_data_table_samples

    ## extract the relevant data
    TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
    ## sum the reads of every sample and keep the first taxon label per group
    aggregation_functions = {sample: 'sum' for sample in samples}
    aggregation_functions[taxonomic_level] = 'first'
    ## create condensed dataframe
    TaXon_table_df = TaXon_table_df.groupby(
        TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
    if 'unidentified' in TaXon_table_df.index:
        TaXon_table_df = TaXon_table_df.drop('unidentified')

    data = TaXon_table_df[samples].transpose().values.tolist()
    jc_dm = beta_diversity(pcoa_dissimilarity, data, samples)
    ordination_result = pcoa(jc_dm)
    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()

    anosim_results = anosim(jc_dm, metadata_list, permutations=999)
    anosim_r = round(anosim_results['test statistic'], 5)
    anosim_p = anosim_results['p-value']
    textbox = meta_data_to_test + ", " + taxon_title + "<br>Anosim " + "R = " + str(
        anosim_r) + " " + "p = " + str(anosim_p)

    def slices(items, size):
        # Yield consecutive chunks of ``items`` holding at most ``size`` elements.
        for start in range(0, len(items), size):
            yield items[start:start + size]

    def plot_directory():
        # Create (including missing parents) and return the output folder.
        dirName = Path(
            str(path_to_outdirs) + "/" + "PCoA_plots" + "/" + TaXon_table_xlsx.stem + "/")
        os.makedirs(dirName, exist_ok=True)
        return dirName

    def mesh_frame(df_pcoa, columns):
        # Build the frame for "markers+lines": each pair of samples sharing a
        # metadata value is emitted consecutively so a line connects them.
        rows = []
        # Visit every category once (first-seen order), not once per sample.
        for metadata in dict.fromkeys(df_pcoa["Metadata"]):
            arr = df_pcoa.loc[df_pcoa['Metadata'] == metadata][columns].to_numpy()
            if len(arr) == 1:
                # A category with a single sample has no pairs; keep the point.
                rows.append(list(arr[0]))
                continue
            for first, second in combinations(arr, 2):
                rows.append(list(first))
                rows.append(list(second))
        return pd.DataFrame(rows, columns=columns)

    def export_plot(fig, dirName, suffix, closing_text, xlsx_columns=None):
        # Write the figure (and optionally the plotted coordinates), offer to
        # open the html file, tell the user where the files are and log the run.
        stem = str(dirName) + "/" + meta_data_to_test + "_" + taxon_title + suffix
        output_pdf = Path(stem + ".pdf")
        output_html = Path(stem + ".html")
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))
        if xlsx_columns is not None:
            ordination_result.samples[xlsx_columns].to_excel(Path(stem + ".xlsx"))

        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))

        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        from taxontabletools.create_log import ttt_log
        ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name, output_pdf.name,
                meta_data_to_test, path_to_outdirs)

    # collect the PCoA proportion explained values (axes below 1 % are hidden)
    proportion_explained_list = []
    for i, pcoa_axis in enumerate(ordination_result.proportion_explained):
        if round(pcoa_axis * 100, 2) >= 1:
            proportion_explained_list.append(
                "PC" + str(i + 1) + " (" + str(round(pcoa_axis * 100, 2)) + " %)")

    pcoa_axis_checkboxes = list(
        slices([
            sg.Checkbox(name, key=name, size=(15, 1))
            for name in proportion_explained_list
        ], 10))

    pcoa_window_layout = [
        [sg.Text('Check up to four axes to be displayed')],
        [sg.Frame(layout=pcoa_axis_checkboxes, title='')],
        [sg.Text('Only axes >= 1 % explained variance are shown')],
        [sg.CB("Connect categories", default=True, key="draw_mesh")],
        [sg.Text('')],
        [sg.Button('Plot', key='Plot')],
        [sg.Button('Back')],
    ]

    pcoa_window = sg.Window('PCoA axis', pcoa_window_layout, keep_on_top=True)

    while True:
        event, values = pcoa_window.read()

        if event is None or event == 'Back':
            break

        # Read the values only after the close check: ``values`` may be None
        # once the window has been closed.
        draw_mesh = values["draw_mesh"]

        if event == 'Plot':
            ## create a subfolder for better sorting and overview
            dirName = plot_directory()

            # collect the pcoa axis values
            axis_to_plot = [
                key for key, value in values.items() if value == True and "PC" in key
            ]

            # pass on only if two pcoa axes were checked
            if len(axis_to_plot) == 2:
                cat1 = axis_to_plot[1].split()[0]
                cat2 = axis_to_plot[0].split()[0]

                # Work on a copy so the ordination result itself is not modified.
                df_pcoa = ordination_result.samples[[cat1, cat2]].copy()
                df_pcoa.insert(
                    2, "Metadata", Meta_data_table_df[meta_data_to_test].values.tolist(), True)
                df_pcoa.insert(
                    3, "Samples", Meta_data_table_df["Samples"].values.tolist(), True)

                if draw_mesh:
                    plot_df = mesh_frame(df_pcoa, [cat1, cat2, "Metadata", "Samples"])
                    mode = "markers+lines"
                else:
                    plot_df = df_pcoa
                    mode = "markers"

                fig = px.scatter(
                    plot_df, x=cat1, y=cat2, color="Metadata", text="Samples",
                    title=textbox, color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(pcoa_s), mode=mode)
                fig.update_layout(height=int(height), width=int(width), template=template,
                                  showlegend=True, font_size=font_size,
                                  title_font_size=font_size)
                fig.update_xaxes(title=axis_to_plot[1])
                fig.update_yaxes(title=axis_to_plot[0])

                closing_text = "\n" + "PCoA plots are found in: " + str(
                    path_to_outdirs) + "/PCoA_plots/"
                export_plot(fig, dirName, "", closing_text, xlsx_columns=[cat1, cat2])
                break

            elif len(axis_to_plot) == 3:
                cat1 = axis_to_plot[0].split()[0]
                cat2 = axis_to_plot[1].split()[0]
                cat3 = axis_to_plot[2].split()[0]

                df_pcoa = ordination_result.samples[[cat1, cat2, cat3]].copy()
                df_pcoa.insert(
                    3, "Metadata", Meta_data_table_df[meta_data_to_test].values.tolist(), True)
                df_pcoa.insert(
                    4, "Samples", Meta_data_table_df["Samples"].values.tolist(), True)

                ## check if lines are to be drawn between the dots
                if draw_mesh:
                    plot_df = mesh_frame(
                        df_pcoa, [cat1, cat2, cat3, "Metadata", "Samples"])
                    fig = px.scatter_3d(
                        plot_df, x=cat1, y=cat2, z=cat3, color="Metadata", text="Samples",
                        title=textbox, color_discrete_sequence=color_discrete_sequence)
                    fig.update_traces(marker_size=int(pcoa_s), mode="markers+lines",
                                      line=dict(width=0.5))
                else:
                    fig = px.scatter_3d(
                        df_pcoa, x=cat1, y=cat2, z=cat3, color="Metadata", text="Samples",
                        color_discrete_sequence=color_discrete_sequence)
                    fig.update_traces(marker_size=int(pcoa_s), mode="markers")

                fig.update_layout(height=int(height), width=int(width), template=template,
                                  title=textbox, showlegend=True, font_size=font_size,
                                  title_font_size=font_size)
                fig.update_layout(
                    scene=dict(xaxis_title=axis_to_plot[0],
                               yaxis_title=axis_to_plot[1],
                               zaxis_title=axis_to_plot[2]))

                closing_text = "PCoA plots are found in: " + str(
                    path_to_outdirs) + "/PCoA_plots/"
                # Export all three plotted axes, not just the first two.
                export_plot(fig, dirName, "_3d", closing_text,
                            xlsx_columns=[cat1, cat2, cat3])
                break

            else:
                sg.Popup("Please choose not more than 3 PCoA axes",
                         title="Error", keep_on_top=True)

        # NOTE(review): no element in the layout above emits 'Plot matrix', so
        # this branch looks unreachable as written -- confirm whether a button
        # is missing from the window.
        if event == 'Plot matrix':
            if len(proportion_explained_list) >= 4:
                ## create a subfolder for better sorting and overview
                dirName = plot_directory()

                matrix_axes = ["PC1", "PC2", "PC3", "PC4"]
                df_pcoa = ordination_result.samples[matrix_axes].copy()
                df_pcoa.insert(
                    4, "Metadata", Meta_data_table_df[meta_data_to_test].values.tolist(), True)
                df_pcoa.insert(
                    5, "Sample", Meta_data_table_df["Samples"].values.tolist(), True)

                fig = make_subplots(rows=4, cols=4)
                categories = set(metadata_list)

                for row, x_axis in enumerate(matrix_axes, start=1):
                    # Diagonal cell: empty panel labelled with the axis name
                    # and its explained variance.
                    fig.add_trace(go.Scatter(), row=row, col=row)
                    text = x_axis + " (" + str(
                        round(ordination_result.proportion_explained[x_axis] * 100, 2)) + " %)"
                    fig.add_annotation(text=text, showarrow=False, row=row, col=row)

                    # Upper triangle: one scatter trace per metadata category.
                    for col in range(row + 1, len(matrix_axes) + 1):
                        y_axis = matrix_axes[col - 1]
                        for metadata in categories:
                            df_metadata = df_pcoa[df_pcoa['Metadata'] == metadata]
                            fig.add_trace(go.Scatter(
                                x=df_metadata[x_axis].values.tolist(),
                                y=df_metadata[y_axis].values.tolist(),
                                mode='markers',
                                name=metadata,
                                # only the first panel contributes legend entries
                                showlegend=(row == 1 and col == 2),
                                text=df_metadata["Sample"].values.tolist()),
                                row=row, col=col)

                fig.update_layout(template=template, font_size=font_size,
                                  title_font_size=font_size)
                fig.update_xaxes(showticklabels=False, showgrid=False)
                fig.update_yaxes(showticklabels=False, showgrid=False)
                fig.update_xaxes(showline=True, mirror=True, linewidth=1, linecolor='black')
                fig.update_yaxes(showline=True, mirror=True, linewidth=1, linecolor='black')
                fig.update_traces(marker_size=int(pcoa_s), mode="markers")
                # finish plot matrix
                fig.update_layout(height=1000, width=1000, title_text=textbox)

                closing_text = "\n" + "PCoA plots are found in: " + str(
                    path_to_outdirs) + "/PCoA_plots/"
                export_plot(fig, dirName, "_matrix", closing_text)
                break
            else:
                sg.Popup(
                    "There must be at least 4 PCoA axis available to plot the matrix!"
                )

    pcoa_window.close()
def analyse_other(self, user_request, otu_table, headers, sample_labels, metaVals, phylogenetic_tree):
    """Run a PCoA on ``otu_table`` and return the three requested axes.

    The metric comes from the request's ``"type"`` attribute; UniFrac metrics
    additionally use the Newick tree string, ``sample_labels`` and
    ``headers``.  The axes to report are read from the ``"pca1"``, ``"pca2"``
    and ``"pca3"`` request attributes and appended to ``"PC"`` to form column
    names of the ordination result.

    Returns ``{"no_tree": True}`` if a UniFrac metric is requested without a
    tree; otherwise a dict holding the per-sample coordinates (``"pca"``),
    the leading proportions explained in percent (``"pcaVar"``) and the
    min/max of each reported axis.
    """
    metric = user_request.get_custom_attr("type")
    otu_table = otu_table.astype(int)

    if metric in ("weighted_unifrac", "unweighted_unifrac"):
        if phylogenetic_tree == "":
            return {"no_tree": True}
        tree = TreeNode.read(StringIO(phylogenetic_tree))
        if len(tree.root().children) > 2:
            # Ensure that the tree is rooted if it is not already rooted
            tree = tree.root_at_midpoint()
        dist_matrix = beta_diversity(metric, otu_table, ids=sample_labels,
                                     otu_ids=headers, tree=tree)
    else:
        dist_matrix = beta_diversity(metric, otu_table)

    results = pcoa(dist_matrix)
    pcaVals = results.samples
    pcaVariances = results.proportion_explained

    logger.info("After running the R PCA")

    # Running bounds per reported axis, starting from the same sentinels
    # (minimum 1000000, maximum 0) for every axis.
    lows = {"pca1": 1000000, "pca2": 1000000, "pca3": 1000000}
    highs = {"pca1": 0, "pca2": 0, "pca3": 0}

    pca1 = user_request.get_custom_attr("pca1")
    pca2 = user_request.get_custom_attr("pca2")
    pca3 = user_request.get_custom_attr("pca3")
    selected = (("pca1", pca1), ("pca2", pca2), ("pca3", pca3))

    pcaRow = []
    for i in range(len(pcaVals)):
        meta = ""
        # Metadata is only attached when there is exactly one value per sample.
        if metaVals and len(metaVals) == len(pcaVals):
            meta = metaVals[i]

        pcaObj = {"s": sample_labels[i], "m": meta}
        for key, number in selected:
            pcaObj[key] = round(pcaVals.iloc[i]["PC" + number], 8)

        for key, _ in selected:
            value = pcaObj[key]
            if value > highs[key]:
                highs[key] = value
            if value < lows[key]:
                lows[key] = value

        pcaRow.append(pcaObj)

    # Convert the leading proportions to percent; the loop stops after the
    # entry at index 11 has been added.
    pcaVarRow = []
    for idx, proportion in enumerate(pcaVariances):
        pcaVarRow.append(float(proportion) * 100)
        if idx > 10:
            break

    abundancesObj = {
        "pca": pcaRow,
        "pcaVar": pcaVarRow,
        "pca1Max": highs["pca1"],
        "pca1Min": lows["pca1"],
        "pca2Max": highs["pca2"],
        "pca2Min": lows["pca2"],
        "pca3Max": highs["pca3"],
        "pca3Min": lows["pca3"],
    }
    return abundancesObj
def test_invalid_input(self):
    """Check that ``beta_diversity`` rejects malformed input.

    Covers an id count that differs from the number of samples, an unknown
    metric name, input with more than two dimensions, negative counts and
    unexpected keyword arguments (for string and callable metrics).
    """
    # number of ids doesn't match the number of samples
    # (arguments are passed as metric, counts, ids -- the same order the
    # other tests in this file use)
    error_msg = ("Number of rows")
    with self.assertRaisesRegex(ValueError, error_msg):
        beta_diversity('euclidean', self.table1, list('AB'))

    # unknown metric provided
    error_msg = "not-a-metric"
    with self.assertRaisesRegex(ValueError, error_msg):
        beta_diversity('not-a-metric', self.table1)

    # 3-D list provided as input
    error_msg = ("Only 1-D and 2-D")
    with self.assertRaisesRegex(ValueError, error_msg):
        beta_diversity('euclidean', [[[43]]])

    # negative counts
    error_msg = "negative values."
    with self.assertRaisesRegex(ValueError, error_msg):
        beta_diversity('euclidean', [[0, 1, 3, 4], [0, 3, -12, 42]])
    with self.assertRaisesRegex(ValueError, error_msg):
        beta_diversity('euclidean', [[0, 1, 3, -4], [0, 3, 12, 42]])

    # additional kwargs
    error_msg = ("'not_a_real_kwarg'")
    with self.assertRaisesRegex(TypeError, error_msg):
        beta_diversity('euclidean', [[0, 1, 3], [0, 3, 12]],
                       not_a_real_kwarg=42.0)
    with self.assertRaisesRegex(TypeError, error_msg):
        beta_diversity('unweighted_unifrac', [[0, 1, 3], [0, 3, 12]],
                       not_a_real_kwarg=42.0, tree=self.tree1,
                       otu_ids=['O1', 'O2', 'O3'])
    with self.assertRaisesRegex(TypeError, error_msg):
        beta_diversity('weighted_unifrac', [[0, 1, 3], [0, 3, 12]],
                       not_a_real_kwarg=42.0, tree=self.tree1,
                       otu_ids=['O1', 'O2', 'O3'])
    with self.assertRaisesRegex(TypeError, error_msg):
        beta_diversity(weighted_unifrac, [[0, 1, 3], [0, 3, 12]],
                       not_a_real_kwarg=42.0, tree=self.tree1,
                       otu_ids=['O1', 'O2', 'O3'])