def test_permutted(self):
    """pcoa (fsvd) must run on both a freshly read and a permuted matrix."""
    original = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    fsvd_kwargs = dict(method="fsvd", number_of_dimensions=3, inplace=False)

    # Baseline: the untouched matrix must be accepted without raising.
    pcoa(original, **fsvd_kwargs)

    # Some operations, like permute, change the memory structure of the
    # matrix; the only requirement here is that pcoa still does not raise.
    shuffled = original.permute()
    pcoa(shuffled, **fsvd_kwargs)
def test_fsvd_inplace(self):
    """In-place fsvd must match in-place eigh on the first three axes."""
    # Each method gets its own freshly read matrix because both calls are
    # made with inplace=True.
    dm_eigh = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    dm_fsvd = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))

    reference = pcoa(dm_eigh, method="eigh", number_of_dimensions=3,
                     inplace=True)
    observed = pcoa(dm_fsvd, method="fsvd", number_of_dimensions=3,
                    inplace=True)

    assert_ordination_results_equal(
        observed,
        reference,
        ignore_directionality=True,
        ignore_method_names=True,
    )
def get_pcoa(args, dm1):
    """Run PCoA on ``dm1`` and return the first three axes as plain lists.

    The previous body was written for a table-like PCoA result (it called
    ``PCoA_result.samples()`` and sliced ``np.array(PCoA_result)``).  The
    scikit-bio ``pcoa`` imported here returns an ``OrdinationResults`` whose
    ``samples`` attribute is a DataFrame, so both operations failed.  The
    coordinates and names are now read from ``samples`` directly.

    Parameters
    ----------
    args : object
        Only ``args.function`` is read here; ``args`` is also forwarded to
        ``create_emperor_pc_file`` when ``args.function == 'pcoa_3d'``.
    dm1 : distance matrix accepted by ``skbio.stats.ordination.pcoa``

    Returns
    -------
    dict
        Keys ``"P1"``, ``"P2"``, ``"P3"`` (coordinates on the first three
        axes) and ``"names"`` (the row labels of ``samples``).

    Exits the process with a message when a third axis is not available.
    """
    from skbio.stats.ordination import pcoa

    PCoA_result = pcoa(dm1)
    coords = PCoA_result.samples  # DataFrame attribute, not a method
    print(coords)

    json_array = {}
    json_array["P1"] = coords.iloc[:, 0].tolist()
    json_array["P2"] = coords.iloc[:, 1].tolist()
    try:
        json_array["P3"] = coords.iloc[:, 2].tolist()
    except IndexError:
        sys.exit('IndexError - try selecting more data or deeper taxonomy')
    json_array["names"] = coords.index.tolist()

    if args.function == 'pcoa_3d':
        # NOTE(review): the arguments are forwarded exactly as before;
        # confirm they match the signature of create_emperor_pc_file.
        create_emperor_pc_file(args, json_array, PCoA_result)
    return json_array
def fit(self, X, y=None):
    """
    Ordinate ``X`` with PCoA and store the result on the estimator.

    Parameters
    ----------
    X : array-like
        Feature table, or a distance matrix when ``self.metric`` is
        ``'precomputed'``
    y : None
        ignored

    Returns
    -------
    self
        fitted pcoa, with ``ordination_`` and ``embedding_`` set
    """
    if self.metric != 'precomputed':
        # Pairwise distances between the rows of X under the chosen metric.
        distances = cdist(X, X, metric=self.metric)
    else:
        distances = X

    self.ordination_ = pcoa(distances)
    self.embedding_ = self.ordination_.samples
    return self
def setUp(self):
    """Build the ordination and the transposed descriptor table fixtures."""
    # Crawford dataset for unweighted UniFrac
    distance_path = get_data_path('PCoA_sample_data_3')
    self.ordination = pcoa(DistanceMatrix.read(distance_path))

    descriptor_path = get_data_path('PCoA_biplot_descriptors')
    descriptor_table = pd.read_table(descriptor_path, index_col='Taxon')
    self.descriptors = descriptor_table.T
def test_compare_to_rcode():
    """Compare PCA and MDS output with the stored lostruct result files."""
    windows, _ = ls.parse_vcf(vcf_file, "chr1", 95)
    covmat, total_variance, eigenvals, eigenvecs = ls.cov_pca(
        windows[0].todense(), 10, 1)

    results = np.loadtxt("lostruct-results/chr1.filtered.pca.csv",
                         delimiter=",", skiprows=1)

    totalandvalsR = results[0][0:11]
    # A stray trailing comma used to wrap this array in a 1-tuple; corrcoef
    # happened to cope with that, but a plain 1-D array is what is meant.
    totalandvalsPy = np.concatenate(([total_variance], eigenvals))
    # Comes out as 0.9999921929150888
    assert np.corrcoef(totalandvalsR, totalandvalsPy)[0][1] >= 0.99999

    # Squared here, because signs are often opposite between the two analyses.
    eigenvecsR = np.square(results[0][11:61])
    eigenvecsPy = np.square(eigenvecs[0])
    # Comes out as 0.9999921929150888
    assert np.corrcoef(eigenvecsR, eigenvecsPy)[0][1] >= 0.99999

    assert covmat.shape == (50, 50)

    mds_coords = np.loadtxt("lostruct-results/mds_coords.csv",
                            delimiter=",", skiprows=1, usecols=[2])

    # One eigen summary per window, stacked row-wise.
    result = np.vstack([ls.eigen_windows(x, 10, 1) for x in windows])
    pc_dists = ls.get_pc_dists(result)
    mds = pcoa(pc_dists)
    # Comes out as 0.9971509982243156
    assert np.corrcoef(mds.samples['PC1'], mds_coords)[0][1] >= 0.995
def generate_aitchison_pcoa(self, df):
    """
    Compute the Aitchison distance matrix of ``df`` and run PCoA on it.

    Parameters
    ------------
    df: pandas dataframe object
        rows = samples, columns = OTU; each datapoint is either a read
        count or a relative abundance.

    Returns
    ------------
    samples = pandas dataframe object
        the ``samples`` table of the PCoA result, to be visualised
    dist_matrix = pandas dataframe object
        the distance scores used to generate the PCoA values
    """
    dist_matrix = self.aitchison_distance_matrix(df)
    # The row labels of the distance table are used as the matrix IDs.
    ordination_result = ordination.pcoa(
        DistanceMatrix(dist_matrix, dist_matrix.index))
    return ordination_result.samples, dist_matrix
def test_simple(self):
    """pcoa on 'PCoA_sample_data_3' must reproduce the stored results."""
    axis_labels = ['PC%d' % i for i in range(1, 10)]
    sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                  'PC.593', 'PC.355', 'PC.607', 'PC.634']

    expected_eigvals = pd.Series(
        [0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
         0.16054235, 0.15017696, 0.12245775, 0.0],
        index=axis_labels)
    expected_proportions = pd.Series(
        [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
         0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0],
        index=axis_labels)
    expected_samples = pd.DataFrame(
        np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
        index=sample_ids,
        columns=axis_labels)

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=expected_eigvals,
        samples=expected_samples,
        proportion_explained=expected_proportions)

    observed = pcoa(DistanceMatrix.read(get_data_path('PCoA_sample_data_3')))

    # Axis signs are arbitrary, so directionality is ignored.
    assert_ordination_results_equal(observed, expected_results,
                                    ignore_directionality=True)
def test_simple(self):
    """Default pcoa output must equal the stored eigen results."""
    n_axes = 9
    axis_labels = ['PC%d' % i for i in range(1, n_axes + 1)]

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=pd.Series(
            [0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
             0.16054235, 0.15017696, 0.12245775, 0.0],
            index=axis_labels),
        samples=pd.DataFrame(
            np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
            index=['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                   'PC.593', 'PC.355', 'PC.607', 'PC.634'],
            columns=axis_labels),
        proportion_explained=pd.Series(
            [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
             0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0],
            index=axis_labels))

    distances = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    observed = pcoa(distances)

    # The comparison ignores the sign of each axis.
    assert_ordination_results_equal(observed, expected_results,
                                    ignore_directionality=True)
def dm_to_pcoa(dm, sample_md, category):
    """Run PCoA on ``dm`` and plot the samples colored by ``category``.

    ``sample_md`` is passed to the plot as its metadata frame and
    ``category`` as the column to color by.  Nothing is returned.
    """
    plot_title = "Samples colored by %s." % category
    ordination_result = pcoa(dm)
    ordination_result.plot(
        df=sample_md,
        column=category,
        axis_labels=['PC 1', 'PC 2', 'PC 3'],
        title=plot_title,
        s=35,
    )
def on_update(category, metadata, metric):
    """Redraw the PCoA plot for the chosen metric and metadata category.

    The distance matrix is looked up in ``dms`` by ``metric`` and filtered
    against ``metadata`` before ordination.  Nothing is returned.
    """
    filtered_dm, _ = filter_dm_and_map(dms[metric], metadata)
    figure = pcoa(filtered_dm).plot(
        df=metadata,
        column=category,
        axis_labels=['PC 1', 'PC 2', 'PC 3'],
        s=35,
    )
    figure.set_size_inches(12, 9)
def diversity_analysis(wu_dm_list, bc_dm_list):
    """Print the Mantel correlation of the first two matrices, then run PCoA.

    Only the first element of each list is used.  The PCoA result of
    ``wu_dm_list[0]`` is computed but not returned.
    """
    from skbio.stats.distance import mantel

    first_wu = wu_dm_list[0]
    first_bc = bc_dm_list[0]

    # Do the two distance matrices correlate?
    coefficient, p_value, _n = mantel(first_wu, first_bc)
    print("Mantel Correlation COEF=", coefficient)
    print("At significance of 0.05, the p-value for the correlation is = ",
          p_value)

    # Next, principal coordinate analysis (PCoA) on the first matrix of
    # wu_dm_list.
    from skbio.stats.ordination import pcoa
    wu_pc = pcoa(wu_dm_list[0])
def __eapply__(self, experiment):
    """Return a copy of ``experiment`` whose data frame holds PCoA coordinates.

    The coordinates are relabelled with the index of the input data frame
    and transposed before being handed to ``with_data_df``.  The full PCoA
    result is stored under ``metadata['pcoa']`` of the returned experiment.
    """
    data = experiment.data_df
    ordination_result = pcoa(data)

    coordinates = ordination_result.samples
    coordinates.index = data.index  # sample names

    transformed = experiment.with_data_df(coordinates.transpose())
    transformed.metadata['pcoa'] = ordination_result
    return transformed
def create_emperor_pc_file(args, dist, ds_list):
    """Run PCoA on ``dist`` and write the result under ``<basedir>/tmp``.

    The rows of the coordinates table are relabelled with ``ds_list``
    before writing.  Returns the path of the written file, which is
    ``args.prefix`` followed by ``_pc.txt``.
    """
    from skbio.stats.ordination import pcoa

    ordination_result = pcoa(dist)
    ordination_result.samples.index = ds_list

    file_name = args.prefix + '_pc.txt'
    pcfile = os.path.join(args.basedir, 'tmp', file_name)
    ordination_result.write(pcfile)
    return pcfile
def do_pcoa(infile):
    """Run PCoA on the distance matrix in ``infile`` and write it to stdout.

    ``parse_distmat`` supplies the sample IDs and the distance values.
    Nothing is returned.
    """
    samples, distmtx = parse_distmat(infile)
    distmtx = DistanceMatrix(distmtx, ids=samples)
    ord_res = pcoa(distmtx)

    # The coordinates, eigenvalues and proportions used to be copied into
    # locals that were never read; write() emits the whole result anyway.
    ord_res.write(sys.stdout)
def create_emperor_pc_file(args, dist, ds_list):
    """Run PCoA on ``dist`` and write the result into ``args.basedir``.

    The rows of the coordinates table are relabelled with ``ds_list``
    before writing.  The file is named ``args.prefix`` followed by
    ``_pc.txt``; its mode is set to 0o664 on a best-effort basis.

    Returns
    -------
    str
        Path of the written file.
    """
    from skbio.stats.ordination import pcoa

    PCoA_result = pcoa(dist)
    PCoA_result.samples.index = ds_list

    pcfile = os.path.join(args.basedir, args.prefix + '_pc.txt')
    PCoA_result.write(pcfile)

    try:
        os.chmod(pcfile, 0o664)
    except OSError:
        # Best effort only: a failed chmod must not discard the file that
        # was just written.  Only OS-level failures are ignored, so
        # KeyboardInterrupt and programming errors still propagate.
        pass
    return pcfile
def test_centroids_eq_groups(self):
    """Centroid statistic for equal groups must match the stored distances."""
    group_distances = [
        [1.2886811963240687, 1.890538910062923, 1.490527658097728],
        [2.17349240061718, 2.3192679626679946, 2.028338553903792],
    ]
    exp_stat, _ = f_oneway(*group_distances)

    coordinates = pcoa(self.eq_mat).samples

    obs = _compute_groups(coordinates, 'centroid', self.grouping_eq)
    self.assertAlmostEqual(obs, exp_stat, places=6)

    # The relabelled grouping must give the same statistic.
    obs_relab = _compute_groups(coordinates, 'centroid',
                                self.grouping_eq_relab)
    self.assertAlmostEqual(obs_relab, obs, places=6)
def test_centroids_mixedgroups(self):
    """Centroid statistic for the mixed grouping must match stored values."""
    group_distances = [
        [2.5847022428144935, 2.285624595858895, 1.7022431146340287],
        [1.724817266046108, 1.724817266046108],
        [2.4333280644972795, 2.389000390879655, 2.8547180589306036,
         3.218568759338847],
    ]

    coordinates = pcoa(self.uneq_mat).samples
    exp_stat, _ = f_oneway(*group_distances)

    obs_mixed = _compute_groups(coordinates, 'centroid',
                                self.grouping_un_mixed)
    self.assertAlmostEqual(exp_stat, obs_mixed, places=6)
def plot_pcoas(metric):
    """Plot one PCoA figure per column of ``m`` for the given metric.

    The distance matrix is the first file matching
    ``diversity_core_metrics/<ref_db>/rpt_<metric>_dist/*/data/distance-matrix.tsv``.
    ``ref_db`` and ``m`` are read from the enclosing module.

    Parameters
    ----------
    metric : str
        Name of the metric; used in the file pattern and the plot title.
    """
    mpl.rcParams['figure.dpi'] = 100
    mpl.rcParams['figure.figsize'] = 9, 6

    pattern = ('diversity_core_metrics/' + ref_db + '/rpt_' + metric
               + '_dist/*/data/distance-matrix.tsv')
    df = pd.read_csv(glob.glob(pattern)[0], sep='\t', index_col=0)

    dm = DistanceMatrix(df.to_numpy(), df.index.values)
    pc = pcoa(dm)

    # proportion_explained is labelled 'PC1', 'PC2', ...; integer keys on a
    # label-indexed Series are deprecated, so positions go through .iloc.
    var1, var2, var3 = (
        str(round(pc.proportion_explained.iloc[k] * 100, 2))
        for k in range(3)
    )
    axis_labels = ('PC1, ' + var1 + '%',
                   'PC2, ' + var2 + '%',
                   'PC3, ' + var3 + '%')

    for i in m.columns:
        pc.plot(m, i, cmap='Accent', axis_labels=axis_labels,
                title=metric + " PCoA colored by " + i)
def beta_diversity_pcoa(biom_fp, method="braycurtis", permutations=99, dim=2,
                        col='method', colormap={'expected': 'red',
                                                'rdp': 'seagreen',
                                                'sortmerna': 'gray',
                                                'uclust': 'blue',
                                                'blast': 'purple'}):
    '''From biom table, compute a distance matrix (Bray-Curtis by default);
    generate PCoA plot; and run an ANOSIM test between groups.

    biom_fp: path
        Path to biom.Table containing sample metadata.
    method: str
        skbio.Diversity method to use for ordination.
    permutations: int
        Number of permutations to perform for anosim tests.
    dim: int
        Number of dimensions to plot. Currently supports only 2-3 dimensions.
        Any value other than 2 uses the skbio plot.
    col: str
        metadata name to use for distinguishing groups for anosim tests
        and pcoa plots.
    colormap: dict
        map groups names (must be group names in col) to colors used for
        plots. Only read, never modified.

    Returns the tuple (sample metadata, anosim results, pcoa results,
    distance matrix).
    '''
    dm, s_md = make_distance_matrix(biom_fp, method=method)

    # pcoa
    pc = pcoa(dm)

    # anosim tests
    results = anosim(dm, s_md, column=col, permutations=permutations)
    print('R = ', results['test statistic'], '; P = ', results['p-value'])

    if dim == 2:
        # bokeh pcoa plots
        # DataFrame.ix no longer exists in pandas; the selection is by label.
        pc123 = pc.samples.loc[:, ["PC1", "PC2", "PC3"]]
        smd_merge = s_md.merge(pc123, left_index=True, right_index=True)
        # Color by the requested grouping column instead of a hard-coded
        # 'method' (identical for the default col='method').
        smd_merge['Color'] = [colormap[x] for x in smd_merge[col]]
        # Explicitly positional: the first row, whatever its label is.
        title = smd_merge['reference'].iloc[0]
        labels = ['PC {0} ({1:.2f})'.format(d + 1,
                                            pc.proportion_explained.iloc[d])
                  for d in range(0, 2)]
        circle_plot_from_dataframe(smd_merge, "PC1", "PC2", title,
                                   columns=["method", "sample_id", "params"],
                                   color="Color", labels=labels)
    else:
        # skbio pcoa plots
        pcoa_plot_skbio(pc, s_md, col=col)

    return s_md, results, pc, dm
def pcoa(args, dist, ds_list):
    """Run PCoA on ``dist``, write the result to a ``.pc`` file, then exit.

    Parameters
    ----------
    args : object
        ``args.outdir`` and ``args.prefix`` determine the output path
        ``<outdir>/<prefix>.pc``.
    dist : distance matrix accepted by ``skbio.stats.ordination.pcoa``
    ds_list : sequence
        Labels assigned to the rows of the coordinates table.

    This function does not return: it ends with an unconditional
    ``sys.exit()``.  The statements that used to follow the exit could
    never run (and sliced ``np.array(PCoA_result)`` as if it were a table),
    so they have been removed.
    """
    # Local import: inside this function ``pcoa`` names the scikit-bio
    # function, not this wrapper.
    from skbio.stats.ordination import pcoa

    PCoA_result = pcoa(dist)
    PCoA_result.samples.index = ds_list
    print(PCoA_result)

    ordfile = os.path.join(args.outdir, args.prefix + '.pc')
    PCoA_result.write(ordfile)

    # NOTE(review): this exit terminates the whole process, so callers
    # never receive a value.  Confirm whether it is intentional or
    # leftover debugging.
    sys.exit()
def fit_transform(self, data):
    """Project ``data`` with PCoA and return the selected components.

    When ``self.metric.name`` is not ``"precomputed"`` the pairwise
    distances are computed first.  In both cases the data is passed
    through ``self.metric.fit_transform`` before ordination.  The full
    PCoA result is kept on ``self.pcoa``.
    """
    from skbio.stats.ordination import pcoa

    checked = self._check_data(data)
    metric_name = self.metric.name
    if metric_name != "precomputed":
        # Condensed pairwise distances expanded to a square matrix.
        checked = squareform(pdist(checked, metric=self.metric.name))
    distances = self.metric.fit_transform(checked)

    self.pcoa = pcoa(distances)
    return self.pcoa.samples.values[:, self.components]
def pcoa(args, dist, ds_list):
    """Run PCoA on ``dist``, write the result to a ``.pc`` file, then exit.

    Parameters
    ----------
    args : object
        ``args.outdir`` and ``args.prefix`` determine the output path
        ``<outdir>/<prefix>.pc``.
    dist : distance matrix accepted by ``skbio.stats.ordination.pcoa``
    ds_list : sequence
        Labels assigned to the rows of the coordinates table.

    This function does not return: it ends with an unconditional
    ``sys.exit()``.  The statements that used to follow the exit could
    never run (and sliced ``np.array(PCoA_result)`` as if it were a table),
    so they have been removed.
    """
    # Local import: inside this function ``pcoa`` names the scikit-bio
    # function, not this wrapper.
    from skbio.stats.ordination import pcoa

    PCoA_result = pcoa(dist)
    PCoA_result.samples.index = ds_list
    print(PCoA_result)

    ordfile = os.path.join(args.outdir, args.prefix + '.pc')
    PCoA_result.write(ordfile)

    # NOTE(review): this exit terminates the whole process, so callers
    # never receive a value.  Confirm whether it is intentional or
    # leftover debugging.
    sys.exit()
def plot_pcoa(bs_iter = 10000):
    """Ordinate the per-population table and save a two-axis PCoA figure.

    Steps: read ``/data/mult_by_pop.txt`` under ``mt.get_path()``, divide
    every row by its sum, compute Bray-Curtis distances, run a
    3-dimensional PCoA, print an F statistic with a permutation p-value,
    and save the scatter plot to ``/figures/pcoa.png``.

    Parameters
    ----------
    bs_iter : int
        Number of row shuffles used to build the null distribution of F.
    """
    # get F stat and p value
    df = pd.read_csv(mt.get_path() + '/data/mult_by_pop.txt', sep='\t',
                     index_col=0)
    # Divide each row by its own total.  (Indexing a Series with
    # [:, None] is no longer supported by pandas.)
    df = df.div(df.sum(axis=1), axis=0)
    df_bc = pairwise_distances(df, metric='braycurtis')
    df_pcoa = pcoa(df_bc, number_of_dimensions=3)
    ord_matrix = df_pcoa.samples

    F = mt.get_F_2(ord_matrix, 4, 4)
    # Null distribution: the same statistic on row-shuffled coordinates.
    F_nulls = [mt.get_F_2(ord_matrix.sample(frac=1), 4, 4)[0]
               for _ in range(bs_iter)]
    p_value = len([F_null for F_null in F_nulls if F_null > F[0]]) / bs_iter
    print("F = " + str(round(F[0], 4)))
    print("p = " + str(round(p_value, 4)))

    # DataFrame.ix no longer exists in pandas.  The slices are positional:
    # the first four rows are plotted as 'Wildtype', the rest as
    # 'Minimal cell'.
    first_x, first_y = ord_matrix.iloc[0:4, 0], ord_matrix.iloc[0:4, 1]
    rest_x, rest_y = ord_matrix.iloc[4:, 0], ord_matrix.iloc[4:, 1]

    fig, ax = plt.subplots(figsize=(6, 6))
    # Scatterplot on main ax
    ax.axhline(y=0, color='k', linestyle=':', alpha=0.8, zorder=1)
    ax.axvline(x=0, color='k', linestyle=':', alpha=0.8, zorder=2)
    ax.scatter(0, 0, marker="o", edgecolors='none', c='darkgray', s=120,
               zorder=3)
    ax.scatter(first_x, first_y, marker="o", edgecolors='#244162', c='blue',
               alpha=0.8, s=120, zorder=4, label='Wildtype')
    ax.scatter(rest_x, rest_y, marker="o", edgecolors='#244162', c='r',
               alpha=0.8, s=120, zorder=4, label='Minimal cell')

    confidence_ellipse(first_x, first_y, ax, n_std=2, edgecolor='blue',
                       linestyle='--', lw=3)
    confidence_ellipse(rest_x, rest_y, ax, n_std=2, edgecolor='red',
                       linestyle='--', lw=3)

    # Positional access through .iloc: the Series is labelled by axis name.
    explained = df_pcoa.proportion_explained
    ax.set_xlabel('PCo 1 (' + str(round(explained.iloc[0], 3) * 100) + '%)',
                  fontsize=14)
    ax.set_ylabel('PCo 2 (' + str(round(explained.iloc[1], 3) * 100) + '%)',
                  fontsize=14)

    plt.legend(loc="upper right")

    fig_name = mt.get_path() + '/figures/pcoa.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def test_extensive(self):
    """ndarray and DistanceMatrix inputs must both give the stored result."""
    axis_labels = ['PC%d' % i for i in range(1, 7)]
    sample_ids = [str(i) for i in range(6)]

    expected_eigvals = pd.Series(
        [0.3984635, 0.36405689, 0.28804535, 0.27479983, 0.19165361, 0.0],
        index=axis_labels)
    expected_proportions = pd.Series(
        [0.2626621381, 0.2399817314, 0.1898758748, 0.1811445992,
         0.1263356565, 0.0],
        index=axis_labels)
    expected_samples = pd.DataFrame(
        [[-0.028597, 0.22903853, 0.07055272, 0.26163576, 0.28398669, 0.0],
         [0.37494056, 0.22334055, -0.20892914, 0.05057395, -0.18710366, 0.0],
         [-0.33517593, -0.23855979, -0.3099887, 0.11521787, -0.05021553,
          0.0],
         [0.25412394, -0.4123464, 0.23343642, 0.06403168, -0.00482608, 0.0],
         [-0.28256844, 0.18606911, 0.28875631, -0.06455635, -0.21141632,
          0.0],
         [0.01727687, 0.012458, -0.07382761, -0.42690292, 0.1695749, 0.0]],
        index=sample_ids,
        columns=axis_labels)

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=expected_eigvals,
        samples=expected_samples,
        proportion_explained=expected_proportions)

    data = np.loadtxt(get_data_path('PCoA_sample_data_2'))

    # test passing a numpy.ndarray and a DistanceMatrix to pcoa
    # gives same results
    for candidate in (data, DistanceMatrix(data)):
        observed = pcoa(candidate)
        assert_ordination_results_equal(observed, expected_results,
                                        ignore_directionality=True)
def js_PCoA(distributions):
    """Two-dimensional embedding via Jensen-Shannon distances and PCoA

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distributions probabilities; its ``.values`` attribute
        is read.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)
        The first two coordinate columns.
    """
    pairwise = dist.pdist(distributions.values, _jensen_shannon)
    dist_matrix = DistanceMatrix(dist.squareform(pairwise))

    if not skbio_old:
        return pcoa(dist_matrix).samples.values[:, 0:2]

    # Older API: class-based PCoA whose scores expose ``site``.
    data = PCoA(dist_matrix).scores()
    return data.site[:, 0:2]
def test_centroids_uneq_groups(self):
    """
    the expected result here was calculated by hand
    """
    group_distances = [
        [2.5847022428144935, 2.285624595858895, 1.7022431146340287],
        [1.724817266046108, 1.724817266046108],
        [2.4333280644972795, 2.389000390879655, 2.8547180589306036,
         3.218568759338847],
    ]
    exp_stat, _ = f_oneway(*group_distances)

    coordinates = pcoa(self.uneq_mat).samples

    obs = _compute_groups(coordinates, 'centroid', self.grouping_uneq)
    self.assertAlmostEqual(obs, exp_stat, places=6)

    # The relabelled grouping must give the same statistic.
    obs_relab = _compute_groups(coordinates, 'centroid',
                                self.grouping_uneq_relab)
    self.assertAlmostEqual(obs, obs_relab, places=6)
def PCoA_total_from_matrix(distance_matrix, biom_file, metadata_file, plot=False):
    """Run PCoA on ``distance_matrix`` and plot samples colored by body site.

    Sample IDs come from ``biom_file`` and metadata from ``metadata_file``.
    When ``plot`` is true the figure is shown and nothing is returned;
    otherwise the figure is returned.
    """
    sample_ids = BW.extract_samples(biom_file)
    sk_distance_matrix = DistanceMatrix(distance_matrix, sample_ids)

    pd_metadata = pd.DataFrame.from_dict(
        meta.extract_metadata(metadata_file), orient='index')

    result = pcoa(sk_distance_matrix)

    # One label per plotted axis, with the percentage it explains.
    axis_labels = tuple(
        'PC %d (' % (k + 1)
        + str(round(result.proportion_explained.iloc[k] * 100, 2))
        + '%)'
        for k in range(3)
    )
    fig = result.plot(df=pd_metadata, column='body_site',
                      axis_labels=axis_labels,
                      title='Samples colored by body site',
                      cmap='Set1', s=50)
    fig.set_size_inches(18.5, 10.5)

    if not plot:
        return fig
    plt.show()
def test_median_normal(self):
    """permdisp (median) must agree for a matrix and for its PCoA result."""
    expected = pd.Series(
        index=self.exp_index,
        data=['PERMDISP', 'F-value', 9, 2, 0.139475441876, 0.61, 99],
        name='PERMDISP results')
    permdisp_kwargs = dict(test='median', permutations=99)

    # From the distance matrix directly.
    np.random.seed(0)
    from_matrix = permdisp(self.unifrac_dm, self.unif_grouping,
                           **permdisp_kwargs)
    self.assert_series_equal(from_matrix, expected)

    # From a precomputed ordination, with the same seed.
    np.random.seed(0)
    ordination_result = pcoa(self.unifrac_dm)
    from_ordination = permdisp(ordination_result, self.unif_grouping,
                               **permdisp_kwargs)
    self.assert_series_equal(from_ordination, expected)
def test_median_fsvd(self):
    """permdisp (median, fsvd) must agree for a matrix and its ordination."""
    expected = pd.Series(
        index=self.exp_index,
        data=['PERMDISP', 'F-value', 9, 2, 0.04078077215673714, 0.8, 99],
        name='PERMDISP results')
    fsvd_kwargs = dict(method='fsvd', number_of_dimensions=3)

    # From the distance matrix, letting permdisp run fsvd itself.
    np.random.seed(0)
    from_matrix = permdisp(self.unifrac_dm, self.unif_grouping,
                           test='median', permutations=99, **fsvd_kwargs)
    self.assert_series_equal(from_matrix, expected)

    # From a precomputed fsvd ordination, with the same seed.
    np.random.seed(0)
    ordination_result = pcoa(self.unifrac_dm, **fsvd_kwargs)
    from_ordination = permdisp(ordination_result, self.unif_grouping,
                               test='median', permutations=99)
    self.assert_series_equal(from_ordination, expected)
def PCoA_group_from_matrix(distance_matrix, biom_file, groups, plot=False):
    """Run PCoA on ``distance_matrix`` and color each sample by its group.

    Samples are identified by their position (``'0'``, ``'1'``, ...) and
    ``groups[i]`` is the group of sample ``i``.  ``biom_file`` is accepted
    but not used.  When ``plot`` is true the figure is shown and nothing
    is returned; otherwise the figure is returned.
    """
    positions = range(len(groups))
    ids = [str(i) for i in positions]
    sk_distance_matrix = DistanceMatrix(distance_matrix, ids)

    metadata = {str(i): {'body_site': groups[i]} for i in positions}
    pd_metadata = pd.DataFrame.from_dict(metadata, orient='index')

    result = pcoa(sk_distance_matrix)

    # One label per plotted axis, with the percentage it explains.
    axis_labels = tuple(
        'PC %d (' % (k + 1)
        + str(round(result.proportion_explained.iloc[k] * 100, 2))
        + '%)'
        for k in range(3)
    )
    fig = result.plot(df=pd_metadata, column='body_site',
                      axis_labels=axis_labels,
                      title='Samples colored by body site',
                      cmap='Set1', s=50)
    fig.set_size_inches(18.5, 10.5)

    if not plot:
        return fig
    plt.show()
def PCoA_total_from_matrix_clustering(distance_matrix, biom_file, assignments, plot=False):
    """Run PCoA on ``distance_matrix`` and color samples by cluster.

    Parameters
    ----------
    distance_matrix : data accepted by ``DistanceMatrix``
    biom_file : passed to ``BW.extract_samples`` to obtain the sample IDs
    assignments : sequence of int
        ``assignments[i]`` is the zero-based cluster of sample ``i``; it is
        displayed as ``'Group <n + 1>'``.
    plot : bool
        When true the figure is shown and ``None`` is returned; otherwise
        the figure is returned.
    """
    # Extract the sample IDs once and reuse them for both the matrix and
    # the metadata; previously the file was read a second time.
    samples = BW.extract_samples(biom_file)
    sk_distance_matrix = DistanceMatrix(distance_matrix, samples)

    metadata = {samples[i]: {'body_site': 'Group ' + str(assignments[i]+1)}
                for i in range(len(assignments))}
    pd_metadata = pd.DataFrame.from_dict(metadata, orient='index')

    result = pcoa(sk_distance_matrix)

    # One label per plotted axis, with the percentage it explains.
    axis_labels = tuple(
        'PC %d (' % (k + 1)
        + str(round(result.proportion_explained.iloc[k] * 100, 2))
        + '%)'
        for k in range(3)
    )
    fig = result.plot(df=pd_metadata, column='body_site',
                      axis_labels=axis_labels,
                      title='Samples colored by body site',
                      cmap='Set1', s=50)
    fig.set_size_inches(18.5, 10.5)

    if plot:
        plt.show()
    else:
        return fig
def fit(self, X, y=None):
    """
    Fit PCoA on a precomputed distance matrix.

    Parameters
    ----------
    X : array-like
        Feature table or distance matrix; only the ``'precomputed'``
        metric is implemented.
    y : None
        ignored

    Returns
    -------
    self
        fitted pcoa, with ``embedding_`` set

    Raises
    ------
    NotImplementedError
        When ``self.metric`` is anything other than ``'precomputed'``.
    """
    if self.metric != 'precomputed':
        raise NotImplementedError()

    self.embedding_ = pcoa(X).samples
    return self
def test_extensive(self):
    """pcoa must give the stored result for array and DistanceMatrix input."""
    n_axes = 6
    axis_labels = ['PC%d' % i for i in range(1, n_axes + 1)]

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=pd.Series(
            [0.3984635, 0.36405689, 0.28804535, 0.27479983, 0.19165361,
             0.0],
            index=axis_labels),
        samples=pd.DataFrame(
            [[-0.028597, 0.22903853, 0.07055272, 0.26163576, 0.28398669,
              0.0],
             [0.37494056, 0.22334055, -0.20892914, 0.05057395, -0.18710366,
              0.0],
             [-0.33517593, -0.23855979, -0.3099887, 0.11521787, -0.05021553,
              0.0],
             [0.25412394, -0.4123464, 0.23343642, 0.06403168, -0.00482608,
              0.0],
             [-0.28256844, 0.18606911, 0.28875631, -0.06455635, -0.21141632,
              0.0],
             [0.01727687, 0.012458, -0.07382761, -0.42690292, 0.1695749,
              0.0]],
            index=[str(i) for i in range(n_axes)],
            columns=axis_labels),
        proportion_explained=pd.Series(
            [0.2626621381, 0.2399817314, 0.1898758748, 0.1811445992,
             0.1263356565, 0.0],
            index=axis_labels))

    raw = np.loadtxt(get_data_path('PCoA_sample_data_2'))

    # Passing a numpy.ndarray and passing a DistanceMatrix must give the
    # same results.
    inputs = (raw, DistanceMatrix(raw))
    for candidate in inputs:
        observed = pcoa(candidate)
        assert_ordination_results_equal(observed, expected_results,
                                        ignore_directionality=True)
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import pairwise_distances
from skbio.stats.ordination import pcoa

# Compute Jaccard distances
# bcto_matrix = bcto_cover.as_matrix()
# bcto_distances = pairwise_distances(bcto_matrix,metric='jaccard')
# pcoa_of_distance = pcoa(bcto_distances)
# pcoa_of_distance.plot()
# plt.show()
# print pcoa_of_distance

# firm_matrix = firm_cover.as_matrix()
# firm_distances = pairwise_distances(firm_matrix,metric='jaccard')
# pcoa_of_distance = pcoa(firm_distances)
# pcoa_of_distance.plot()
# plt.show()
# print pcoa_of_distance

# Read the combined coverage table, compute pairwise Jaccard distances
# between its rows, and show the PCoA plot of those distances.
asf_bcto_firm_cover = pd.read_csv(
    '../results/metagenome_coverage/asf_bcto_firm_mg_coverage.tsv', sep='\t')
# DataFrame.as_matrix() was removed from pandas; .values returns the same
# ndarray and is available in every pandas version.
asf_bcto_firm_matrix = asf_bcto_firm_cover.values
asf_bcto_firm_distances = pairwise_distances(asf_bcto_firm_matrix,
                                             metric='jaccard')
pcoa_of_distance = pcoa(asf_bcto_firm_distances)
pcoa_of_distance.plot()
plt.show()
def test_centroids_null(self):
    """The centroid statistic of the null matrix must be NaN."""
    dm = pcoa(self.null_mat)
    dm = dm.samples

    obs_null = _compute_groups(dm, 'centroid', self.grouping_eq)
    # The result of np.isnan() used to be discarded, so this test could
    # never fail; assert on it explicitly.
    self.assertTrue(np.isnan(obs_null))
# Build a pairwise distance table between samples with rig() over the
# cosine network, then ordinate it with PCoA and write the result.
xls_file = "../data/Coral_ChemiFRAC_test.xlsx"

# pandas renamed the ``sheetname`` keyword to ``sheet_name``.
table = pd.read_excel(xls_file, sheet_name=1, index_col=0).T
edges = pd.read_excel(xls_file, sheet_name=0)

maxID = max([edges["CLUSTERID1"].max(), edges["CLUSTERID2"].max()]) + 1
spm = coo_matrix(
    (edges["Cosine"].values,
     (edges["CLUSTERID1"].values, edges["CLUSTERID2"].values)),
    shape=(maxID, maxID)
)
# NOTE(review): from_scipy_sparse_matrix is not available in recent
# NetworkX releases; confirm the pinned version.
coral_nwk = nx.from_scipy_sparse_matrix(spm)

meta_map = pd.read_table("../data/%s" % meta_file)
small_table = table

# Loop-invariant, so looked up once instead of on every iteration.
sampIDs = meta_map["#SampleID"].values

dm = pd.DataFrame(columns=meta_map.index, index=meta_map.index)
# Fill only the pairs with j < i; the table is symmetrised below.
for i in range(len(meta_map.index)):
    for j in range(i):
        _x, _y = sampIDs[i], sampIDs[j]
        x = small_table.loc[_x, :]
        y = small_table.loc[_y, :]
        dm.loc[_x, _y] = rig(coral_nwk, x, y)

dm.to_csv("../results/rig.txt", sep="\t")
# The file is tab-separated, so it has to be read back with the same
# separator (the default is a comma).
dm = pd.read_csv("../results/rig.txt", sep="\t", index_col=0)
# reindex instead of .loc: the first sample never appears as a row label
# and the last never as a column label, and .loc rejects missing labels.
dm = dm.reindex(index=sampIDs, columns=sampIDs)
dm = dm.fillna(0)

# Lower triangle plus its transpose gives the full symmetric matrix.
dmpc = pcoa(dm + dm.T)
dmpc.samples.index = sampIDs
dmpc.write("../results/rig_pc.txt")
def test_fsvd(self):
    """Check fsvd against eigh, the inplace flag and dimension edge cases."""
    dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    dm3 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    compare_kwargs = dict(ignore_directionality=True,
                          ignore_method_names=True)

    # Test eigh vs. fsvd pcoa and inplace parameter
    expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
                            inplace=False)
    results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                   inplace=False)
    results_inplace = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                           inplace=True)

    assert_ordination_results_equal(results, expected_results,
                                    **compare_kwargs)
    assert_ordination_results_equal(results, results_inplace,
                                    **compare_kwargs)

    # Test number_of_dimensions edge cases: 0 must give the same result
    # as asking for every dimension.
    results2 = pcoa(dm3, method="fsvd", number_of_dimensions=0,
                    inplace=False)
    expected_results2 = pcoa(dm3, method="fsvd",
                             number_of_dimensions=dm3.data.shape[0],
                             inplace=False)
    assert_ordination_results_equal(results2, expected_results2,
                                    **compare_kwargs)

    # Too many or a negative number of dimensions must raise ValueError
    # for both methods.
    for method_name in ("fsvd", "eigh"):
        with self.assertRaises(ValueError):
            dim_too_large = dm1.data.shape[0] + 10
            pcoa(dm2, method=method_name,
                 number_of_dimensions=dim_too_large)

        with self.assertRaises(ValueError):
            pcoa(dm2, method=method_name, number_of_dimensions=-1)

    # fsvd with number_of_dimensions=0 on the larger matrix must warn.
    dm_big = DistanceMatrix.read(get_data_path('PCoA_sample_data_12dim'))
    with self.assertWarnsRegex(RuntimeWarning,
                               "no value for number_of_dimensions"):
        pcoa(dm_big, method="fsvd", number_of_dimensions=0)
def plot_mds(
    self,
    rank="auto",
    metric="braycurtis",
    method="pcoa",
    title=None,
    xlabel=None,
    ylabel=None,
    color=None,
    size=None,
    tooltip=None,
    return_chart=False,
    label=None,
):
    """Plot beta diversity distance matrix using multidimensional scaling (MDS).

    Parameters
    ----------
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.
    metric : {'braycurtis', 'manhattan', 'jaccard', 'unifrac', 'unweighted_unifrac'}, optional
        Function to use when calculating the distance between two samples.
    method : {'pcoa', 'smacof'}
        Algorithm to use for ordination. PCoA uses eigenvalue decomposition and is not well
        suited to non-euclidean distance functions. SMACOF is an iterative optimization strategy
        that can be used as an alternative.
    title : `string`, optional
        Text label at the top of the plot.
    xlabel : `string`, optional
        Text label along the horizontal axis.
    ylabel : `string`, optional
        Text label along the vertical axis.
    size : `string` or `tuple`, optional
        A string or a tuple containing strings representing metadata fields. The size of points
        in the resulting plot will change based on the metadata associated with each sample.
    color : `string` or `tuple`, optional
        A string or a tuple containing strings representing metadata fields. The color of points
        in the resulting plot will change based on the metadata associated with each sample.
    tooltip : `string` or `list`, optional
        A string or list containing strings representing metadata fields. When a point in the
        plot is hovered over, the value of the metadata associated with that sample will be
        displayed in a modal. A list passed in is copied, not modified.
    return_chart : `bool`, optional
        When True, the chart is returned instead of being displayed.
    label : `string` or `callable`, optional
        A metadata field (or function) used to label each analysis. If passing a function, a
        dict containing the metadata for each analysis is passed as the first and only
        positional argument. The callable function must return a string.

    Returns
    -------
    The chart when `return_chart` is True; otherwise the chart is displayed and nothing is
    returned.

    Raises
    ------
    OneCodexException
        If fewer than 2 results are available, or `method` is not 'smacof' or 'pcoa'.

    Examples
    --------
    Scatter plot of weighted UniFrac distance between all our samples, using counts at the genus
    level.

    >>> plot_mds(rank='genus', metric='unifrac')

    Notes
    -----
    **For `smacof`**: The values reported on the axis labels are Pearson's correlations between
    the distances between points on each axis alone, and the corresponding distances in the
    distance matrix calculated using the user-specified metric. These values are related to the
    effectiveness of the MDS algorithm in placing points on the scatter plot in such a way that
    they truly represent the calculated distances. They do not reflect how well the distance
    metric captures similarities between the underlying data (in this case, an OTU table).
    """
    if len(self._results) < 2:
        raise OneCodexException("`plot_mds` requires 2 or more valid classification results.")

    dists = self._compute_distance(rank, metric).to_data_frame()

    # here we figure out what to put in the tooltips and get the appropriate data
    if tooltip:
        # Copy a caller-supplied list: the inserts below used to modify the
        # caller's own object.
        tooltip = list(tooltip) if isinstance(tooltip, list) else [tooltip]
    else:
        tooltip = []

    tooltip.insert(0, "Label")

    if color and color not in tooltip:
        tooltip.insert(1, color)

    if size and size not in tooltip:
        tooltip.insert(2, size)

    magic_metadata, magic_fields = self._metadata_fetch(tooltip, label=label)

    if method == "smacof":
        # adapted from https://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html
        x_field = "MDS1"
        y_field = "MDS2"

        seed = np.random.RandomState(seed=3)
        mds = manifold.MDS(
            max_iter=3000, eps=1e-12, random_state=seed, dissimilarity="precomputed", n_jobs=1
        )
        pos = mds.fit(dists).embedding_
        plot_data = pd.DataFrame(pos, columns=[x_field, y_field], index=dists.index)
        # scale each axis by its largest absolute value, i.e. into [-1, 1]
        plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1)

        # determine how much of the original distance is captured by each of the axes after MDS.
        # this implementation of MDS does not use eigen decomposition and so there's no simple
        # way of returning a 'percent of variance explained' value
        r_squared = []

        for axis in [0, 1]:
            mds_dist = pos.copy()
            mds_dist[::, axis] = 0
            mds_dist = squareform(euclidean_distances(mds_dist).round(6))
            r_squared.append(pearsonr(mds_dist, squareform(dists))[0])

        # label the axes
        x_extra_label = "r² = %.02f" % (r_squared[0],)
        y_extra_label = "r² = %.02f" % (r_squared[1],)
    elif method == "pcoa":
        # suppress eigenvalue warning from skbio--not because it's an invalid warning, but
        # because lots of folks in the field run pcoa on these distances functions, even if
        # statistically inappropriate. perhaps this will change if we ever become more
        # opinionated about the analyses that we allow our users to do (roo)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            ord_result = ordination.pcoa(
                dists.round(6)
            )  # round to avoid float precision errors

        plot_data = ord_result.samples.iloc[:, [0, 1]]  # get first two components
        # scale each axis by its largest absolute value, i.e. into [-1, 1]
        plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1)
        plot_data.index = dists.index
        x_field, y_field = plot_data.columns.tolist()  # name of first two components

        # .iloc keeps the lookup positional; integer keys on a Series that
        # is labelled by component name are deprecated in pandas.
        x_extra_label = "%0.02f%%" % (ord_result.proportion_explained.iloc[0] * 100,)
        y_extra_label = "%0.02f%%" % (ord_result.proportion_explained.iloc[1] * 100,)
    else:
        raise OneCodexException("MDS method must be one of: smacof, pcoa")

    # label the axes
    if xlabel is None:
        xlabel = "{} ({})".format(x_field, x_extra_label)
    if ylabel is None:
        ylabel = "{} ({})".format(y_field, y_extra_label)

    plot_data = pd.concat([plot_data, magic_metadata], axis=1).reset_index()

    alt_kwargs = dict(
        x=alt.X(x_field, axis=alt.Axis(title=xlabel)),
        y=alt.Y(y_field, axis=alt.Axis(title=ylabel)),
        tooltip=[magic_fields[t] for t in tooltip],
        href="url:N",
        url="https://app.onecodex.com/classification/" + alt.datum.classification_id,
    )

    # only add these parameters if they are in use
    if color:
        alt_kwargs["color"] = magic_fields[color]
    if size:
        alt_kwargs["size"] = magic_fields[size]

    chart = (
        alt.Chart(plot_data)
        .transform_calculate(url=alt_kwargs.pop("url"))
        .mark_circle()
        .encode(**alt_kwargs)
    )

    if title:
        chart = chart.properties(title=title)

    if return_chart:
        return chart
    else:
        chart.interactive().display()
def test_invalid_input(self):
    """pcoa must reject input that is not a valid dissimilarity matrix."""
    # The 2x2 literal has a non-zero diagonal (1, 4) and unequal
    # off-diagonal entries (2 vs 3); the test only asserts that handing
    # it to pcoa raises DissimilarityMatrixError.
    bad_matrix = [[1, 2], [3, 4]]
    npt.assert_raises(DissimilarityMatrixError, pcoa, bad_matrix)
#svgfile = os.path.join('/Users/avoorhis/programming/jupyter/VAMPS_API',args.prefix+'_dendrogram.svg') svgfile = os.path.join(args.outdir,args.prefix+'_dendrogram.svg') print(os.getcwd()) #print svgfile print('rendering0') rooted_tree.render(svgfile, tree_style=ts) # writes file to tmp if args.function == 'pcoa_3d': #print('starting pcoa_3d') from skbio import DistanceMatrix dm = DistanceMatrix(dm1) #print(dm1) #print('end pcoa_3d') pcoa_data = pcoa(args, dm, datasets) #test_PCoA() if args.function == 'pcoa_2d': # if not args.metadata: # print "ERROR: In PCoA and no metadata recieved" # sys.exit() pcoa_data = pcoa(args, dm3) #print json.dumps(pcoa_data) #metadata = json.loads( args.metadata.strip("'") ) pcoa_pdf(args, pcoa_data) #print pcoa_data
def permdisp(distance_matrix, grouping, column=None, test='median',
             permutations=999):
    """Test for Homogeneity of Multivariate Groups Dispersions.

    Uses Marti Anderson's PERMDISP2 procedure.

    PERMDISP is a multivariate analogue of Levene's test for homogeneity of
    multivariate variances. Distances are handled by reducing the original
    distances to principal coordinates. PERMDISP calculates an F-statistic
    to assess whether the dispersions between groups are significant.

    Parameters
    ----------
    distance_matrix : DistanceMatrix
        Distance matrix containing distances between objects (e.g.,
        distances between samples of microbial communities).
    grouping : 1-D array_like or pandas.DataFrame
        Vector indicating the assignment of objects to groups. For example,
        these could be strings or integers denoting which group an object
        belongs to. If `grouping` is 1-D ``array_like``, it must be the same
        length and in the same order as the objects in `distance_matrix`.
        If `grouping` is a ``DataFrame``, the column specified by `column`
        will be used as the grouping vector. The ``DataFrame`` must be
        indexed by the IDs in `distance_matrix` (i.e., the row labels must
        be distance matrix IDs), but the order of IDs between
        `distance_matrix` and the ``DataFrame`` need not be the same. All
        IDs in the distance matrix must be present in the ``DataFrame``.
        Extra IDs in the ``DataFrame`` are allowed (they are ignored in the
        calculations).
    column : str, optional
        Column name to use as the grouping vector if `grouping` is a
        ``DataFrame``. Must be provided if `grouping` is a ``DataFrame``.
        Cannot be provided if `grouping` is 1-D ``array_like``.
    test : {'centroid', 'median'}
        Determines whether the analysis is done using the centroid or the
        spatial median.
    permutations : int, optional
        Number of permutations to use when assessing statistical
        significance. Must be greater than or equal to zero. If zero,
        statistical significance calculations will be skipped and the
        p-value will be ``np.nan``.

    Returns
    -------
    pandas.Series
        Results of the statistical test, including ``test statistic`` and
        ``p-value``.

    Raises
    ------
    TypeError
        If, when using the spatial median test, the pcoa ordination is not
        of type np.float32 or np.float64, the spatial median function will
        fail and the centroid test should be used instead
    ValueError
        If the test is not centroid or median.
    TypeError
        If the distance matrix is not an instance of a
        ``skbio.DistanceMatrix``.
    ValueError
        If there is only one group
    ValueError
        If a list and a column name are both provided
    ValueError
        If a list is provided for `grouping` and its length does not match
        the number of ids in distance_matrix
    ValueError
        If all of the values in the grouping vector are unique
    KeyError
        If there are ids in grouping that are not in distance_matrix

    See Also
    --------
    permanova
    anosim

    Notes
    -----
    The significance of the results from this function will be the same as
    the results found in vegan's betadisper, however due to floating point
    variability the F-statistic results may vary slightly.

    See [1]_ for the original method reference, as well as
    ``vegan::betadisper``, available in R's vegan package [2]_.

    References
    ----------
    .. [1] Anderson, Marti J. "Distance-Based Tests for Homogeneity of
        Multivariate Dispersions." Biometrics 62 (2006):245-253

    .. [2] http://cran.r-project.org/web/packages/vegan/index.html

    Examples
    --------
    Load a 6x6 distance matrix and grouping vector denoting 2 groups of
    objects:

    >>> from skbio import DistanceMatrix
    >>> dm = DistanceMatrix([[0,    0.5,  0.75, 1, 0.66, 0.33],
    ...                       [0.5,  0,    0.25, 0.33, 0.77, 0.61],
    ...                       [0.75, 0.25, 0,    0.1,  0.44, 0.55],
    ...                       [1,    0.33, 0.1,  0,    0.75, 0.88],
    ...                       [0.66, 0.77, 0.44, 0.75, 0,    0.77],
    ...                       [0.33, 0.61, 0.55, 0.88, 0.77, 0]],
    ...                       ['s1', 's2', 's3', 's4', 's5', 's6'])
    >>> grouping = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

    Run PERMDISP using 99 permutations to calculate the p-value:

    >>> from skbio.stats.distance import permdisp
    >>> import numpy as np
    >>> # make output deterministic, should not be included during normal use
    >>> np.random.seed(0)
    >>> permdisp(dm, grouping, permutations=99)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             1.03296
    p-value                       0.35
    number of permutations          99
    Name: PERMDISP results, dtype: object

    The return value is a ``pandas.Series`` object containing the results of
    the statistical test.

    To suppress calculation of the p-value and only obtain the F statistic,
    specify zero permutations:

    >>> permdisp(dm, grouping, permutations=0)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             1.03296
    p-value                        NaN
    number of permutations           0
    Name: PERMDISP results, dtype: object

    PERMDISP computes variances based on two types of tests, using either
    centroids or spatial medians, also commonly referred to as a geometric
    median. The spatial median is thought to yield a more robust test
    statistic, and this test is used by default. Spatial medians are
    computed using an iterative algorithm to find the optimally minimum
    point from all other points in a group while centroids are computed
    using a deterministic formula. As such the two different tests yield
    slightly different F statistics.

    >>> np.random.seed(0)
    >>> permdisp(dm, grouping, test='centroid', permutations=6)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             3.67082
    p-value                   0.428571
    number of permutations           6
    Name: PERMDISP results, dtype: object

    You can also provide a ``pandas.DataFrame`` and a column denoting the
    grouping instead of a grouping vector. The following DataFrame's
    Grouping column specifies the same grouping as the vector we used in the
    previous examples:

    >>> import pandas as pd
    >>> df = pd.DataFrame.from_dict(
    ...      {'Grouping': {'s1': 'G1', 's2': 'G1', 's3': 'G1', 's4': 'G2',
    ...                    's5': 'G2', 's6': 'G2'}})
    >>> permdisp(dm, df, 'Grouping', permutations=6, test='centroid')
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             3.67082
    p-value                   0.428571
    number of permutations           6
    Name: PERMDISP results, dtype: object

    Note that when providing a ``DataFrame``, the ordering of rows and/or
    columns does not affect the grouping vector that is extracted. The
    ``DataFrame`` must be indexed by the distance matrix IDs (i.e., the row
    labels must be distance matrix IDs).

    If IDs (rows) are present in the ``DataFrame`` but not in the distance
    matrix, they are ignored. For example, had the ``DataFrame`` above also
    contained a row for an ``s7`` ID, only the 6 objects present in the
    distance matrix would have been used in the test (the "sample size" row
    of the results reports how many objects were used). Thus, the
    ``DataFrame`` can be a superset of the distance matrix IDs. Note that
    the reverse is not true: IDs in the distance matrix *must* be present in
    the ``DataFrame`` or an error will be raised.

    PERMDISP should be used to determine whether the dispersions between the
    groups in your distance matrix are significantly separated.
    A non-significant test result indicates that group dispersions are
    similar to each other. PERMANOVA or ANOSIM should then be used in
    conjunction to determine whether clustering within groups is
    significant.

    """
    # Reject an unknown test name before doing any other work.
    if test not in ('centroid', 'median'):
        raise ValueError('Test must be centroid or median')

    # Validate the distance matrix and grouping *before* running the
    # ordination, so that bad input is reported by the input checks (see
    # "Raises") instead of by whatever pcoa happens to raise on it, and no
    # ordination is computed for input that is going to be rejected.
    # The last two returned values are not needed by PERMDISP.
    sample_size, num_groups, grouping, _, _ = _preprocess_input(
        distance_matrix, grouping, column)

    # Reduce the distances to principal coordinates; the per-sample
    # coordinates are what the group dispersions are computed from.
    samples = pcoa(distance_matrix).samples

    # Bind the fixed arguments so the Monte Carlo driver only has to supply
    # the (possibly permuted) grouping vector.
    test_stat_function = partial(_compute_groups, samples, test)

    stat, p_value = _run_monte_carlo_stats(test_stat_function, grouping,
                                           permutations)

    return _build_results('PERMDISP', 'F-value', sample_size, num_groups,
                          stat, p_value, permutations)
index=samples, columns=samples) graph_dm.to_csv('../results/aitchison.txt', '\t') # Read in graph_dm graph_dm = pd.read_csv('../results/unconnected_aitchison.txt', sep='\t', index_col=0) # table = pd.read_table('../data/skinmap_chemiFrac_test.txt', # sep='\t', index_col=0) graph_dm.index = table.columns graph_dm.columns = table.columns # _dm = pw_distances('braycurtis', table.values, table.index.values) # _dm.write('../results/braycurtis.txt') _dm = DistanceMatrix(graph_dm.values + graph_dm.values.T) _dm.ids = graph_dm.index pcoa_v = pcoa(_dm) fig = plt.figure(3) plt.plot(pcoa_v.samples['PC1'], pcoa_v.samples['PC2'], 'ob') # plt.plot(pcoa_v.eigvecs[not_stressed, 0], # pcoa_v.eigvecs[not_stressed, 1], # 'o', c='#FFFFFF', label='Before stress') # plt.plot(pcoa_v.eigvecs[stressed, 0], # pcoa_v.eigvecs[stressed, 1], # 'o', c='#999999', label='After stress') # plt.legend(loc=3) #plt.title('Weighted Aitchison on Coral data') #fig.savefig('../results/coral_chemifrac.png') pcoa_v.write('../results/coral_unconnected_aitchison_pcoa.txt')