def pcoa(file):
    """Run a principal coordinates analysis on a distance-matrix file.

    The input is read with ``parse_distmat``; the resulting axes are ordered
    by decreasing share of the total absolute eigenvalue before being handed
    to ``format_coords``.

    :param file: distance-matrix input, passed unchanged to ``parse_distmat``
    :returns: whatever ``format_coords`` returns for the sample names, the
        transposed coordinates, the eigenvalues and the percentages
    """
    samples, distmtx = parse_distmat(file)

    # Each row of axis_rows holds the coordinates along one axis.
    axis_rows, eigvals = ms.principal_coordinates_analysis(distmtx)

    magnitudes = numpy.abs(eigvals)
    pcnts = (magnitudes / sum(magnitudes)) * 100

    # Largest percentage first; the same permutation is applied to all three.
    order = pcnts.argsort()[::-1]
    return format_coords(samples,
                         axis_rows[order].T,
                         eigvals[order],
                         pcnts[order])
def test_principal_coordinate_analysis(self):
    """principal_coordinate_analysis returns array of principal coors"""
    # The expected numbers come from the book example (see intro info):
    # the analysis was run on it, the result was plotted, and it looked
    # right.
    coords, eigenvalues = principal_coordinates_analysis(self.real_matrix)

    # Put the axis with the largest eigenvalue first.
    descending = eigenvalues.argsort()[::-1]
    coords = coords[descending]
    eigenvalues = eigenvalues[descending]

    self.assertEqual(len(coords), 14)
    # Only magnitudes are compared, so the sign of an axis does not matter.
    for row, expected in ((0, 0.240788133045), (1, 0.233677162)):
        self.assertFloatEqual(abs(coords[row, 0]), expected)
def test_principal_coordinate_analysis(self):
    """principal_coordinate_analysis returns array of principal coors"""
    # Expected values were obtained by running the analysis on the example
    # in the book (see intro info); the plotted result looked right.
    result = principal_coordinates_analysis(self.real_matrix)
    axes, eigenvalues = result

    # Reorder so the largest eigenvalue (and its axis) comes first.
    largest_first = eigenvalues.argsort()[::-1]
    axes, eigenvalues = axes[largest_first], eigenvalues[largest_first]

    self.assertEqual(len(axes), 14)
    # abs() makes the check independent of each axis' sign.
    first_axis_value = abs(axes[0, 0])
    second_axis_value = abs(axes[1, 0])
    self.assertFloatEqual(first_axis_value, 0.240788133045)
    self.assertFloatEqual(second_axis_value, 0.233677162)
def hmp_pcoa(biom_path, map_path, distance="hellinger"):
    """Compute a PCoA ordination for the data loaded from two input paths.

    The data matrix returned by ``load_data`` is converted to a distance
    matrix with ``distance_transform.dist_<distance>`` and then passed to
    ``ms.principal_coordinates_analysis``. The axes, eigenvalues and
    percentages are sorted by decreasing share of total absolute eigenvalue.

    @biom_path: first argument passed to ``load_data``
        (presumably a BIOM table path -- TODO confirm)
    @map_path: second argument passed to ``load_data``
        (presumably a mapping-file path -- TODO confirm)
    @distance: suffix selecting the ``dist_<distance>`` function on
        ``distance_transform``; an unknown name raises AttributeError
    @return: None
    """
    # load_data also returns three label/class values that are not needed.
    data, _labn, _labs, _classes = load_data(biom_path, map_path)

    dist_mtrx_fcn = getattr(distance_transform, 'dist_' + distance)
    dist_mtrx = dist_mtrx_fcn(data)

    coords, eigvals = ms.principal_coordinates_analysis(dist_mtrx)

    # Percentage of the total absolute eigenvalue carried by each axis.
    abs_eigvals = np.abs(eigvals)
    pcnts = (abs_eigvals / sum(abs_eigvals)) * 100

    # Apply one descending-percentage permutation to all three arrays.
    idxs_descending = pcnts.argsort()[::-1]
    coords = coords[idxs_descending]
    eigvals = eigvals[idxs_descending]
    pcnts = pcnts[idxs_descending]

    # NOTE(review): the sorted coords/eigvals/pcnts are computed but never
    # returned or stored. The None return is preserved so existing callers
    # are unaffected -- confirm whether results should be returned instead.
    return None
def __init__(self, dissimilarity_mtx, initial_pts="pcoa",
             dimension=2, rand_seed=None, optimization_method=1,
             verbosity=1, max_iterations=50, setup_only=False,
             min_rel_improvement=1e-3, min_abs_stress=1e-5):
    """Set up the NMDS state and iterate until the stress converges.

    Arguments:
    - dissimilarity_mtx: an n by n numpy float array representing the
    pairwise dissimilarity of items.  0 on diagonals, symmetric under
    (i,j) -> (j,i)
    - initial_pts: "random" => random starting points, "pcoa" => pts from
    pcoa, or a numpy 2d array, ncols = dimension
    - dimension: the desired dimension k of the constructed points
    - rand_seed: used for testing; when not None it is passed to seed()
    - optimization_method: used when points are adjusted to minimize
    stress:
    0 => justin k's ad hoc method of steepest descent
    1 => cogent's scipy_optimize fmin_bfgs
    - verbosity: values >= 1 print progress messages while iterating
    - max_iterations: upper bound on the number of optimization rounds
    - setup_only: if true, return right after the initial stress has been
    computed, without running any optimization round
    - min_rel_improvement: iteration stops once the relative decrease in
    stress between two successive rounds is below this value
    - min_abs_stress: iteration stops once the latest stress is below
    this value

    Raises RuntimeError when dimension >= (number of rows) - 1.
    """
    self.min_rel_improvement = min_rel_improvement
    self.min_abs_stress = min_abs_stress

    # NOTE(review): the message says "N-1 dimensions or fewer" but the
    # check also rejects dimension == N-1; behavior kept as is.
    if dimension >= len(dissimilarity_mtx) - 1:
        raise RuntimeError(
            "NMDS requires N-1 dimensions or fewer, "
            "where N is the number of samples, or rows in the dissim matrix"
            " got %s rows for a %s dimension NMDS"
            % (len(dissimilarity_mtx), dimension))

    if rand_seed is not None:
        seed(rand_seed)

    self.verbosity = verbosity
    num_points = len(dissimilarity_mtx)
    point_range = list(range(num_points))
    self.dimension = dimension
    self.optimization_method = optimization_method

    self._calc_dissim_order(dissimilarity_mtx, point_range)
    # sets self.order
    # note that in the rest of the code, only the order matters, the values
    # of the dissimilarity matrix aren't used

    # Only compare against the keywords when initial_pts really is a
    # string: comparing a numpy array with a str can yield an elementwise
    # boolean array, whose truth value is ambiguous and raises ValueError.
    named_init = isinstance(initial_pts, str)
    if named_init and initial_pts == "random":
        self.points = self._get_initial_pts(dimension, point_range)
    elif named_init and initial_pts == "pcoa":
        pcoa_pts, pcoa_eigs = principal_coordinates_analysis(
            dissimilarity_mtx)
        order = argsort(pcoa_eigs)[::-1]  # pos to small/neg
        pcoa_pts = pcoa_pts[order].T
        self.points = pcoa_pts[:, :dimension]
    else:
        # Caller-supplied starting points are used as given.
        self.points = initial_pts
    self.points = self._center(self.points)

    self._rescale()
    self._calc_distances()
    # dists relates to points, not to input data

    self._update_dhats()
    # dhats are constrained to be monotonic

    self._calc_stress()
    # self.stress is calculated from dists and dhats

    self.stresses = [self.stress]
    # stress is the metric of badness of fit used in this code
    # index 0 is the initial stress, with a initial set of
    # datapoints. index 1 corresponds to iteration 0 of the loop below

    if setup_only:
        return

    for i in range(max_iterations):
        if self.verbosity >= 1:
            print(("nonmetric broad iteration, stress: ", i,
                   self.stresses[-1]))

        if self.stresses[-1] < self.min_abs_stress:
            if self.verbosity >= 1:
                print("stress below cutoff, done")
            break
        self._move_points()
        self._calc_distances()
        self._update_dhats()
        self._calc_stress()
        self.stresses.append(self.stress)

        # Relative improvement of this round over the previous one.
        rel_improvement = ((self.stresses[-2] - self.stresses[-1])
                           / self.stresses[-2])
        if rel_improvement < self.min_rel_improvement:
            if self.verbosity >= 1:
                print("iteration improvement minimal. converged.")
            break

    # center and rotate the points, since pos, rotation is arbitrary
    # rotation is to align to principal axes of self.points
    self.points = self._center(self.points)
    u, s, _ = svd(self.points, full_matrices=False)
    S = diag(s)
    self.points = dot(u, S)
    # normalize the scaling, which should not change the stress
    self._rescale()
for i in range(len(distance_matrix)-1,-1,-1): if distance_matrix[i][0] == '#': del distance_matrix[i] #split each line by tabs distance_matrix = [i.split('\t') for i in distance_matrix] #convert each element to a number distance_matrix = array([[float(i) for i in j] for j in distance_matrix]) print distance_matrix else: #create distance matrix distance_matrix = dist_functions[dist_metric](ptmtx) o = open("distmtx.txt", 'w') o.write(distance_matrix) o.close(); print("1") aa = pcoa.principal_coordinates_analysis(distance_matrix) sample_coords = aa[0].transpose() sp_coords = species_coords(aa[0], ptmtx, dims=len(sample_coords[0])) * 3 print("1") evals = aa[1]/sum(aa[1]) #scale axes by eigenvalues sp_coords = sp_coords*array([list(evals)]*len(sp_coords)); sample_coords = sample_coords*array([list(evals)]*len(sample_coords)) o = open('sample_coords.txt', 'w') for i in sample_coords: for j in i: o.write(str(j) + '\t') o.write('\n')
for i in range(len(distance_matrix) - 1, -1, -1): if distance_matrix[i][0] == '#': del distance_matrix[i] #split each line by tabs distance_matrix = [i.split('\t') for i in distance_matrix] #convert each element to a number distance_matrix = array([[float(i) for i in j] for j in distance_matrix]) print distance_matrix else: #create distance matrix distance_matrix = dist_functions[dist_metric](ptmtx) o = open("distmtx.txt", 'w') o.write(distance_matrix) o.close() print("1") aa = pcoa.principal_coordinates_analysis(distance_matrix) sample_coords = aa[0].transpose() sp_coords = species_coords(aa[0], ptmtx, dims=len(sample_coords[0])) * 3 print("1") evals = aa[1] / sum(aa[1]) #scale axes by eigenvalues sp_coords = sp_coords * array([list(evals)] * len(sp_coords)) sample_coords = sample_coords * array([list(evals)] * len(sample_coords)) o = open('sample_coords.txt', 'w') for i in sample_coords: for j in i: o.write(str(j) + '\t') o.write('\n')
def __init__(self, dissimilarity_mtx, initial_pts="pcoa",
             dimension=2, rand_seed=None, optimization_method=1,
             verbosity=1, max_iterations=50, setup_only=False,
             min_rel_improvement = 1e-3, min_abs_stress = 1e-5):
    """Set up the NMDS state and iterate until the stress converges.

    Arguments:
    - dissimilarity_mtx: an n by n numpy float array representing the
    pairwise dissimilarity of items.  0 on diagonals, symmetric under
    (i,j) -> (j,i)
    - initial_pts: "random" => random starting points, "pcoa" => pts from
    pcoa, or a numpy 2d array, ncols = dimension
    - dimension: the desired dimension k of the constructed points
    - rand_seed: used for testing; when not None it is passed to seed()
    - optimization_method: used when points are adjusted to minimize
    stress:
    0 => justin k's ad hoc method of steepest descent
    1 => cogent's scipy_optimize fmin_bfgs
    - verbosity: values >= 1 print progress messages while iterating
    - max_iterations: upper bound on the number of optimization rounds
    - setup_only: if true, return right after the initial stress has been
    computed, without running any optimization round
    - min_rel_improvement: iteration stops once the relative decrease in
    stress between two successive rounds is below this value
    - min_abs_stress: iteration stops once the latest stress is below
    this value

    Raises RuntimeError when dimension >= (number of rows) - 1.
    """
    self.min_rel_improvement = min_rel_improvement
    self.min_abs_stress = min_abs_stress

    # NOTE(review): the message says "N-1 dimensions or fewer" but the
    # check also rejects dimension == N-1; behavior kept as is.
    if dimension >= len(dissimilarity_mtx) - 1:
        raise RuntimeError(
            "NMDS requires N-1 dimensions or fewer, "
            "where N is the number of samples, or rows in the dissim matrix"
            " got %s rows for a %s dimension NMDS"
            % (len(dissimilarity_mtx), dimension))

    if rand_seed is not None:
        seed(rand_seed)

    self.verbosity = verbosity
    num_points = len(dissimilarity_mtx)
    point_range = list(range(num_points))
    self.dimension = dimension
    self.optimization_method = optimization_method

    self._calc_dissim_order(dissimilarity_mtx, point_range)
    # sets self.order
    # note that in the rest of the code, only the order matters, the values
    # of the dissimilarity matrix aren't used

    # A numpy array compared with a str may produce an elementwise boolean
    # array, and testing that in an `if` raises ValueError. Match the
    # keyword values only when initial_pts is actually a string.
    is_keyword = isinstance(initial_pts, str)
    if is_keyword and initial_pts == "random":
        self.points = self._get_initial_pts(dimension, point_range)
    elif is_keyword and initial_pts == "pcoa":
        pcoa_pts, pcoa_eigs = principal_coordinates_analysis(
            dissimilarity_mtx)
        order = argsort(pcoa_eigs)[::-1]  # pos to small/neg
        pcoa_pts = pcoa_pts[order].T
        self.points = pcoa_pts[:, :dimension]
    else:
        # Caller-supplied starting points are used as given.
        self.points = initial_pts
    self.points = self._center(self.points)

    self._rescale()
    self._calc_distances()
    # dists relates to points, not to input data

    self._update_dhats()
    # dhats are constrained to be monotonic

    self._calc_stress()
    # self.stress is calculated from dists and dhats

    self.stresses = [self.stress]
    # stress is the metric of badness of fit used in this code
    # index 0 is the initial stress, with a initial set of
    # datapoints. index 1 corresponds to iteration 0 of the loop below

    if setup_only:
        return

    for i in range(max_iterations):
        if self.verbosity >= 1:
            print(("nonmetric broad iteration, stress: ", i,
                   self.stresses[-1]))

        if self.stresses[-1] < self.min_abs_stress:
            if self.verbosity >= 1:
                print("stress below cutoff, done")
            break
        self._move_points()
        self._calc_distances()
        self._update_dhats()
        self._calc_stress()
        self.stresses.append(self.stress)

        # Relative improvement of this round over the previous one.
        rel_improvement = ((self.stresses[-2] - self.stresses[-1])
                           / self.stresses[-2])
        if rel_improvement < self.min_rel_improvement:
            if self.verbosity >= 1:
                print("iteration improvement minimal. converged.")
            break

    # center and rotate the points, since pos, rotation is arbitrary
    # rotation is to align to principal axes of self.points
    self.points = self._center(self.points)
    u, s, _ = svd(self.points, full_matrices=False)
    S = diag(s)
    self.points = dot(u, S)
    # normalize the scaling, which should not change the stress
    self._rescale()
def generate_pcoa_file(distmtx, m_n_sample_ids, n_sample_ids, filepath):
    """Make PCoA-related file for D3.js drawings.

    Runs a principal coordinates analysis on ``distmtx``, draws one scatter
    plot per pair of the first three principal coordinates for each of the
    two groupings ("ecosystem" and "envo"), saves every plot as an SVG next
    to ``filepath`` and writes all plots, converted with
    ``mpld3.fig_to_dict``, as one JSON document to ``filepath``.

    :param distmtx: Numpy array matrix of distances
    :param m_n_sample_ids: List of strings containing m and n sample IDs
    :param n_sample_ids: List of strings containing user (or n) sample IDs
    :param filepath: Path of the JSON file to write; its name without the
        extension is also used as the prefix of the SVG files
    :raises ValueError: if a user sample ID is not in ``m_n_sample_ids``
    """
    coords, eigvals = ms.principal_coordinates_analysis(distmtx)
    pcnts = (np.abs(eigvals) / float(sum(np.abs(eigvals)))) * 100
    idxs_descending = pcnts.argsort()[::-1]
    coords = coords[idxs_descending]

    # Single-argument print() gives the same output on Python 2 and 3.
    print("{} {}".format('m_n_sample_ids' + str(len(m_n_sample_ids)),
                         m_n_sample_ids))
    print("{} {}".format("n_sample_ids" + str(len(n_sample_ids)),
                         n_sample_ids))

    # from google10c
    colormap = [
        "#3366cc", "#dc3912", "#ff9900", "#109618", "#990099", "#0099c6",
        "#dd4477", "#66aa00", "#b82e2e", "#316395", "#994499", "#22aa99",
        "#aaaa11", "#6633cc", "#e67300", "#8b0707", "#651067", "#329262",
        "#5574a6", "#3b3eac",
    ]

    tooltip_html = """
    Sample: {}<br>
    Ecosystem: {}<br>
    Envo ID: {}<br>
    Envo Term: {}<br>
    Study: {} <br>
    Study Source: {}
    """

    # Okay messy indexing coming up!
    # eco_samples_idx holds a list of sample indices for each ecosystem that
    # is queried (along with color), e.g.
    #   {("Biofilm", "grey"): [1, 12, ...], ("Soil", "gold"): [3, 16, ...]}
    # envo_samples_idx holds a list of sample indices for each envo that is
    # queried (along with color), e.g.
    #   {("ENVO:00009003", "blue"): [1, 12, ...],
    #    ("ENVO:00000073", "red"): [3, 16, ...]}
    # tooltip_htmls contains the html formatted string of the metadata for
    # each sample.
    # Metadata is a tuple of (title, ontology_ids, ontology_terms,
    # ecosystem, study_source, color).
    user_ids = set(n_sample_ids)
    m_sample_ids = [
        sample_id for sample_id in m_n_sample_ids
        if sample_id not in user_ids
    ]

    user_key = ("User Samples", "red")
    eco_samples_idx = {user_key: []}
    envo_samples_idx = {user_key: []}
    # Color already assigned to each envo term; mirrors the keys of
    # envo_samples_idx so a term always keeps its first color.
    envo_colors = {user_key[0]: user_key[1]}
    tooltip_htmls = [""] * len(m_n_sample_ids)

    # all other samples excluding user samples
    for sample_id in m_sample_ids:
        sample_idx = m_n_sample_ids.index(sample_id)
        metadata = query_pcoa_metadata(sample_id)
        tooltip_htmls[sample_idx] = tooltip_html.format(
            sample_id, metadata[3], ", ".join(metadata[1]),
            ", ".join(metadata[2]), metadata[0], metadata[4])

        eco_key = (metadata[3], metadata[5])

        # if the envo has already been encountered before, reuse its color
        # instead of taking the next one from the colormap
        envo_term = metadata[1][0]
        if envo_term not in envo_colors:
            color_id = len(envo_samples_idx)
            envo_colors[envo_term] = colormap[color_id % len(colormap)]
        envo_key = (envo_term, envo_colors[envo_term])

        eco_samples_idx.setdefault(eco_key, []).append(sample_idx)
        envo_samples_idx.setdefault(envo_key, []).append(sample_idx)

    # add user samples to eco_samples_idx, envo_samples_idx and tooltips
    for sample_id_j in n_sample_ids:
        sample_idx = m_n_sample_ids.index(sample_id_j)
        envo_samples_idx[user_key].append(sample_idx)
        eco_samples_idx[user_key].append(sample_idx)
        tooltip_htmls[sample_idx] = tooltip_html.format(
            sample_id_j, user_key[0], user_key[0], user_key[0], "", "")

    # PLOTTING
    plots = {}
    points = coords.T
    html_labels = np.array(tooltip_htmls)
    svg_prefix = os.path.splitext(filepath)[0]

    # loop through groupings
    for group in ["ecosystem", "envo"]:
        # Compare by value: `is` on string literals tests object identity.
        if group == "envo":
            group_samples_idx = envo_samples_idx
        else:
            group_samples_idx = eco_samples_idx
        non_user_group_samples_idx = [
            key for key in group_samples_idx if not key == user_key
        ]

        # loop through each top 3 pairs of principal coordinates
        for pc1, pc2 in itertools.combinations(range(3), 2):
            # start the plot
            fig, ax = plt.subplots()
            fig.set_figwidth(11)

            # plot all the points except for the users samples, they go
            # last; remember that the keys are in the format
            # ("Ecosystem/Envo", "Color")
            for key in non_user_group_samples_idx:
                members = group_samples_idx[key]
                ax.scatter(points[members, pc1],
                           points[members, pc2],
                           marker="o", label=key[0], color=key[1], alpha=1)

            # plot user samples
            user_members = group_samples_idx[user_key]
            ax.scatter(points[user_members, pc1],
                       points[user_members, pc2],
                       marker="*", s=96, label=user_key[0],
                       color=user_key[1], alpha=1)

            # draw PC axis labels
            ax.set_xlabel("PC%d" % (pc1 + 1))
            ax.set_ylabel("PC%d" % (pc2 + 1))
            ax.set_title("PCoA Plot grouped by %s" % (group.capitalize()))

            # adjust the plot abit for the legend for sample legend
            box = ax.get_position()
            ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

            # INTERACTIVITY
            # make interactive legends for sample groupings
            if group == "ecosystem":
                handles, legend_labels = ax.get_legend_handles_labels()
                interactive_legend = InteractiveLegendPlugin(
                    list(zip(handles, ax.collections)), legend_labels,
                    alpha_unsel=0.3, alpha_over=1, start_visible=False)
                mpld3.plugins.connect(fig, interactive_legend)

            # make interactive html labels for non-user samples first, since
            # ax.collections follows the order in which they were scattered
            for i, key in enumerate(non_user_group_samples_idx):
                tooltip = PointHTMLTooltip(
                    ax.collections[i],
                    labels=list(html_labels[group_samples_idx[key]]))
                mpld3.plugins.connect(fig, tooltip)

            # make interactive html labels for user samples
            tooltip = PointHTMLTooltip(
                ax.collections[-1],
                labels=list(html_labels[user_members]))
            mpld3.plugins.connect(fig, tooltip)

            plot_name = (pc1 + 1, pc2 + 1, group.capitalize())
            plots["PC%d%d%s" % plot_name] = mpld3.fig_to_dict(fig)

            svgfile = "%s_PC%s%s_%s.svg" % ((svg_prefix, ) + plot_name)
            print("Saving PCoA in %s" % svgfile)
            plt.savefig(svgfile)
            # Release the figure; otherwise every call leaves six open.
            plt.close(fig)

    # FINISH!
    with open(filepath, "w") as f_pcoa:
        json.dump(plots, f_pcoa)