def process(tree_string):
    """
    Report weighted embeddings of every vertex of a tree.

    Two mass vectors are used: one that spreads the mass over all vertices
    and one that puts all of the mass on the first nleaves entries.
    @param tree_string: a newick string
    @return: a multi-line string that summarizes the results
    """
    # NOTE: this changes numpy's process-wide print options.
    np.set_printoptions(linewidth=200)
    # build the newick tree from the string
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # get ordered names and ids
    ordered_ids, ordered_names = get_ordered_ids_and_names(tree)
    # get the distance matrix with ordered indices including all nodes in the tree
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    # define mass vectors
    m_uniform = np.array([1] * nvertices, dtype=float) / nvertices
    # NOTE(review): the nonzero masses are the first nleaves entries, which
    # presumes ordered_ids lists the leaves first -- confirm against
    # get_ordered_ids_and_names.
    m_degenerate_unscaled = [1] * nleaves + [0] * (nvertices - nleaves)
    m_degenerate = np.array(m_degenerate_unscaled, dtype=float) / nleaves
    # Collect the report line by line; joining the lines avoids the
    # Python-2-only "print >> stream" statement and yields the same text.
    lines = [
        'ordered names:',
        str(ordered_names),
        '',
        'embedded points with mass uniformly distributed among all vertices:',
        str(Euclid.edm_to_weighted_points(D, m_uniform)),
        '',
        'embedded points with mass uniformly distributed among the leaves:',
        str(Euclid.edm_to_weighted_points(D, m_degenerate)),
        '',
    ]
    # return the response
    return '\n'.join(lines).strip()
def process(tree_string):
    """
    Report weighted embeddings of every vertex of a tree.

    Two mass vectors are used: one that spreads the mass over all vertices
    and one that puts all of the mass on the first nleaves entries.
    @param tree_string: a newick string
    @return: a multi-line string that summarizes the results
    """
    # NOTE: this changes numpy's process-wide print options.
    np.set_printoptions(linewidth=200)
    # build the newick tree from the string
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # get ordered names and ids
    ordered_ids, ordered_names = get_ordered_ids_and_names(tree)
    # get the distance matrix with ordered indices including all nodes in the tree
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    # define mass vectors
    m_uniform = np.array([1] * nvertices, dtype=float) / nvertices
    # NOTE(review): the nonzero masses are the first nleaves entries, which
    # presumes ordered_ids lists the leaves first -- confirm against
    # get_ordered_ids_and_names.
    m_degenerate_unscaled = [1] * nleaves + [0] * (nvertices - nleaves)
    m_degenerate = np.array(m_degenerate_unscaled, dtype=float) / nleaves
    # Collect the report line by line; joining the lines avoids the
    # Python-2-only "print >> stream" statement and yields the same text.
    lines = [
        'ordered names:',
        str(ordered_names),
        '',
        'embedded points with mass uniformly distributed among all vertices:',
        str(Euclid.edm_to_weighted_points(D, m_uniform)),
        '',
        'embedded points with mass uniformly distributed among the leaves:',
        str(Euclid.edm_to_weighted_points(D, m_degenerate)),
        '',
    ]
    # return the response
    return '\n'.join(lines).strip()
def get_response_content(fs):
    """
    Compare several embeddings of the vertices of a tree.

    The report shows points derived from the leaf distance matrix, from a
    distance matrix augmented according to fs.ndups, from two explicit
    mass vectors, and from a rotation of the leaf-centered full embedding.
    @param fs: an object with tree_string, ndups, and show_aug attributes
    @return: a multi-line string in which every line ends with a newline
    """
    # build the newick tree from the string
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    ninternal = nvertices - nleaves
    # get ordered ids with the internal nodes first
    ordered_ids = get_ordered_ids(tree)
    leaf_ids = [id(node) for node in tree.gen_tips()]
    # get the distance matrix and the augmented distance matrix
    D_leaf = np.array(tree.get_partial_distance_matrix(leaf_ids))
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    D_aug = get_augmented_distance(D, nleaves, fs.ndups)
    # analyze the leaf distance matrix
    X_leaf = Euclid.edm_to_points(D_leaf)
    # get the eigendecomposition of the centered augmented distance matrix
    X_aug = Euclid.edm_to_points(D_aug, nvertices - 1)

    def points_for_leaf_mass(leaf_mass):
        # Internal vertices get unit mass and each leaf gets leaf_mass;
        # the vector is normalized to sum to one before embedding.
        m = [1] * ninternal + [leaf_mass] * nleaves
        m = np.array(m, dtype=float) / sum(m)
        return Euclid.edm_to_weighted_points(D, m)

    # explicitly compute the points for the given number of dups using weights
    X_weighted = points_for_leaf_mass(1 + fs.ndups)
    # explicitly compute the points for 10x dups
    X_weighted_10x = points_for_leaf_mass(1 + fs.ndups * 10)
    # explicitly compute the limiting points as the number of dups increases:
    # center the full embedding on the mean of the leaf rows (the last
    # nleaves rows) and rotate it onto the right singular vectors of those rows
    X = Euclid.edm_to_points(D)
    X -= np.mean(X[-nleaves:], axis=0)
    XL = X[-nleaves:]
    _, _, Vt = np.linalg.svd(XL)
    Z = np.dot(X, Vt.T)
    # report the results
    # NOTE: this changes numpy's process-wide print options.
    np.set_printoptions(linewidth=300, threshold=10000)
    # Collect the report line by line; joining the lines avoids the
    # Python-2-only "print >> stream" statement and yields the same text.
    lines = [
        'leaf distance matrix:',
        str(D_leaf),
        '',
        'points derived from the leaf distance matrix',
        '(the first column is proportional to the Fiedler vector):',
        str(X_leaf),
        '',
    ]
    # NOTE(review): the original layout was collapsed onto one line; the
    # conditional is taken to guard only the augmented matrix itself -- confirm.
    if fs.show_aug:
        lines.extend([
            'augmented distance matrix:',
            str(D_aug),
            '',
        ])
    lines.extend([
        'points derived from the augmented distance matrix',
        '(the first column is proportional to the Fiedler vector):',
        str(get_ugly_matrix(X_aug, ninternal, nleaves)),
        '',
        'points computed using masses:',
        str(X_weighted),
        '',
        'points computed using masses with 10x dups:',
        str(X_weighted_10x),
        '',
        'limiting points:',
        str(Z),
        '',
    ])
    # every line, including the last blank one, is newline-terminated
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Compare several embeddings of the vertices of a tree.

    The report shows points derived from the leaf distance matrix, from a
    distance matrix augmented according to fs.ndups, from two explicit
    mass vectors, and from a rotation of the leaf-centered full embedding.
    @param fs: an object with tree_string, ndups, and show_aug attributes
    @return: a multi-line string in which every line ends with a newline
    """
    # build the newick tree from the string
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    ninternal = nvertices - nleaves
    # get ordered ids with the internal nodes first
    ordered_ids = get_ordered_ids(tree)
    leaf_ids = [id(node) for node in tree.gen_tips()]
    # get the distance matrix and the augmented distance matrix
    D_leaf = np.array(tree.get_partial_distance_matrix(leaf_ids))
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    D_aug = get_augmented_distance(D, nleaves, fs.ndups)
    # analyze the leaf distance matrix
    X_leaf = Euclid.edm_to_points(D_leaf)
    # get the eigendecomposition of the centered augmented distance matrix
    X_aug = Euclid.edm_to_points(D_aug, nvertices - 1)

    def points_for_leaf_mass(leaf_mass):
        # Internal vertices get unit mass and each leaf gets leaf_mass;
        # the vector is normalized to sum to one before embedding.
        m = [1] * ninternal + [leaf_mass] * nleaves
        m = np.array(m, dtype=float) / sum(m)
        return Euclid.edm_to_weighted_points(D, m)

    # explicitly compute the points for the given number of dups using weights
    X_weighted = points_for_leaf_mass(1 + fs.ndups)
    # explicitly compute the points for 10x dups
    X_weighted_10x = points_for_leaf_mass(1 + fs.ndups * 10)
    # explicitly compute the limiting points as the number of dups increases:
    # center the full embedding on the mean of the leaf rows (the last
    # nleaves rows) and rotate it onto the right singular vectors of those rows
    X = Euclid.edm_to_points(D)
    X -= np.mean(X[-nleaves:], axis=0)
    XL = X[-nleaves:]
    _, _, Vt = np.linalg.svd(XL)
    Z = np.dot(X, Vt.T)
    # report the results
    # NOTE: this changes numpy's process-wide print options.
    np.set_printoptions(linewidth=300, threshold=10000)
    # Collect the report line by line; joining the lines avoids the
    # Python-2-only "print >> stream" statement and yields the same text.
    lines = [
        'leaf distance matrix:',
        str(D_leaf),
        '',
        'points derived from the leaf distance matrix',
        '(the first column is proportional to the Fiedler vector):',
        str(X_leaf),
        '',
    ]
    # NOTE(review): the original layout was collapsed onto one line; the
    # conditional is taken to guard only the augmented matrix itself -- confirm.
    if fs.show_aug:
        lines.extend([
            'augmented distance matrix:',
            str(D_aug),
            '',
        ])
    lines.extend([
        'points derived from the augmented distance matrix',
        '(the first column is proportional to the Fiedler vector):',
        str(get_ugly_matrix(X_aug, ninternal, nleaves)),
        '',
        'points computed using masses:',
        str(X_weighted),
        '',
        'points computed using masses with 10x dups:',
        str(X_weighted_10x),
        '',
        'limiting points:',
        str(Z),
        '',
    ])
    # every line, including the last blank one, is newline-terminated
    return '\n'.join(lines) + '\n'
def process():
    """
    Compare embeddings of a fixed six-vertex example with four leaves.

    The example distance matrices are Euclid.g_D_b (used as the leaf
    distances) and Euclid.g_D_c (used as the distances among all vertices).
    @return: a multi-line string that summarizes the results
    """
    # NOTE: this changes numpy's process-wide print options.
    np.set_printoptions(linewidth=200)
    # define a degenerate mass vector: all mass on the first four entries
    m_degenerate = np.array([0.25, 0.25, 0.25, 0.25, 0, 0])
    # define some distance matrices
    D_leaves = Euclid.g_D_b
    D_all = Euclid.g_D_c
    nleaves = 4

    def squared_distance_matrix(points):
        # pairwise squared Euclidean distances between the rows of points
        return np.array([
            [np.dot(pb - pa, pb - pa) for pa in points]
            for pb in points])

    # get the projection and the weighted multidimensional scaling
    X = Euclid.edm_to_points(D_all)
    Y = Euclid.edm_to_weighted_points(D_all, m_degenerate)
    D_X = squared_distance_matrix(X)
    D_Y = squared_distance_matrix(Y)
    # Collect the report line by line; joining the lines avoids the
    # Python-2-only "print >> stream" statement and yields the same text.
    lines = [
        'embedding of leaves from the leaf distance matrix:',
        str(Euclid.edm_to_points(D_leaves)),
        'projection of all vertices onto the MDS space of the leaves:',
        str(do_projection(D_all, nleaves)),
        'embedding of all vertices using uniform weights:',
        str(X),
        'corresponding distance matrix:',
        str(D_X),
        'embedding of all vertices using degenerate weights:',
        str(Y),
        'corresponding distance matrix:',
        str(D_Y),
    ]
    return '\n'.join(lines).strip()
def process():
    """
    Compare embeddings of a fixed six-vertex example with four leaves.

    The example distance matrices are Euclid.g_D_b (used as the leaf
    distances) and Euclid.g_D_c (used as the distances among all vertices).
    @return: a multi-line string that summarizes the results
    """
    # NOTE: this changes numpy's process-wide print options.
    np.set_printoptions(linewidth=200)
    # define a degenerate mass vector: all mass on the first four entries
    m_degenerate = np.array([0.25, 0.25, 0.25, 0.25, 0, 0])
    # define some distance matrices
    D_leaves = Euclid.g_D_b
    D_all = Euclid.g_D_c
    nleaves = 4

    def squared_distance_matrix(points):
        # pairwise squared Euclidean distances between the rows of points
        return np.array([
            [np.dot(pb - pa, pb - pa) for pa in points]
            for pb in points])

    # get the projection and the weighted multidimensional scaling
    X = Euclid.edm_to_points(D_all)
    Y = Euclid.edm_to_weighted_points(D_all, m_degenerate)
    D_X = squared_distance_matrix(X)
    D_Y = squared_distance_matrix(Y)
    # Collect the report line by line; joining the lines avoids the
    # Python-2-only "print >> stream" statement and yields the same text.
    lines = [
        'embedding of leaves from the leaf distance matrix:',
        str(Euclid.edm_to_points(D_leaves)),
        'projection of all vertices onto the MDS space of the leaves:',
        str(do_projection(D_all, nleaves)),
        'embedding of all vertices using uniform weights:',
        str(X),
        'corresponding distance matrix:',
        str(D_X),
        'embedding of all vertices using degenerate weights:',
        str(Y),
        'corresponding distance matrix:',
        str(D_Y),
    ]
    return '\n'.join(lines).strip()
def get_canonical_2d_mds(D, m, reference_points):
    """
    Compute a 2D weighted MDS whose axis reflections are not arbitrary.

    Only the first two axes of the weighted embedding are kept, and they
    are passed with the reference projection to reflect_to_reference.
    @param D: the full distance matrix
    @param m: the mass vector
    @param reference_points: a 2D reference projection of vertices of the tree
    @return: the result of reflect_to_reference on the 2D weighted MDS points
    """
    embedded = Euclid.edm_to_weighted_points(D, m)
    # keep the first two coordinate columns of every point
    first_two_axes = embedded[:, :2]
    return reflect_to_reference(first_two_axes, reference_points)
def get_canonical_3d_mds(D, m, reference_points):
    """
    Compute a 3D weighted MDS whose axis reflections are not arbitrary.

    Only the first three axes of the weighted embedding are kept, and they
    are multiplied by the value MatrixUtil.get_best_reflection returns for
    them and the reference projection.
    @param D: the full distance matrix
    @param m: the mass vector
    @param reference_points: a 3D reference projection of vertices of the tree
    @return: the 3D weighted MDS points times the chosen sign vector
    """
    embedded = Euclid.edm_to_weighted_points(D, m)
    # keep the first three coordinate columns of every point
    first_three_axes = embedded[:, :3]
    signs = MatrixUtil.get_best_reflection(first_three_axes, reference_points)
    # presumably one sign per axis, applied to each column -- TODO confirm
    return first_three_axes * signs
def process():
    """
    Explore several embeddings of a fixed six-vertex example with four leaves.

    The example distance matrices are Euclid.g_D_b (used as the leaf
    distances) and Euclid.g_D_c (used as the distances among all vertices).
    The report covers plain embeddings, a leaflet-augmented tree, two
    weighted embedding routines under three mass vectors, and a scratch
    computation based on singular value decompositions.
    @return: a multi-line string that summarizes the results
    """
    # NOTE: this changes numpy's process-wide print options.
    np.set_printoptions(linewidth=200)
    # Collect the report line by line; joining the lines avoids the
    # Python-2-only "print >> stream" statement and yields the same text.
    lines = []
    # define some distance matrices
    D_leaves = Euclid.g_D_b
    D_all = Euclid.g_D_c
    nvertices = 6
    nleaves = 4
    # define mass vectors
    m_degenerate = np.array([0.25, 0.25, 0.25, 0.25, 0, 0])
    m_interesting = np.array([.2, .2, .2, .2, .1, .1])
    m_uniform = np.ones(nvertices) / float(nvertices)
    # augment a distance matrix by adding leaflets
    D_augmented = add_leaflets(D_all, nleaves)
    # create the projection of points
    X_projected = do_projection(D_all, nleaves)
    lines.extend([
        # show some of the distance matrices
        'pairwise distances among vertices in the original tree:',
        str(D_all),
        'pairwise distance matrix augmented with one leaflet per leaf:',
        str(D_augmented),
        # show the embeddings for the three numbered cases
        'case 1: embedding of all vertices:',
        str(Euclid.edm_to_points(D_all)),
        'case 2: embedding of leaves and leaflets from the leaflet-augmented distance matrix:',
        str(Euclid.edm_to_points(D_augmented)),
        'case 3: projection of all vertices onto the MDS space of the leaves:',
        str(X_projected),
        # another embedding
        'embedding of leaves from the leaf distance matrix:',
        str(Euclid.edm_to_points(D_leaves)),
    ])
    # show embeddings of a tree augmented with leaflets
    lines.append('first few coordinates of the original vertices of the embedded tree with lots of leaflets per leaf:')
    D_super_augmented = D_all.copy()
    for _ in range(20):
        D_super_augmented = add_leaflets(D_super_augmented, nleaves)
    X_super = Euclid.edm_to_points(D_super_augmented)
    # first six rows and first three columns of the big embedding
    X_super_block_small = X_super[:6].T[:3].T
    lines.append(str(X_super_block_small))
    lines.append('ratio of coordinates of projected points to coordinates of this block of the embedding of the augmented tree:')
    lines.append(str(X_projected / X_super_block_small))
    # run both weighted embedding routines under each mass vector
    weighted_cases = (
        ('generalized case 1:', Euclid.edm_to_weighted_points, m_uniform),
        ('generalized case 2:', Euclid.edm_to_weighted_points, m_interesting),
        ('generalized case 3:', Euclid.edm_to_weighted_points, m_degenerate),
        ('eric formula case 1:', get_weighted_embedding_b, m_uniform),
        ('eric formula case 2:', get_weighted_embedding_b, m_interesting),
        ('eric formula case 3:', get_weighted_embedding_b, m_degenerate),
    )
    for title, embed, mass in weighted_cases:
        Z = embed(D_all, mass)
        lines.append(title)
        lines.append(str(Z))
    # test stuff
    lines.append('testing random stuff:')
    M = np.diag(np.sqrt(m_degenerate))
    cross_product_matrix = Euclid.edm_to_weighted_cross_product(
        D_all, m_degenerate)
    # only the singular values of the cross product matrix are reported
    _, S_cross, _ = np.linalg.svd(cross_product_matrix, full_matrices=False)
    Q = np.dot(M, np.dot(cross_product_matrix, M.T))
    U, B, _ = np.linalg.svd(Q, full_matrices=False)
    S = np.sqrt(np.diag(B))
    US = np.dot(U, S)
    M_pinv = np.linalg.pinv(M)
    # drop the last two columns of M_pinv and the last two rows of US;
    # presumably these match the two zero-mass vertices -- TODO confirm
    M_pinv_narrow = M_pinv.T[:-2].T
    US_short = US[:-2]
    # the reported values are the singular values returned by svd
    lines.append(' '.join((
        'eigenvalues of the abdi cross product:', str(S_cross))))
    lines.append(' '.join((
        'eigenvalues of the eric cross product:', str(B))))
    lines.append(str(M_pinv))
    lines.append(str(US))
    lines.append(str(M_pinv_narrow))
    lines.append(str(US_short))
    Z = np.dot(M_pinv_narrow, US_short)
    lines.append(str(Z))
    # return the response
    return '\n'.join(lines).strip()