def get_response_content(fs):
    """
    Report spectral summaries for a tree and for its augmented version.
    The report covers the tree Laplacian, its Schur complement,
    and the Laplacian of the augmented adjacency matrix.
    @param fs: an object with tree_string, ndups, and strength attributes
    @return: the multi-line text of the report
    """
    # parse the tree and count all of its vertices and its tips
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    vertex_count = len(list(tree.preorder()))
    leaf_count = len(list(tree.gen_tips()))
    # vertex ids, ordered with the leaves first
    ordered_ids = get_ordered_ids(tree)
    # adjacency matrix of the tree and its augmented counterpart
    adjacency = np.array(tree.get_affinity_matrix(ordered_ids))
    adjacency_aug = get_augmented_adjacency(
            adjacency, leaf_count, fs.ndups, fs.strength)
    # the corresponding Laplacian matrices
    laplacian = Euclid.adjacency_to_laplacian(adjacency)
    laplacian_aug = Euclid.adjacency_to_laplacian(adjacency_aug)
    # Schur complement with respect to the vertices listed after the leaves
    removed = set(range(leaf_count, vertex_count))
    schur = SchurAlgebra.mschur(laplacian, removed)
    schur_pinv = np.linalg.pinv(schur)
    schur_vals, schur_vecs = EigUtil.eigh(schur_pinv)
    # principal eigenvector scaled by the square root of its eigenvalue
    top_val, top_vec = EigUtil.principal_eigh(schur_pinv)
    schur_fiedler = top_vec * math.sqrt(top_val)
    # the same analysis for the augmented Laplacian
    aug_pinv = np.linalg.pinv(laplacian_aug)
    aug_vals, aug_vecs = EigUtil.eigh(aug_pinv)
    aug_top_val, aug_top_vec = EigUtil.principal_eigh(aug_pinv)
    aug_fiedler = aug_top_vec * math.sqrt(aug_top_val)
    # each report entry is (heading, value, number of trailing blank lines)
    np.set_printoptions(linewidth=300)
    report = (
            ('Laplacian matrix:', laplacian, 1),
            ('Schur complement of Laplacian matrix:', schur, 1),
            ('scaled Fiedler vector of Schur complement:',
                schur_fiedler, 1),
            ('eigenvalues of pinv of Schur complement:', schur_vals, 1),
            ('corresponding eigenvectors of pinv of Schur complement:',
                np.array(schur_vecs).T, 2),
            ('augmented Laplacian matrix:', laplacian_aug, 1),
            ('scaled Fiedler vector of augmented Laplacian:',
                aug_fiedler, 1),
            ('eigenvalues of pinv of augmented Laplacian:', aug_vals, 1),
            ('rows are eigenvectors of pinv of augmented Laplacian:',
                np.array(aug_vecs), 0),
            )
    out = StringIO()
    for heading, value, blank_count in report:
        print >> out, heading
        print >> out, value
        for _ in range(blank_count):
            print >> out
    return out.getvalue()
def get_response_content(fs):
    """
    Report spectral summaries for a tree and its augmented distance matrix.
    The report covers the Laplacian derived from the distance matrix,
    its Schur complement, and the centered augmented distance matrix.
    @param fs: an object with tree_string and ndups attributes
    @return: the multi-line text of the report
    """
    # parse the tree and count all of its vertices and its tips
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    vertex_count = len(list(tree.preorder()))
    leaf_count = len(list(tree.gen_tips()))
    # vertex ids, ordered with the leaves first
    ordered_ids = get_ordered_ids(tree)
    # distance matrix of the tree and its augmented counterpart
    distances = np.array(tree.get_partial_distance_matrix(ordered_ids))
    distances_aug = get_augmented_distance(distances, leaf_count, fs.ndups)
    # Laplacian derived from the distance matrix
    laplacian = Euclid.edm_to_laplacian(distances)
    # Schur complement with respect to the vertices listed after the leaves
    removed = set(range(leaf_count, vertex_count))
    schur = SchurAlgebra.mschur(laplacian, removed)
    schur_pinv = np.linalg.pinv(schur)
    schur_vals, schur_vecs = EigUtil.eigh(schur_pinv)
    # principal eigenvector scaled by the square root of its eigenvalue
    top_val, top_vec = EigUtil.principal_eigh(schur_pinv)
    schur_fiedler = top_vec * math.sqrt(top_val)
    # the same analysis for the centered augmented distance matrix
    aug_pinv = Euclid.edm_to_dccov(distances_aug)
    aug_vals, aug_vecs = EigUtil.eigh(aug_pinv)
    aug_top_val, aug_top_vec = EigUtil.principal_eigh(aug_pinv)
    aug_fiedler = aug_top_vec * math.sqrt(aug_top_val)
    # each report entry is (heading, value, number of trailing blank lines)
    np.set_printoptions(linewidth=300, threshold=10000)
    report = (
            ("Laplacian matrix:", laplacian, 1),
            ("Schur complement of Laplacian matrix:", schur, 1),
            ("scaled Fiedler vector of Schur complement:",
                schur_fiedler, 1),
            ("eigenvalues of pinv of Schur complement:", schur_vals, 1),
            ("corresponding eigenvectors of pinv of Schur complement:",
                np.array(schur_vecs).T, 2),
            ("augmented distance matrix:", distances_aug, 1),
            ("scaled Fiedler vector of augmented Laplacian limit:",
                aug_fiedler, 1),
            ("eigenvalues of pinv of augmented Laplacian limit:",
                aug_vals, 1),
            ("rows are eigenvectors of pinv of augmented Laplacian limit:",
                np.array(aug_vecs), 0),
            )
    out = StringIO()
    for heading, value, blank_count in report:
        print >> out, heading
        print >> out, value
        for _ in range(blank_count):
            print >> out
    return out.getvalue()
def get_eval_evec_pairs(C_full, diploid_and_biallelic):
    """
    Get the eigendecomposition of the sample covariance of a count matrix.
    Input rows are OTUs and columns are loci.
    Each element of the input data is a count.
    Invariant columns are dropped and the remaining columns are centered.
    When the data is flagged as diploid and biallelic each centered column
    is also divided by sqrt(p * (1 - p)) where p is half of the column mean.
    @param C_full: matrix of float counts where each row represents an OTU
    @param diploid_and_biallelic: a flag
    @return: (eigenvalues, eigenvectors) as given by EigUtil.eigh
    @raise ValueError: if the matrix is not two dimensional,
    if a count exceeds two for diploid data,
    or if every column is invariant
    """
    if C_full.ndim != 2:
        raise ValueError('the count matrix should be two dimensional')
    # check compatibility of counts and ploidy
    if diploid_and_biallelic:
        if np.max(C_full) > 2:
            raise ValueError(
                    'no count should be greater than two for diploid data')
    # remove invariant columns, failing clearly when none are left
    # instead of surfacing the opaque error from stacking an empty list
    informative = [col for col in C_full.T if len(set(col)) > 1]
    if not informative:
        raise ValueError(
                'the count matrix has no informative columns '
                'because every column is invariant')
    C = np.vstack(informative).T
    # the number of informative columns
    n = C.shape[1]
    # center each column on its mean
    u = C.mean(axis=0)
    M = C - u
    # normalize if diploid and biallelic
    if diploid_and_biallelic:
        p = u / 2
        M /= np.sqrt(p * (1 - p))
    # construct the sample covariance matrix
    # FIXME this should probably use a singular value decomposition instead
    X = np.dot(M, M.T) / n
    # get the eigendecomposition of the covariance matrix
    return EigUtil.eigh(X)
def get_grant_proposal_points_b(lfdi):
    """
    Compute points from the leaf block and impute the remaining points.
    @param lfdi: an object with M, p, and q attributes
    @return: the first three coordinate axes of the stacked points
    @raise ValueError: if the eigendecomposition does not reconstruct
    the leaf block or if the matrix shapes are incompatible
    """
    M = lfdi.M
    p = lfdi.p
    q = lfdi.q
    gower = -.5 * M
    GQ, GX, GXT, GP = ProofDecoration.get_corners(gower, q, p)
    # eigendecomposition of the leaf-only Gower matrix
    eigvals, eigvecs = EigUtil.eigh(GQ)
    spectrum = np.diag(eigvals)
    U = np.vstack(eigvecs).T
    # sanity check that the decomposition reproduces the leaf block
    reconstructed = np.dot(np.dot(U, spectrum), U.T)
    if not np.allclose(reconstructed, GQ):
        raise ValueError('eigenfail')
    root = np.diag(np.sqrt(eigvals))
    leaf_points = np.dot(U, root)
    # impute the internal points
    root_pinv = np.linalg.pinv(root)
    try:
        internal_points = np.dot(np.dot(GX.T, U), root_pinv)
    except ValueError:
        # report the shapes that failed to multiply
        shapes = (GX.shape, U.shape, root_pinv.shape)
        raise ValueError(', '.join(map(str, shapes)))
    # stack all points and pick out the first three axes
    axes = np.vstack([leaf_points, internal_points]).T
    return axes[0], axes[1], axes[2]
def get_grant_proposal_points_b(lfdi):
    """
    Compute points from the leaf block and impute the remaining points.
    @param lfdi: an object with M, p, and q attributes
    @return: an array whose rows are points limited to the first two axes
    @raise ValueError: if the eigendecomposition does not reconstruct
    the leaf block or if the matrix shapes are incompatible
    """
    M = lfdi.M
    p = lfdi.p
    q = lfdi.q
    gower = -.5 * M
    GQ, GX, GXT, GP = ProofDecoration.get_corners(gower, q, p)
    # eigendecomposition of the leaf-only Gower matrix
    eigvals, eigvecs = EigUtil.eigh(GQ)
    spectrum = np.diag(eigvals)
    U = np.vstack(eigvecs).T
    # sanity check that the decomposition reproduces the leaf block
    reconstructed = np.dot(np.dot(U, spectrum), U.T)
    if not np.allclose(reconstructed, GQ):
        raise ValueError('eigenfail')
    root = np.diag(np.sqrt(eigvals))
    leaf_points = np.dot(U, root)
    # impute the internal points
    root_pinv = np.linalg.pinv(root)
    try:
        internal_points = np.dot(np.dot(GX.T, U), root_pinv)
    except ValueError:
        # report the shapes that failed to multiply
        shapes = (GX.shape, U.shape, root_pinv.shape)
        raise ValueError(', '.join(map(str, shapes)))
    # stack all points and keep only the first two coordinates
    stacked = np.vstack([leaf_points, internal_points])
    return stacked[:, :2]
def do_pca(hud_lines):
    """
    Decode a .hud file and compute eigenvalue-scaled eigenvectors.
    @param hud_lines: lines of a .hud file
    @return: names, scaled vectors
    """
    # the decoder gives the ordered names and the rows of counts
    names, data = hud.decode(hud_lines)
    counts_full = np.array(data)
    # unpacking the shape requires the decoded data to be two dimensional
    nrows_full, ncols_full = counts_full.shape
    # keep only the columns that take more than one value
    informative = []
    for column in counts_full.T:
        if len(set(column)) > 1:
            informative.append(column)
    counts = np.vstack(informative).T
    nrows, ncols = counts.shape
    # center each column and divide by sqrt(mean * (1 - mean))
    means = counts.mean(axis=0)
    spread = np.sqrt(means * (1 - means))
    normalized = (counts - means) / spread
    # sample covariance matrix and its eigendecomposition
    covariance = np.dot(normalized, normalized.T) / ncols
    evals, evecs = EigUtil.eigh(covariance)
    # scale each eigenvector by its eigenvalue
    pcs = []
    for evalue, evector in zip(evals, evecs):
        pcs.append(evalue * evector)
    return names, pcs
def do_pca(hud_lines):
    """
    Decode a .hud file and compute eigenvalue-scaled eigenvectors.
    @param hud_lines: lines of a .hud file
    @return: names, scaled vectors
    """
    # the decoder gives the ordered names and the rows of counts
    names, data = hud.decode(hud_lines)
    counts_full = np.array(data)
    # unpacking the shape requires the decoded data to be two dimensional
    nrows_full, ncols_full = counts_full.shape
    # keep only the columns that take more than one value
    informative = []
    for column in counts_full.T:
        if len(set(column)) > 1:
            informative.append(column)
    counts = np.vstack(informative).T
    nrows, ncols = counts.shape
    # center each column and divide by sqrt(mean * (1 - mean))
    means = counts.mean(axis=0)
    spread = np.sqrt(means * (1 - means))
    normalized = (counts - means) / spread
    # sample covariance matrix and its eigendecomposition
    covariance = np.dot(normalized, normalized.T) / ncols
    evals, evecs = EigUtil.eigh(covariance)
    # scale each eigenvector by its eigenvalue
    pcs = []
    for evalue, evector in zip(evals, evecs):
        pcs.append(evalue * evector)
    return names, pcs
def get_response_content(fs):
    """
    Compare path Laplacian eigenvectors with sinusoidal approximations.
    For each requested axis the report has the scaled eigenvector,
    the approximation, their elementwise ratio, and a blank line.
    @param fs: an object with nvertices and naxes attributes
    @return: the multi-line text of the report
    @raise ValueError: if more axes are requested than the vertices allow
    """
    N = fs.nvertices
    if N < fs.naxes + 1:
        raise ValueError(
                'attempting to plot too many eigenvectors '
                'for the given number of vertices')
    # eigendecomposition of the path Laplacian
    laplacian = create_laplacian_matrix(N)
    eigvals, eigvecs = EigUtil.eigh(laplacian)
    # drop the last pair and reverse the order of the others
    eigvals = eigvals[:-1][::-1]
    eigvecs = eigvecs[:-1][::-1]
    np.set_printoptions(linewidth=200, threshold=10000)
    out = StringIO()
    scale = math.sqrt(N * 0.5)
    for n in range(1, fs.naxes + 1):
        # the eigenvalue is looked up but does not enter the report
        eigval = eigvals[n - 1]
        scaled = eigvecs[n - 1] * scale
        approximation = []
        for k in range(N):
            approximation.append(sinusoidal_approximation_b(N, n, k))
        approximation = np.array(approximation)
        print >> out, scaled
        print >> out, approximation
        print >> out, scaled / approximation
        print >> out
    return out.getvalue()
def get_response_content(fs):
    """
    Draw sampled points colored by one eigenvector of a graph Laplacian.
    Points are sampled inside the polygon g_africa_poly and connected
    by the edges that get_mst returns, weighted by inverse edge length.
    @param fs: an object with seed, npoints, axis, total_width,
    total_height, border, and imageformat attributes
    @return: the image string produced by the ImgHelper object
    @raise ValueError: if the requested axis is negative or too large
    @raise HandlingError: if the drawable area is empty or drawing fails
    """
    # seed the random module only when a truthy seed is supplied
    if fs.seed:
        random.seed(fs.seed)
    # cap on the number of rejection sampling iterations
    max_iterations = fs.npoints * 100
    if fs.axis < 0:
        raise ValueError('the mds axis must be nonnegative')
    # the boundary polygon is a cycle over its own vertices
    nafrica = len(g_africa_poly)
    africa_edges = []
    for index in range(nafrica):
        africa_edges.append((index, (index + 1) % nafrica))
    # sample interior points and triangulate them
    points = sample_with_rejection(fs.npoints, g_africa_poly, max_iterations)
    x_list, y_list = zip(*points)
    tri = Triangulation(x_list, y_list)
    # shift triangulation indices past the boundary vertices
    tri_edges = []
    for i, j in tri.edge_db.tolist():
        tri_edges.append((i + nafrica, j + nafrica))
    allpoints = g_africa_poly + points
    # filter the edges and then reduce them with get_mst
    tri_edges = list(gen_noncrossing_edges(tri_edges, africa_edges, allpoints))
    tri_edges = get_mst(tri_edges, allpoints)
    alledges = africa_edges + tri_edges
    # symmetric affinity matrix over the sampled points only
    npoints = len(points)
    affinity = np.zeros((npoints, npoints))
    for ia, ib in tri_edges:
        xa, ya = allpoints[ia]
        xb, yb = allpoints[ib]
        weight = 1 / math.hypot(xb - xa, yb - ya)
        row = ia - nafrica
        col = ib - nafrica
        affinity[row, col] = weight
        affinity[col, row] = weight
    laplacian = Euclid.adjacency_to_laplacian(affinity)
    ws, vs = EigUtil.eigh(np.linalg.pinv(laplacian))
    if fs.axis >= len(ws):
        raise ValueError('choose a smaller mds axis')
    v = vs[fs.axis]
    # rescale in place so the largest magnitude is one
    v /= max(np.abs(v))
    # boundary vertices are black and small, sampled points are colored
    colors = [(0, 0, 0)] * nafrica
    colors = colors + [get_color(value) for value in v]
    radii = [2] * nafrica + [5] * npoints
    # the drawable area excludes the border on each side
    width = fs.total_width - 2 * fs.border
    height = fs.total_height - 2 * fs.border
    if not (width >= 1 and height >= 1):
        raise HandlingError(
                'the image dimensions do not allow for enough drawable area')
    # draw the image
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    try:
        helper = ImgHelper(
                allpoints, alledges,
                fs.total_width, fs.total_height, fs.border)
        return helper.get_image_string(colors, radii, ext)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def get_response_content(fs):
    """
    Draw sampled points colored by one eigenvector of a graph Laplacian.
    Points are sampled inside the polygon g_africa_poly and connected
    by the edges that get_mst returns, weighted by inverse edge length.
    @param fs: an object with seed, npoints, axis, total_width,
    total_height, border, and imageformat attributes
    @return: the image string produced by the ImgHelper object
    @raise ValueError: if the requested axis is negative or too large
    @raise HandlingError: if the drawable area is empty or drawing fails
    """
    # seed the random module only when a truthy seed is supplied
    if fs.seed:
        random.seed(fs.seed)
    # cap on the number of rejection sampling iterations
    max_iterations = fs.npoints * 100
    if fs.axis < 0:
        raise ValueError("the mds axis must be nonnegative")
    # the boundary polygon is a cycle over its own vertices
    nafrica = len(g_africa_poly)
    africa_edges = []
    for index in range(nafrica):
        africa_edges.append((index, (index + 1) % nafrica))
    # sample interior points and triangulate them
    points = sample_with_rejection(fs.npoints, g_africa_poly, max_iterations)
    x_list, y_list = zip(*points)
    tri = Triangulation(x_list, y_list)
    # shift triangulation indices past the boundary vertices
    tri_edges = []
    for i, j in tri.edge_db.tolist():
        tri_edges.append((i + nafrica, j + nafrica))
    allpoints = g_africa_poly + points
    # filter the edges and then reduce them with get_mst
    tri_edges = list(gen_noncrossing_edges(tri_edges, africa_edges, allpoints))
    tri_edges = get_mst(tri_edges, allpoints)
    alledges = africa_edges + tri_edges
    # symmetric affinity matrix over the sampled points only
    npoints = len(points)
    affinity = np.zeros((npoints, npoints))
    for ia, ib in tri_edges:
        xa, ya = allpoints[ia]
        xb, yb = allpoints[ib]
        weight = 1 / math.hypot(xb - xa, yb - ya)
        row = ia - nafrica
        col = ib - nafrica
        affinity[row, col] = weight
        affinity[col, row] = weight
    laplacian = Euclid.adjacency_to_laplacian(affinity)
    ws, vs = EigUtil.eigh(np.linalg.pinv(laplacian))
    if fs.axis >= len(ws):
        raise ValueError("choose a smaller mds axis")
    v = vs[fs.axis]
    # rescale in place so the largest magnitude is one
    v /= max(np.abs(v))
    # boundary vertices are black and small, sampled points are colored
    colors = [(0, 0, 0)] * nafrica
    colors = colors + [get_color(value) for value in v]
    radii = [2] * nafrica + [5] * npoints
    # the drawable area excludes the border on each side
    width = fs.total_width - 2 * fs.border
    height = fs.total_height - 2 * fs.border
    if not (width >= 1 and height >= 1):
        raise HandlingError(
                "the image dimensions do not allow for enough drawable area")
    # draw the image
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    try:
        helper = ImgHelper(
                allpoints, alledges,
                fs.total_width, fs.total_height, fs.border)
        return helper.get_image_string(colors, radii, ext)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def main(fs):
    """
    Draw a contour plot of one eigenvector of a graph Laplacian.
    Points are sampled inside the polygon g_africa_poly and connected
    by the edges that get_mst returns, weighted by inverse edge length.
    @param fs: an object with seed, npoints, axis, total_width,
    total_height, and border attributes
    @raise ValueError: if the requested axis is negative or too large
    """
    # seed the random module only when a truthy seed is supplied
    if fs.seed:
        random.seed(fs.seed)
    # cap on the number of rejection sampling iterations
    max_iterations = fs.npoints * 100
    if fs.axis < 0:
        raise ValueError('the mds axis must be nonnegative')
    # the boundary polygon is a cycle over its own vertices
    nafrica = len(g_africa_poly)
    africa_edges = []
    for index in range(nafrica):
        africa_edges.append((index, (index + 1) % nafrica))
    # sample interior points and triangulate them
    points = sample_with_rejection(fs.npoints, g_africa_poly, max_iterations)
    x_list, y_list = zip(*points)
    tri = Triangulation(x_list, y_list)
    # shift triangulation indices past the boundary vertices
    tri_edges = []
    for i, j in tri.edge_db.tolist():
        tri_edges.append((i + nafrica, j + nafrica))
    allpoints = g_africa_poly + points
    # filter the edges and then reduce them with get_mst
    tri_edges = list(gen_noncrossing_edges(tri_edges, africa_edges, allpoints))
    tri_edges = get_mst(tri_edges, allpoints)
    alledges = africa_edges + tri_edges
    # symmetric affinity matrix over the sampled points only
    npoints = len(points)
    affinity = np.zeros((npoints, npoints))
    for ia, ib in tri_edges:
        xa, ya = allpoints[ia]
        xb, yb = allpoints[ib]
        weight = 1 / math.hypot(xb - xa, yb - ya)
        row = ia - nafrica
        col = ib - nafrica
        affinity[row, col] = weight
        affinity[col, row] = weight
    laplacian = Euclid.adjacency_to_laplacian(affinity)
    ws, vs = EigUtil.eigh(np.linalg.pinv(laplacian))
    if fs.axis >= len(ws):
        raise ValueError('choose a smaller mds axis')
    v = vs[fs.axis]
    # rescale in place so the largest magnitude is one
    v /= max(np.abs(v))
    # draw the picture
    helper = ImgHelper(
            allpoints, alledges,
            fs.total_width, fs.total_height, fs.border)
    helper.draw_contour_plot(v, nafrica)
def main(fs):
    """
    Draw a contour plot of one eigenvector of a graph Laplacian.
    Points are sampled inside the polygon g_africa_poly and connected
    by the edges that get_mst returns, weighted by inverse edge length.
    @param fs: an object with seed, npoints, axis, total_width,
    total_height, and border attributes
    @raise ValueError: if the requested axis is negative or too large
    """
    # seed the random module only when a truthy seed is supplied
    if fs.seed:
        random.seed(fs.seed)
    # cap on the number of rejection sampling iterations
    max_iterations = fs.npoints * 100
    if fs.axis < 0:
        raise ValueError("the mds axis must be nonnegative")
    # the boundary polygon is a cycle over its own vertices
    nafrica = len(g_africa_poly)
    africa_edges = []
    for index in range(nafrica):
        africa_edges.append((index, (index + 1) % nafrica))
    # sample interior points and triangulate them
    points = sample_with_rejection(fs.npoints, g_africa_poly, max_iterations)
    x_list, y_list = zip(*points)
    tri = Triangulation(x_list, y_list)
    # shift triangulation indices past the boundary vertices
    tri_edges = []
    for i, j in tri.edge_db.tolist():
        tri_edges.append((i + nafrica, j + nafrica))
    allpoints = g_africa_poly + points
    # filter the edges and then reduce them with get_mst
    tri_edges = list(gen_noncrossing_edges(tri_edges, africa_edges, allpoints))
    tri_edges = get_mst(tri_edges, allpoints)
    alledges = africa_edges + tri_edges
    # symmetric affinity matrix over the sampled points only
    npoints = len(points)
    affinity = np.zeros((npoints, npoints))
    for ia, ib in tri_edges:
        xa, ya = allpoints[ia]
        xb, yb = allpoints[ib]
        weight = 1 / math.hypot(xb - xa, yb - ya)
        row = ia - nafrica
        col = ib - nafrica
        affinity[row, col] = weight
        affinity[col, row] = weight
    laplacian = Euclid.adjacency_to_laplacian(affinity)
    ws, vs = EigUtil.eigh(np.linalg.pinv(laplacian))
    if fs.axis >= len(ws):
        raise ValueError("choose a smaller mds axis")
    v = vs[fs.axis]
    # rescale in place so the largest magnitude is one
    v /= max(np.abs(v))
    # draw the picture
    helper = ImgHelper(
            allpoints, alledges,
            fs.total_width, fs.total_height, fs.border)
    helper.draw_contour_plot(v, nafrica)
def process(args, hud_lines):
    """
    Summarize the principal component structure of a .hud file as text.
    @param args: an object with sum_to_n and sum_to_1 flags
    @param hud_lines: lines of a .hud file
    @return: results in convenient text form
    """
    placeholder = '[sample is too degenerate for estimation]'
    out = StringIO()
    # the decoder gives the ordered names and the rows of counts
    names, data = hud.decode(hud_lines)
    counts_full = np.array(data)
    m_full, n_full = counts_full.shape
    # keep only the columns that take more than one value
    informative = []
    for column in counts_full.T:
        if len(set(column)) > 1:
            informative.append(column)
    counts = np.vstack(informative).T
    m, n = counts.shape
    # center each column and divide by sqrt(mean * (1 - mean))
    means = counts.mean(axis=0)
    spread = np.sqrt(means * (1 - means))
    normalized = (counts - means) / spread
    # sample covariance matrix and its eigendecomposition
    covariance = np.dot(normalized, normalized.T) / n
    evals, evecs = EigUtil.eigh(covariance)
    L1 = evals.sum()
    L2 = np.dot(evals, evals)
    proportion = evals[0] / L1
    # Tracy-Widom statistic from the relative size of the first eigenvalue
    x = get_tracy_widom_statistic(m, n, m * proportion)
    # linkage correction
    n_prime = ((m + 1) * L1 * L1) / ((m - 1) * L2 - L1 * L1)
    corrected = n_prime < n
    # detect additional structure using alpha level of 0.05
    crit = 0.9794
    if corrected:
        x_prime = get_tracy_widom_statistic(
                m, n_prime, (m - 1) * proportion)
        sigs, insig = get_corrected_structure(crit, evals, m, n_prime)
        n_prime_report = n_prime
        x_prime_report = x_prime
    else:
        sigs, insig = get_corrected_structure(crit, evals, m, n)
        n_prime_report = placeholder
        x_prime_report = placeholder
    # sections made of a heading, a single value, and a blank line
    sections = (
            ('number of isolates:', m_full),
            ('total number of SNPs:', n_full),
            ('number of informative SNPs:', n),
            ('effective number of linkage-corrected SNPs:', n_prime_report),
            ('Tracy-Widom statistic (linkage-naive):', x),
            ('Tracy-Widom statistic (linkage-corrected):', x_prime_report),
            ('proportion of variance explained by principal axis:',
                proportion),
            ('number of significant axes of variation:', len(sigs)),
            )
    for heading, value in sections:
        print >> out, heading
        print >> out, value
        print >> out
    print >> out, 'significant Tracy-Widom statistics:'
    for sig in sigs:
        print >> out, sig
    print >> out
    print >> out, 'first insignificant Tracy-Widom statistic:'
    print >> out, insig
    print >> out
    # loadings on the principal axis, sorted by increasing loading
    print >> out, 'principal axis projection:'
    loadings = evecs[0] * evals[0]
    for loading, name in sorted(zip(loadings, names)):
        print >> out, '\t'.join([name, str(loading)])
    print >> out
    # optionally list the eigenvalues under one of two normalizations
    evals_sum = sum(evals)
    if args.sum_to_n:
        print >> out, 'eigenvalues normalized to sum to the number of OTUs:'
        for w in evals:
            print >> out, m_full * w / float(evals_sum)
    elif args.sum_to_1:
        print >> out, 'eigenvalues normalized to sum to 1.0:'
        for w in evals:
            print >> out, w / float(evals_sum)
    return out.getvalue().rstrip()
def get_response_content(fs):
    """
    Report boundary properties of one eigenvector of a three-branch tree.
    The graph has one hub vertex plus fs.lena + fs.lenb + fs.lenc more
    vertices, and every branch shares the hub entry of the eigenvector.
    For each branch the report gives the value and finite difference
    estimates of the first and second derivatives at both branch ends.
    @param fs: an object with lena, lenb, lenc, eigk, sparse, showv,
    and showmatrix attributes
    @return: the multi-line text of the report
    @raise ValueError: if the eigenvector index is too large for the
    graph or if the tree has no length
    """
    # define the number of nodes
    N = 1 + fs.lena + fs.lenb + fs.lenc
    # check input compatibility
    if not (fs.eigk + 1 <= N):
        raise ValueError(
                'attempting to find a too highly indexed eigenvector '
                'for the number of vertices in the graph')
    if N < 2:
        raise ValueError('the tree has no length')
    # grid spacing when the total branch length N - 1 is scaled to one
    h = 1 / float(N - 1)
    # construct the studded tree Laplacian matrix
    if fs.sparse:
        v0 = np.ones(N, dtype=float)
        L_csr = create_laplacian_csr_matrix(fs.lena, fs.lenb, fs.lenc)
        arpack_k = fs.eigk + 1
        ncv = 3 * arpack_k + 3
        ws, vs = scipy.sparse.linalg.eigsh(
                L_csr, arpack_k, which='SM', v0=v0,
                ncv=ncv, return_eigenvectors=True)
        # skip the first eigenpair and put the eigenvectors in rows
        ws = ws[1:]
        vs = vs.T[1:]
    else:
        L = create_laplacian_matrix(fs.lena, fs.lenb, fs.lenc)
        ws, vs = EigUtil.eigh(L)
        # drop the last eigenpair and reverse the order of the others
        ws = ws[:-1][::-1]
        vs = vs[:-1][::-1]
    # get the eigenvector of interest
    eigenvalue = ws[fs.eigk - 1]
    v = vs[fs.eigk - 1]
    # the branches occupy consecutive slices after the hub entry v[0]
    widths = (fs.lena, fs.lenb, fs.lenc)
    binfos = [BranchInfo() for i in range(3)]
    offset = 1
    for i, binfo in enumerate(binfos):
        binfo.k = i + 1
        binfo.width = widths[i]
        # the branch component starts at the hub and runs to the pendant end
        w = np.array([v[0]] + v[offset:offset + binfo.width].tolist())
        offset += binfo.width
        # boundary values and finite differences at both ends of the branch
        npoints = len(w)
        if npoints >= 1:
            binfo.p0 = w[0]
            binfo.q0 = w[-1]
        if npoints >= 2:
            binfo.p1 = (w[1] - w[0]) / h
            binfo.q1 = (w[-1] - w[-2]) / h
        if npoints >= 3:
            binfo.p2 = (w[0] - 2 * w[1] + w[2]) / (h * h)
            binfo.q2 = (w[-3] - 2 * w[-2] + w[-1]) / (h * h)
    # begin writing the report
    np.set_printoptions(linewidth=200, threshold=10000)
    out = StringIO()
    # summarize global properties
    print >> out, 'total branch length:'
    print >> out, N - 1
    print >> out
    print >> out, 'total number of graph vertices including degree 2 vertices:'
    print >> out, N
    print >> out
    # sum the first derivatives near the hub;
    # N is at least two here so the sum is always defined
    p1sum = 0
    for binfo in binfos:
        if binfo.p1:
            p1sum += binfo.p1
    p1sum_string = str(p1sum)
    print >> out, "sum of f'(x) on all branches near the hub:", p1sum_string
    print >> out
    # summarize properties per branch per eigenvector
    for binfo in binfos:
        print >> out, 'summary of eigenvector', fs.eigk, 'on branch', binfo.k
        print >> out, 'unscaled branch length:', binfo.width
        if binfo.width:
            print >> out, 'internal', '-' * binfo.width, 'pendant'
        print >> out, "internal f(x): ", value_to_string(binfo.p0)
        print >> out, "internal f'(x): ", value_to_string(binfo.p1)
        print >> out, "internal f''(x):", value_to_string(binfo.p2)
        print >> out, "pendant f(x): ", value_to_string(binfo.q0)
        print >> out, "pendant f'(x): ", value_to_string(binfo.q1)
        print >> out, "pendant f''(x):", value_to_string(binfo.q2)
        print >> out
    if fs.showv:
        print >> out, 'the eigenvalue:'
        print >> out, eigenvalue
        print >> out
        print >> out, 'the whole eigenvector:'
        print >> out, v
        print >> out
    if fs.showmatrix:
        if fs.sparse:
            print >> out, 'Laplacian matrix (from sparse internal repr):'
            print >> out, L_csr.toarray()
            print >> out
        else:
            print >> out, 'Laplacian matrix (from dense internal repr):'
            print >> out, L
            print >> out
    return out.getvalue()
def get_response_content(fs):
    """
    Report boundary properties of one eigenvector of a three-branch tree.
    The graph has one hub vertex plus fs.lena + fs.lenb + fs.lenc more
    vertices, and every branch shares the hub entry of the eigenvector.
    For each branch the report gives the value and finite difference
    estimates of the first and second derivatives at both branch ends.
    @param fs: an object with lena, lenb, lenc, eigk, sparse, showv,
    and showmatrix attributes
    @return: the multi-line text of the report
    @raise ValueError: if the eigenvector index is too large for the
    graph or if the tree has no length
    """
    # define the number of nodes
    N = 1 + fs.lena + fs.lenb + fs.lenc
    # check input compatibility
    if not (fs.eigk + 1 <= N):
        raise ValueError('attempting to find a too highly indexed eigenvector '
                         'for the number of vertices in the graph')
    if N < 2:
        raise ValueError('the tree has no length')
    # grid spacing when the total branch length N - 1 is scaled to one
    h = 1 / float(N - 1)
    # construct the studded tree Laplacian matrix
    if fs.sparse:
        v0 = np.ones(N, dtype=float)
        L_csr = create_laplacian_csr_matrix(fs.lena, fs.lenb, fs.lenc)
        arpack_k = fs.eigk + 1
        ncv = 3 * arpack_k + 3
        ws, vs = scipy.sparse.linalg.eigsh(L_csr, arpack_k, which='SM',
                                           v0=v0, ncv=ncv,
                                           return_eigenvectors=True)
        # skip the first eigenpair and put the eigenvectors in rows
        ws = ws[1:]
        vs = vs.T[1:]
    else:
        L = create_laplacian_matrix(fs.lena, fs.lenb, fs.lenc)
        ws, vs = EigUtil.eigh(L)
        # drop the last eigenpair and reverse the order of the others
        ws = ws[:-1][::-1]
        vs = vs[:-1][::-1]
    # get the eigenvector of interest
    eigenvalue = ws[fs.eigk - 1]
    v = vs[fs.eigk - 1]
    # the branches occupy consecutive slices after the hub entry v[0]
    widths = (fs.lena, fs.lenb, fs.lenc)
    binfos = [BranchInfo() for i in range(3)]
    offset = 1
    for i, binfo in enumerate(binfos):
        binfo.k = i + 1
        binfo.width = widths[i]
        # the branch component starts at the hub and runs to the pendant end
        w = np.array([v[0]] + v[offset:offset + binfo.width].tolist())
        offset += binfo.width
        # boundary values and finite differences at both ends of the branch
        npoints = len(w)
        if npoints >= 1:
            binfo.p0 = w[0]
            binfo.q0 = w[-1]
        if npoints >= 2:
            binfo.p1 = (w[1] - w[0]) / h
            binfo.q1 = (w[-1] - w[-2]) / h
        if npoints >= 3:
            binfo.p2 = (w[0] - 2 * w[1] + w[2]) / (h * h)
            binfo.q2 = (w[-3] - 2 * w[-2] + w[-1]) / (h * h)
    # begin writing the report
    np.set_printoptions(linewidth=200, threshold=10000)
    out = StringIO()
    # summarize global properties
    print >> out, 'total branch length:'
    print >> out, N - 1
    print >> out
    print >> out, 'total number of graph vertices including degree 2 vertices:'
    print >> out, N
    print >> out
    # sum the first derivatives near the hub;
    # N is at least two here so the sum is always defined
    p1sum = 0
    for binfo in binfos:
        if binfo.p1:
            p1sum += binfo.p1
    p1sum_string = str(p1sum)
    print >> out, "sum of f'(x) on all branches near the hub:", p1sum_string
    print >> out
    # summarize properties per branch per eigenvector
    for binfo in binfos:
        print >> out, 'summary of eigenvector', fs.eigk, 'on branch', binfo.k
        print >> out, 'unscaled branch length:', binfo.width
        if binfo.width:
            print >> out, 'internal', '-' * binfo.width, 'pendant'
        print >> out, "internal f(x): ", value_to_string(binfo.p0)
        print >> out, "internal f'(x): ", value_to_string(binfo.p1)
        print >> out, "internal f''(x):", value_to_string(binfo.p2)
        print >> out, "pendant f(x): ", value_to_string(binfo.q0)
        print >> out, "pendant f'(x): ", value_to_string(binfo.q1)
        print >> out, "pendant f''(x):", value_to_string(binfo.q2)
        print >> out
    if fs.showv:
        print >> out, 'the eigenvalue:'
        print >> out, eigenvalue
        print >> out
        print >> out, 'the whole eigenvector:'
        print >> out, v
        print >> out
    if fs.showmatrix:
        if fs.sparse:
            print >> out, 'Laplacian matrix (from sparse internal repr):'
            print >> out, L_csr.toarray()
            print >> out
        else:
            print >> out, 'Laplacian matrix (from dense internal repr):'
            print >> out, L
            print >> out
    return out.getvalue()
def get_response_content(fs):
    """
    Report leaf eigenvectors and related matrix products for a tree.
    @param fs: an object with a tree_string attribute
    @return: the multi-line text of the report
    """
    # parse the tree and count its vertices by kind
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    vertex_count = len(list(tree.preorder()))
    leaf_count = len(list(tree.gen_tips()))
    internal_count = vertex_count - leaf_count
    # vertex ids, ordered with the internal nodes first
    ordered_ids = get_ordered_ids(tree)
    leaf_ids = []
    for node in tree.gen_tips():
        leaf_ids.append(id(node))
    # distances among the leaves and among all of the vertices
    D_leaf = np.array(tree.get_partial_distance_matrix(leaf_ids))
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    # analyze the leaf distance matrix;
    # the leaf points are computed but do not enter the report
    X_leaf = Euclid.edm_to_points(D_leaf)
    leaf_covariance = Euclid.edm_to_dccov(D_leaf)
    w_leaf, v_leaf = EigUtil.eigh(leaf_covariance)
    V_leaf = np.array(v_leaf).T
    # center all points on the mean of the trailing leaf rows
    X = Euclid.edm_to_points(D)
    X -= np.mean(X[-leaf_count:], axis=0)
    leaf_rows = X[-leaf_count:]
    # rotate every point using the right singular vectors of the leaf rows
    U, s, Vt = np.linalg.svd(leaf_rows)
    rotated = np.dot(X, Vt.T)
    # keep the leading columns and scale by the root leaf eigenvalues
    rotated = rotated[:, :leaf_count - 1]
    WY = rotated / np.sqrt(w_leaf[:-1])
    # a product using the first few rows of WY
    W = WY[:internal_count]
    M_alpha = get_alpha_multiplier(D, leaf_count)
    MW_alpha = np.dot(M_alpha, W)
    # a product using the leaf eigenvectors
    M_beta = get_beta_multiplier(D, leaf_count)
    MY_beta = np.dot(M_beta, V_leaf)
    # every section is a heading, a value, and a blank line
    np.set_printoptions(linewidth=300, threshold=10000)
    sections = (
            ("leaf distance matrix:", D_leaf),
            ("eigenvalues derived from the leaf distance matrix", w_leaf),
            ("corresponding eigenvectors (as columns)", V_leaf),
            ("candidates for [W' Y']':", WY),
            ("candidates for W:", W),
            ("left multiplier of W:", M_alpha),
            ("each column is a (left multiplier, W) product:", MW_alpha),
            ("left multiplier of Y:", M_beta),
            ("each column is a (left multiplier, Y) product:", MY_beta),
            ("the above matrix divided by 2*eigenvalue:",
                MY_beta / (2 * np.array(w_leaf))),
            )
    out = StringIO()
    for heading, value in sections:
        print >> out, heading
        print >> out, value
        print >> out
    return out.getvalue()
def get_response_content(fs):
    """
    Report leaf eigenvectors and related matrix products for a tree.
    @param fs: an object with a tree_string attribute
    @return: the multi-line text of the report
    """
    # parse the tree and count its vertices by kind
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    vertex_count = len(list(tree.preorder()))
    leaf_count = len(list(tree.gen_tips()))
    internal_count = vertex_count - leaf_count
    # vertex ids, ordered with the internal nodes first
    ordered_ids = get_ordered_ids(tree)
    leaf_ids = []
    for node in tree.gen_tips():
        leaf_ids.append(id(node))
    # distances among the leaves and among all of the vertices
    D_leaf = np.array(tree.get_partial_distance_matrix(leaf_ids))
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    # analyze the leaf distance matrix;
    # the leaf points are computed but do not enter the report
    X_leaf = Euclid.edm_to_points(D_leaf)
    leaf_covariance = Euclid.edm_to_dccov(D_leaf)
    w_leaf, v_leaf = EigUtil.eigh(leaf_covariance)
    V_leaf = np.array(v_leaf).T
    # center all points on the mean of the trailing leaf rows
    X = Euclid.edm_to_points(D)
    X -= np.mean(X[-leaf_count:], axis=0)
    leaf_rows = X[-leaf_count:]
    # rotate every point using the right singular vectors of the leaf rows
    U, s, Vt = np.linalg.svd(leaf_rows)
    rotated = np.dot(X, Vt.T)
    # keep the leading columns and scale by the root leaf eigenvalues
    rotated = rotated[:, :leaf_count - 1]
    WY = rotated / np.sqrt(w_leaf[:-1])
    # a product using the first few rows of WY
    W = WY[:internal_count]
    M_alpha = get_alpha_multiplier(D, leaf_count)
    MW_alpha = np.dot(M_alpha, W)
    # a product using the leaf eigenvectors
    M_beta = get_beta_multiplier(D, leaf_count)
    MY_beta = np.dot(M_beta, V_leaf)
    # every section is a heading, a value, and a blank line
    np.set_printoptions(linewidth=300, threshold=10000)
    sections = (
            ('leaf distance matrix:', D_leaf),
            ('eigenvalues derived from the leaf distance matrix', w_leaf),
            ('corresponding eigenvectors (as columns)', V_leaf),
            ("candidates for [W' Y']':", WY),
            ('candidates for W:', W),
            ('left multiplier of W:', M_alpha),
            ('each column is a (left multiplier, W) product:', MW_alpha),
            ('left multiplier of Y:', M_beta),
            ('each column is a (left multiplier, Y) product:', MY_beta),
            ('the above matrix divided by 2*eigenvalue:',
                MY_beta / (2 * np.array(w_leaf))),
            )
    out = StringIO()
    for heading, value in sections:
        print >> out, heading
        print >> out, value
        print >> out
    return out.getvalue()
def process(args, hud_lines):
    """
    Summarize the principal component structure of a .hud file as text.
    @param args: an object with sum_to_n and sum_to_1 flags
    @param hud_lines: lines of a .hud file
    @return: results in convenient text form
    """
    placeholder = '[sample is too degenerate for estimation]'
    out = StringIO()
    # the decoder gives the ordered names and the rows of counts
    names, data = hud.decode(hud_lines)
    counts_full = np.array(data)
    m_full, n_full = counts_full.shape
    # keep only the columns that take more than one value
    informative = []
    for column in counts_full.T:
        if len(set(column)) > 1:
            informative.append(column)
    counts = np.vstack(informative).T
    m, n = counts.shape
    # center each column and divide by sqrt(mean * (1 - mean))
    means = counts.mean(axis=0)
    spread = np.sqrt(means * (1 - means))
    normalized = (counts - means) / spread
    # sample covariance matrix and its eigendecomposition
    covariance = np.dot(normalized, normalized.T) / n
    evals, evecs = EigUtil.eigh(covariance)
    L1 = evals.sum()
    L2 = np.dot(evals, evals)
    proportion = evals[0] / L1
    # Tracy-Widom statistic from the relative size of the first eigenvalue
    x = get_tracy_widom_statistic(m, n, m * proportion)
    # linkage correction
    n_prime = ((m + 1) * L1 * L1) / ((m - 1) * L2 - L1 * L1)
    corrected = n_prime < n
    # detect additional structure using alpha level of 0.05
    crit = 0.9794
    if corrected:
        x_prime = get_tracy_widom_statistic(
                m, n_prime, (m - 1) * proportion)
        sigs, insig = get_corrected_structure(crit, evals, m, n_prime)
        n_prime_report = n_prime
        x_prime_report = x_prime
    else:
        sigs, insig = get_corrected_structure(crit, evals, m, n)
        n_prime_report = placeholder
        x_prime_report = placeholder
    # sections made of a heading, a single value, and a blank line
    sections = (
            ('number of isolates:', m_full),
            ('total number of SNPs:', n_full),
            ('number of informative SNPs:', n),
            ('effective number of linkage-corrected SNPs:', n_prime_report),
            ('Tracy-Widom statistic (linkage-naive):', x),
            ('Tracy-Widom statistic (linkage-corrected):', x_prime_report),
            ('proportion of variance explained by principal axis:',
                proportion),
            ('number of significant axes of variation:', len(sigs)),
            )
    for heading, value in sections:
        print >> out, heading
        print >> out, value
        print >> out
    print >> out, 'significant Tracy-Widom statistics:'
    for sig in sigs:
        print >> out, sig
    print >> out
    print >> out, 'first insignificant Tracy-Widom statistic:'
    print >> out, insig
    print >> out
    # loadings on the principal axis, sorted by increasing loading
    print >> out, 'principal axis projection:'
    loadings = evecs[0] * evals[0]
    for loading, name in sorted(zip(loadings, names)):
        print >> out, '\t'.join([name, str(loading)])
    print >> out
    # optionally list the eigenvalues under one of two normalizations
    evals_sum = sum(evals)
    if args.sum_to_n:
        print >> out, 'eigenvalues normalized to sum to the number of OTUs:'
        for w in evals:
            print >> out, m_full * w / float(evals_sum)
    elif args.sum_to_1:
        print >> out, 'eigenvalues normalized to sum to 1.0:'
        for w in evals:
            print >> out, w / float(evals_sum)
    return out.getvalue().rstrip()