Example #1
 def _do_analysis(self, use_generalized_nj):
     """
     Perform the primary and secondary spectral splits of the tree.
     @param use_generalized_nj: True if we use an old method of outgrouping
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # assert that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains([self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains([self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(self.pruned_names, L, v)
     # define the bacteria indices vs the non-bacteria indices for the second split
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
         non_bacteria_indices = right_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
         non_bacteria_indices = left_indices
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # assert that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains([pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains([pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
         raise HandlingError('eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(pruned_names_secondary, L_secondary, v_secondary)
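The library internals are not shown on this page. As a point of reference, here is a minimal numpy-only sketch of what Euclid.edm_to_laplacian, BuildTreeTopology.laplacian_to_fiedler, and BuildTreeTopology.eigenvector_to_split are assumed to compute; the pseudoinverse-of-the-double-centered-matrix reading is an assumption (it is consistent with a later example that stores Euclid.edm_to_dccov(D_aug) as L_aug_pinv), and the _sketch names are hypothetical.

import numpy as np

def edm_to_laplacian_sketch(D):
    # double-center the distance matrix (Gower centering) and pseudo-invert;
    # this is an assumed reading of Euclid.edm_to_laplacian, not its source
    n = len(D)
    H = np.eye(n) - np.ones((n, n)) / n
    G = -0.5 * np.dot(H, np.dot(D, H))
    return np.linalg.pinv(G)

def laplacian_to_fiedler_sketch(L):
    # eigh returns eigenvalues in ascending order; index 0 is the ~zero
    # eigenvalue of a connected Laplacian, so index 1 gives the Fiedler vector
    w, V = np.linalg.eigh(L)
    return V[:, 1]

def eigenvector_to_split_sketch(v):
    # partition indices by the sign of the Fiedler vector entries
    left = tuple(i for i, x in enumerate(v) if x < 0)
    right = tuple(i for i, x in enumerate(v) if x >= 0)
    return left, right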
Example #2
def get_response_content(fs):
    D = fs.matrix
    L = Euclid.edm_to_laplacian(D)
    S = get_sigma_matrix(D)
    P = get_precision_matrix(S)
    # begin the response
    out = StringIO()
    print >> out, 'the Laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(L)
    print >> out
    print >> out, 'the sigma matrix corresponding to the Q matrix:'
    print >> out, MatrixUtil.m_to_string(S)
    print >> out
    print >> out, 'the precision matrix corresponding to the Q matrix:'
    print >> out, MatrixUtil.m_to_string(P)
    print >> out
    print >> out, 'the precision matrix minus the laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(P-L)
    print >> out
    print >> out, 'the double centered precision matrix minus the laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(MatrixUtil.double_centered(P)-L)
    print >> out
    print >> out, 'the pseudo-inverse of the double centered sigma matrix minus the laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(np.linalg.pinv(MatrixUtil.double_centered(S))-L)
    # write the response
    return out.getvalue()
Example #3
def get_eigendecomposition_report(D):
    """
    @param D: a distance matrix
    @return: a multi-line string
    """
    out = StringIO()
    # get some intermediate matrices and vectors
    L = Euclid.edm_to_laplacian(D)
    laplacian_fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    distance_fiedler = BuildTreeTopology.edm_to_fiedler(D)
    eigensplit = BuildTreeTopology.eigenvector_to_split(laplacian_fiedler)
    # report the two eigenvalue lists that should be the same
    HDH = MatrixUtil.double_centered(D)
    HSH = -0.5 * HDH
    w_distance, vt_distance = np.linalg.eigh(HSH)
    print >> out, 'the laplacian-derived and distance-derived eigenvalues:'
    w_laplacian, vt_laplacian = np.linalg.eigh(L)
    for a, b in zip(sorted(w_laplacian), sorted(w_distance)):
        print >> out, a, '\t', b
    print >> out
    # report the two fiedler vectors that should be the same
    print >> out, 'the laplacian-derived and distance-derived fiedler vectors:'
    for a, b in zip(laplacian_fiedler, distance_fiedler):
        print >> out, a, '\t', b
    return out.getvalue().strip()
Example #4
def get_response_content(fs):
    np.set_printoptions(linewidth=200)
    n = len(fs.D)
    # create the Laplacian matrix
    L = Euclid.edm_to_laplacian(fs.D)
    # create the Laplacian matrix with the extra node added
    L_dup = get_pseudoduplicate_laplacian(L, fs.strength)
    # get the principal axis projection from the Laplacian dup matrix
    X_w, X_v = EigUtil.principal_eigh(np.linalg.pinv(L_dup))
    L_dup_x = X_v * math.sqrt(X_w)
    # get masses summing to one
    m = np.array([1]*(n-1) + [2], dtype=float) / (n+1)
    # get the principal axis projection using the weight formula
    M = np.diag(np.sqrt(m))
    L_pinv = np.linalg.pinv(L)
    I = np.eye(n, dtype=float)
    E = I - np.outer(np.ones(n, dtype=float), m)
    ME = np.dot(M, E)
    Q = np.dot(ME, np.dot(L_pinv, ME.T))
    Q_w, Q_v = EigUtil.principal_eigh(Q)
    Q_x = Q_v * math.sqrt(Q_w) / np.sqrt(m)
    # make the response
    out = StringIO()
    print >> out, 'Laplacian matrix with pseudo-duplicate node:'
    print >> out, L_dup
    print >> out
    print >> out, 'principal axis projection:'
    print >> out, L_dup_x
    print >> out
    print >> out, 'principal axis projection using the weight formula:'
    print >> out, Q_x
    return out.getvalue()
Example #5
def get_response_content(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    """
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    # validate the input
    observed_label_set = set(node.get_name() for node in tree.gen_tips())
    if set(ordered_labels) != observed_label_set:
        msg = 'the labels should match the labels of the leaves of the tree'
        raise HandlingError(msg)
    # get the matrix of pairwise distances among the tips
    D = np.array(tree.get_distance_matrix(ordered_labels))
    L = Euclid.edm_to_laplacian(D)
    w, v = get_eigendecomposition(L)
    C = get_contrast_matrix(w, v)
    # set elements with small absolute value to zero
    C[abs(C) < fs.epsilon] = 0
    # start to prepare the response
    out = StringIO()
    if fs.plain_format:
        print >> out, MatrixUtil.m_to_string(C)
    elif fs.matlab_format:
        print >> out, MatrixUtil.m_to_matlab_string(C)
    elif fs.r_format:
        print >> out, MatrixUtil.m_to_R_string(C)
    # write the response
    return out.getvalue()
Example #6
def get_response_content(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    """
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    # validate the input
    observed_label_set = set(node.get_name() for node in tree.gen_tips())
    if set(ordered_labels) != observed_label_set:
        msg = 'the labels should match the labels of the leaves of the tree'
        raise HandlingError(msg)
    # get the matrix of pairwise distances among the tips
    D = np.array(tree.get_distance_matrix(ordered_labels))
    L = Euclid.edm_to_laplacian(D)
    w, v = get_eigendecomposition(L)
    C = get_contrast_matrix(w, v)
    # set elements with small absolute value to zero
    C[abs(C) < fs.epsilon] = 0
    # start to prepare the response
    out = StringIO()
    if fs.plain_format:
        print >> out, MatrixUtil.m_to_string(C)
    elif fs.matlab_format:
        print >> out, MatrixUtil.m_to_matlab_string(C)
    elif fs.r_format:
        print >> out, MatrixUtil.m_to_R_string(C)
    # write the response
    return out.getvalue()
Example #7
def get_response_content(fs):
    D = fs.matrix
    L = Euclid.edm_to_laplacian(D)
    S = get_sigma_matrix(D)
    P = get_precision_matrix(S)
    # begin the response
    out = StringIO()
    print >> out, 'the Laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(L)
    print >> out
    print >> out, 'the sigma matrix corresponding to the Q matrix:'
    print >> out, MatrixUtil.m_to_string(S)
    print >> out
    print >> out, 'the precision matrix corresponding to the Q matrix:'
    print >> out, MatrixUtil.m_to_string(P)
    print >> out
    print >> out, 'the precision matrix minus the laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(P - L)
    print >> out
    print >> out, 'the double centered precision matrix minus the laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(MatrixUtil.double_centered(P) - L)
    print >> out
    print >> out, 'the pseudo-inverse of the double centered sigma matrix minus the laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(
        np.linalg.pinv(MatrixUtil.double_centered(S)) - L)
    # write the response
    return out.getvalue()
Example #8
def get_eigendecomposition_report(D):
    """
    @param D: a distance matrix
    @return: a multi-line string
    """
    out = StringIO()
    # get some intermediate matrices and vectors
    L = Euclid.edm_to_laplacian(D)
    laplacian_fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    distance_fiedler = BuildTreeTopology.edm_to_fiedler(D)
    eigensplit = BuildTreeTopology.eigenvector_to_split(laplacian_fiedler)
    # report the two eigenvalue lists that should be the same
    HDH = MatrixUtil.double_centered(D)
    HSH = -0.5 * HDH
    w_distance, vt_distance = np.linalg.eigh(HSH)
    print >> out, 'the laplacian-derived and distance-derived eigenvalues:'
    w_laplacian, vt_laplacian = np.linalg.eigh(L)
    for a, b in zip(sorted(w_laplacian), sorted(w_distance)):
        print >> out, a, '\t', b
    print >> out
    # report the two fiedler vectors that should be the same
    print >> out, 'the laplacian-derived and distance-derived fiedler vectors:'
    for a, b in zip(laplacian_fiedler, distance_fiedler):
        print >> out, a, '\t', b
    return out.getvalue().strip()
Example #9
def get_response_content(fs):
    np.set_printoptions(linewidth=200)
    n = len(fs.D)
    # create the Laplacian matrix
    L = Euclid.edm_to_laplacian(fs.D)
    # create the Laplacian matrix with the extra node added
    L_dup = get_pseudoduplicate_laplacian(L, fs.strength)
    # get the principal axis projection from the Laplacian dup matrix
    X_w, X_v = EigUtil.principal_eigh(np.linalg.pinv(L_dup))
    L_dup_x = X_v * math.sqrt(X_w)
    # get masses summing to one
    m = np.array([1] * (n - 1) + [2], dtype=float) / (n + 1)
    # get the principal axis projection using the weight formula
    M = np.diag(np.sqrt(m))
    L_pinv = np.linalg.pinv(L)
    I = np.eye(n, dtype=float)
    E = I - np.outer(np.ones(n, dtype=float), m)
    ME = np.dot(M, E)
    Q = np.dot(ME, np.dot(L_pinv, ME.T))
    Q_w, Q_v = EigUtil.principal_eigh(Q)
    Q_x = Q_v * math.sqrt(Q_w) / np.sqrt(m)
    # make the response
    out = StringIO()
    print >> out, 'Laplacian matrix with pseudo-duplicate node:'
    print >> out, L_dup
    print >> out
    print >> out, 'principal axis projection:'
    print >> out, L_dup_x
    print >> out
    print >> out, 'principal axis projection using the weight formula:'
    print >> out, Q_x
    return out.getvalue()
Example #10
def get_response_content(fs):
    # read the distance matrix
    D = fs.matrix
    L = Euclid.edm_to_laplacian(D)
    resistor = -1/L
    resistor -= np.diag(np.diag(resistor))
    # return the edge resistor matrix
    return MatrixUtil.m_to_string(resistor) + '\n'
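The elementwise reciprocal works because the off-diagonal entries of a Laplacian are negated conductances, so -1/L[i, j] is the resistance of the direct edge between i and j. Below is a small self-contained check of that identity on a hypothetical weighted triangle graph; the weights are made up and are not data from this example.

import numpy as np

# conductances of a weighted triangle graph (hypothetical values)
C = np.array([
    [0.0, 2.0, 4.0],
    [2.0, 0.0, 1.0],
    [4.0, 1.0, 0.0]])
L = np.diag(C.sum(axis=1)) - C           # the graph Laplacian
resistor = -1.0 / L                      # elementwise, as in the example above
resistor -= np.diag(np.diag(resistor))   # zero out the diagonal
assert np.allclose(resistor[0, 1], 1.0 / 2.0)
assert np.allclose(resistor[1, 2], 1.0 / 1.0)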
Example #11
def get_splits(initial_distance_matrix,
               split_function,
               update_function,
               on_label_split=None):
    """
    This is the outermost function in this module.
    Get the set of splits implied by the tree that would be reconstructed.
    @param initial_distance_matrix: a distance matrix
    @param split_function: takes a distance matrix and returns an index split
    @param update_function: takes a distance matrix and an index subset and returns a distance matrix
    @param on_label_split: notifies the caller of the label split induced by an index split
    @return: a set of splits
    """
    n = len(initial_distance_matrix)
    # keep a stack of (label_set_per_vertex, distance_matrix) pairs
    initial_state = ([set([i]) for i in range(n)], initial_distance_matrix)
    stack = [initial_state]
    # process the stack in a depth first manner, building the split set
    label_split_set = set()
    while stack:
        label_sets, D = stack.pop()
        # if the matrix is small then we are done
        if len(D) < 4:
            continue
        # split the indices using the specified function
        try:
            index_split = split_function(D)
            # convert the index split to a label split
            label_split = index_split_to_label_split(index_split, label_sets)
            # notify the caller if a callback is requested
            if on_label_split:
                on_label_split(label_split)
            # add the split to the master set of label splits
            label_split_set.add(label_split)
            # for large matrices create the new label sets and the new conformant distance matrices
            a, b = index_split
            for index_selection, index_complement in ((a, b), (b, a)):
                if len(index_complement) > 2:
                    next_label_sets = SchurAlgebra.vmerge(
                        label_sets, index_selection)
                    next_D = update_function(D, index_selection)
                    next_state = (next_label_sets, next_D)
                    stack.append(next_state)
        except DegenerateSplitException, e:
            # we cannot recover from a degenerate split unless there are more than four indices
            if len(D) <= 4:
                continue
            # with more than four indices we can fall back to partial splits
            index_set = set([e.index])
            # get the next label sets
            next_label_sets = SchurAlgebra.vdelete(label_sets, index_set)
            # get the next conformant distance matrix by schur complementing out the offending index
            L = Euclid.edm_to_laplacian(D)
            L_small = SchurAlgebra.mschur(L, index_set)
            next_D = Euclid.laplacian_to_edm(L_small)
            next_state = (next_label_sets, next_D)
            stack.append(next_state)
    return label_split_set
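A hypothetical invocation of get_splits, wiring it to functions that appear elsewhere on this page (BuildTreeTopology.split_using_eigenvector and the update_using_laplacian helper); the choice of split and update functions is an assumption, and D stands for a distance matrix such as the ones built from trees in the other examples.

observed_label_splits = []
splits = get_splits(
    D,                                           # a tree-derived distance matrix
    BuildTreeTopology.split_using_eigenvector,   # Fiedler-sign index split
    update_using_laplacian,                      # Schur-merge distance update
    on_label_split=observed_label_splits.append)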
Example #12
def get_response_content(fs):
    # build the newick tree from the string
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    # get ordered ids with the leaves first
    ordered_ids = get_ordered_ids(tree)
    # get the distance matrix and the augmented distance matrix
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    D_aug = get_augmented_distance(D, nleaves, fs.ndups)
    # get the laplacian matrix
    L = Euclid.edm_to_laplacian(D)
    # get the schur complement
    R = SchurAlgebra.mschur(L, set(range(nleaves, nvertices)))
    R_pinv = np.linalg.pinv(R)
    vals, vecs = EigUtil.eigh(R_pinv)
    # get the scaled Fiedler vector for the Schur complement
    w, v = EigUtil.principal_eigh(R_pinv)
    fiedler = v * math.sqrt(w)
    # get the eigendecomposition of the centered augmented distance matrix
    L_aug_pinv = Euclid.edm_to_dccov(D_aug)
    vals_aug, vecs_aug = EigUtil.eigh(L_aug_pinv)
    # get the scaled Fiedler vector for the augmented Laplacian
    w_aug, v_aug = EigUtil.principal_eigh(L_aug_pinv)
    fiedler_aug = v_aug * math.sqrt(w_aug)
    # report the results
    np.set_printoptions(linewidth=300, threshold=10000)
    out = StringIO()
    print >> out, "Laplacian matrix:"
    print >> out, L
    print >> out
    print >> out, "Schur complement of Laplacian matrix:"
    print >> out, R
    print >> out
    print >> out, "scaled Fiedler vector of Schur complement:"
    print >> out, fiedler
    print >> out
    print >> out, "eigenvalues of pinv of Schur complement:"
    print >> out, vals
    print >> out
    print >> out, "corresponding eigenvectors of pinv of Schur complement:"
    print >> out, np.array(vecs).T
    print >> out
    print >> out
    print >> out, "augmented distance matrix:"
    print >> out, D_aug
    print >> out
    print >> out, "scaled Fiedler vector of augmented Laplacian limit:"
    print >> out, fiedler_aug
    print >> out
    print >> out, "eigenvalues of pinv of augmented Laplacian limit:"
    print >> out, vals_aug
    print >> out
    print >> out, "rows are eigenvectors of pinv of augmented Laplacian limit:"
    print >> out, np.array(vecs_aug)
    return out.getvalue()
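get_ordered_ids and get_augmented_distance are not shown in this snippet. Below is a plausible sketch of the ordering helper only, mirroring the explicit leaves-first id ordering that a later example on this page builds by hand; the name and the exact ordering of the internal ids are assumptions.

def get_ordered_ids_sketch(tree):
    # leaf ids first, then the remaining (internal) node ids
    leaf_ids = [id(node) for node in tree.gen_tips()]
    leaf_id_set = set(leaf_ids)
    internal_ids = [id(node) for node in tree.preorder()
                    if id(node) not in leaf_id_set]
    return leaf_ids + internal_ids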
Example #13
def get_response_content(fs):
    # build the newick tree from the string
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    # get ordered ids with the leaves first
    ordered_ids = get_ordered_ids(tree)
    # get the distance matrix and the augmented distance matrix
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    D_aug = get_augmented_distance(D, nleaves, fs.ndups)
    # get the laplacian matrix
    L = Euclid.edm_to_laplacian(D)
    # get the schur complement
    R = SchurAlgebra.mschur(L, set(range(nleaves, nvertices)))
    R_pinv = np.linalg.pinv(R)
    vals, vecs = EigUtil.eigh(R_pinv)
    # get the scaled Fiedler vector for the Schur complement
    w, v = EigUtil.principal_eigh(R_pinv)
    fiedler = v * math.sqrt(w)
    # get the eigendecomposition of the centered augmented distance matrix
    L_aug_pinv = Euclid.edm_to_dccov(D_aug)
    vals_aug, vecs_aug = EigUtil.eigh(L_aug_pinv)
    # get the scaled Fiedler vector for the augmented Laplacian
    w_aug, v_aug = EigUtil.principal_eigh(L_aug_pinv)
    fiedler_aug = v_aug * math.sqrt(w_aug)
    # report the results
    np.set_printoptions(linewidth=300, threshold=10000)
    out = StringIO()
    print >> out, 'Laplacian matrix:'
    print >> out, L
    print >> out
    print >> out, 'Schur complement of Laplacian matrix:'
    print >> out, R
    print >> out
    print >> out, 'scaled Fiedler vector of Schur complement:'
    print >> out, fiedler
    print >> out
    print >> out, 'eigenvalues of pinv of Schur complement:'
    print >> out, vals
    print >> out
    print >> out, 'corresponding eigenvectors of pinv of Schur complement:'
    print >> out, np.array(vecs).T
    print >> out
    print >> out
    print >> out, 'augmented distance matrix:'
    print >> out, D_aug
    print >> out
    print >> out, 'scaled Fiedler vector of augmented Laplacian limit:'
    print >> out, fiedler_aug
    print >> out
    print >> out, 'eigenvalues of pinv of augmented Laplacian limit:'
    print >> out, vals_aug
    print >> out
    print >> out, 'rows are eigenvectors of pinv of augmented Laplacian limit:'
    print >> out, np.array(vecs_aug)
    return out.getvalue()
Example #14
def update_using_laplacian(D, index_set):
    """
    Update the distance matrix by summing rows and columns of the removed indices.
    @param D: the distance matrix
    @param index_set: the set of indices that will be removed from the updated distance matrix
    @return: an updated distance matrix
    """
    L = Euclid.edm_to_laplacian(D)
    L_small = SchurAlgebra.mmerge(L, index_set)
    D_small = Euclid.laplacian_to_edm(L_small)
    return D_small
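SchurAlgebra.mmerge and SchurAlgebra.vmerge are not shown on this page. Based on the docstring above ("summing rows and columns of the removed indices") and on how their outputs are indexed in the other examples, here is a hypothetical numpy sketch of the pair; where the merged group is placed relative to the kept indices is a guess, but the two sketches are at least consistent with each other.

import numpy as np

def mmerge_sketch(M, index_set):
    # sum the rows and columns in index_set into a single row and column
    n = len(M)
    keep = [i for i in range(n) if i not in index_set]
    G = np.zeros((len(keep) + 1, n))
    for new_i, old_i in enumerate(keep):
        G[new_i, old_i] = 1.0
    for old_i in index_set:
        G[-1, old_i] = 1.0               # last output index is the merged group
    return np.dot(G, np.dot(M, G.T))

def vmerge_sketch(label_sets, index_set):
    # union the label sets at the selected indices, matching mmerge_sketch's ordering
    kept = [s for i, s in enumerate(label_sets) if i not in index_set]
    merged = set()
    for i in index_set:
        merged |= label_sets[i]
    return kept + [merged]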
Example #15
def update_using_laplacian(D, index_set):
    """
    Update the distance matrix by summing rows and columns of the removed indices.
    @param D: the distance matrix
    @param index_set: the set of indices that will be removed from the updated distance matrix
    @return: an updated distance matrix
    """
    L = Euclid.edm_to_laplacian(D)
    L_small = SchurAlgebra.mmerge(L, index_set)
    D_small = Euclid.laplacian_to_edm(L_small)
    return D_small
Example #16
def get_splits(initial_distance_matrix, split_function, update_function, on_label_split=None):
    """
    This is the outermost function in this module.
    Get the set of splits implied by the tree that would be reconstructed.
    @param initial_distance_matrix: a distance matrix
    @param split_function: takes a distance matrix and returns an index split
    @param update_function: takes a distance matrix and an index subset and returns a distance matrix
    @param on_label_split: notifies the caller of the label split induced by an index split
    @return: a set of splits
    """
    n = len(initial_distance_matrix)
    # keep a stack of (label_set_per_vertex, distance_matrix) pairs
    initial_state = ([set([i]) for i in range(n)], initial_distance_matrix)
    stack = [initial_state]
    # process the stack in a depth first manner, building the split set
    label_split_set = set()
    while stack:
        label_sets, D = stack.pop()
        # if the matrix is small then we are done
        if len(D) < 4:
            continue
        # split the indices using the specified function
        try:
            index_split = split_function(D)
            # convert the index split to a label split
            label_split = index_split_to_label_split(index_split, label_sets)
            # notify the caller if a callback is requested
            if on_label_split:
                on_label_split(label_split)
            # add the split to the master set of label splits
            label_split_set.add(label_split)
            # for large matrices create the new label sets and the new conformant distance matrices
            a, b = index_split
            for index_selection, index_complement in ((a, b), (b, a)):
                if len(index_complement) > 2:
                    next_label_sets = SchurAlgebra.vmerge(label_sets, index_selection)
                    next_D = update_function(D, index_selection)
                    next_state = (next_label_sets, next_D)
                    stack.append(next_state)
        except DegenerateSplitException, e:
            # we cannot recover from a degenerate split unless there are more than four indices
            if len(D) <= 4:
                continue
            # with more than four indices we can fall back to partial splits
            index_set = set([e.index])
            # get the next label sets
            next_label_sets = SchurAlgebra.vdelete(label_sets, index_set)
            # get the next conformant distance matrix by schur complementing out the offending index
            L = Euclid.edm_to_laplacian(D)
            L_small = SchurAlgebra.mschur(L, index_set)
            next_D = Euclid.laplacian_to_edm(L_small)
            next_state = (next_label_sets, next_D)
            stack.append(next_state)
    return label_split_set
Example #17
 def get_verbose_summary(self):
     """
     @return: a multiline string
     """
     # begin the response
     out = StringIO()
     # show the number of taxa in various domains
     print >> out, self._get_name_summary()
     print >> out
     # show the pruned full tree
     formatted_tree_string = NewickIO.get_narrow_newick_string(self.pruned_tree, 120) 
     print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> out, formatted_tree_string
     print >> out
     # split the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # report the eigendecomposition
     print >> out, get_eigendecomposition_report(D)
     print >> out
     # report the clade intersections of sides of the split
     side_names = [set(self.pruned_names[i] for i in side) for side in eigensplit]
     print >> out, 'domains represented by each side of the primary split:'
     print >> out, 'the left side has:\t', ', '.join(self._get_domains(side_names[0]))
     print >> out, 'the right side has:\t', ', '.join(self._get_domains(side_names[1]))
     print >> out
     # prepare to do the secondary splits
     left_indices, right_indices = eigensplit
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     # do the secondary splits
     for index_selection, index_complement in ((left_indices, right_indices), (right_indices, left_indices)):
         L_secondary = SchurAlgebra.mmerge(L, index_complement)
         next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
         v = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
         left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
         left_sublabels = set()
         for i in left_subindices:
             left_sublabels.update(next_label_sets[i])
         right_sublabels = set()
         for i in right_subindices:
             right_sublabels.update(next_label_sets[i])
         left_subnames = set(self.pruned_names[i] for i in left_sublabels)
         right_subnames = set(self.pruned_names[i] for i in right_sublabels)
         print >> out, 'domains represented by a subsplit:'
         print >> out, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
         print >> out, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
         print >> out
     # return the multiline string
     return out.getvalue().strip()
Example #18
def get_response_content(fs):
    # read the matrix
    D = fs.matrix
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if not ordered_labels:
        raise HandlingError('no ordered taxa were provided')
    if len(ordered_labels) != len(set(ordered_labels)):
        raise HandlingError('the ordered taxa should be unique')
    # get the label selection and its complement
    min_selected_labels = 2
    min_unselected_labels = 1
    selected_labels = set(Util.get_stripped_lines(StringIO(fs.selection)))
    if len(selected_labels) < min_selected_labels:
        raise HandlingError('at least %d taxa should be selected to be grouped' % min_selected_labels)
    # get the set of labels in the complement
    unselected_labels = set(ordered_labels) - selected_labels
    if len(unselected_labels) < min_unselected_labels:
        raise HandlingError('at least %d taxa should remain outside the selected group' % min_unselected_labels)
    # assert that no bizarre labels were selected
    weird_labels = selected_labels - set(ordered_labels)
    if weird_labels:
        raise HandlingError('some selected taxa are invalid: ' + str(weird_labels))
    # assert that the size of the distance matrix is compatible with the number of ordered labels
    if len(D) != len(ordered_labels):
        raise HandlingError('the number of listed taxa does not match the number of rows in the distance matrix')
    # get the set of selected indices and its complement
    n = len(D)
    index_selection = set(i for i, label in enumerate(ordered_labels) if label in selected_labels)
    index_complement = set(range(n)) - index_selection
    # begin the response
    out = StringIO()
    # get the ordered list of sets of indices to merge
    merged_indices = SchurAlgebra.vmerge([set([x]) for x in range(n)], index_selection)
    # calculate the new distance matrix
    L = Euclid.edm_to_laplacian(D)
    L_merged = SchurAlgebra.mmerge(L, index_selection)
    D_merged = Euclid.laplacian_to_edm(L_merged)
    # print the output distance matrix and the labels of its rows
    print >> out, 'new distance matrix:'
    print >> out, MatrixUtil.m_to_string(D_merged)
    print >> out
    print >> out, 'new taxon labels:'
    for merged_index_set in merged_indices:
        if len(merged_index_set) == 1:
            print >> out, ordered_labels[merged_index_set.pop()]
        else:
            print >> out, '{' + ', '.join(selected_labels) + '}'
    # write the response
    return out.getvalue()
Example #19
def get_response_content(fs):
    out = StringIO()
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # assert that each node is named
    for node in tree.preorder():
        if not node.name:
            raise HandlingError('each node in the tree must have a name')
    # get the function that converts a matrix to a string
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # print the results for the split of the full tree
    print >> out, get_full_tree_message(tree, m_to_string)
    print >> out
    # get the alphabetically ordered names of the tips
    ordered_tip_names = list(sorted(tip.get_name() for tip in tree.gen_tips()))
    # get the corresponding ordered ids
    tip_name_to_id = dict((tip.get_name(), id(tip)) for tip in tree.gen_tips())
    ordered_tip_ids = [tip_name_to_id[name] for name in ordered_tip_names]
    # get the distance matrix defined by the tips of the tree
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    #print >> out, 'the Laplacian obtained from the full tree by Schur complementation:'
    #print >> out, MatrixUtil.m_to_string(L)
    #print >> out
    print >> out, 'the Schur complement in the Laplacian of the full tree scaled by', fs.scaling_factor
    print >> out, m_to_string(fs.scaling_factor * L)
    print >> out
    #L_merged = SchurAlgebra.mmerge(L, set([3,4,5]))
    #print >> out, 'the merged Laplacian:'
    #print >> out, MatrixUtil.m_to_string(L_merged)
    #print >> out
    # get the Fiedler cut of the Schur Laplacian
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    print >> out, 'the Fiedler split of the Schur complement of the full tree:'
    for name, value in zip(ordered_tip_names, v):
        print >> out, name, ':', value
    print >> out
    # get the Fiedler cuts of Schur complements of child trees
    print >> out, get_child_messages(L, eigensplit, ordered_tip_names,
                                     m_to_string, fs.scaling_factor)
    print >> out
    # get the Fiedler cuts of Schur complements of subtrees
    print >> out, get_subtree_messages(D, eigensplit, ordered_tip_names)
    # return the response
    return out.getvalue()
Example #20
 def _get_any_selection(self, D):
     """
     @param D: a numpy or row major distance matrix
     @return: a set of selected indices representing one of the two parts of the bipartition
     """
     # The fiedler eigenvector is calculated for the laplacian matrix associated with the distance matrix.
     # The signs of the elements of the fiedler eigenvector determine group assignment.
     L = Euclid.edm_to_laplacian(np.array(D))
     w, v = scipy.linalg.eigh(L)
     eigenvalue_info = list(sorted((abs(x), i) for i, x in enumerate(w)))
     stationary_eigenvector_index = eigenvalue_info[0][1]
     fiedler_eigenvector_index = eigenvalue_info[1][1]
     fiedler_eigenvector = v.T[fiedler_eigenvector_index]
     index_selection = set(i for i, value in enumerate(fiedler_eigenvector) if value > 0)
     return index_selection
Example #21
def get_response_content(fs):
    out = StringIO()
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # assert that each node is named
    for node in tree.preorder():
        if not node.name:
            raise HandlingError('each node in the tree must have a name')
    # get the function that converts a matrix to a string
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # print the results for the split of the full tree
    print >> out, get_full_tree_message(tree, m_to_string)
    print >> out
    # get the alphabetically ordered names of the tips
    ordered_tip_names = list(sorted(tip.get_name() for tip in tree.gen_tips()))
    # get the corresponding ordered ids
    tip_name_to_id = dict((tip.get_name(), id(tip)) for tip in tree.gen_tips())
    ordered_tip_ids = [tip_name_to_id[name] for name in ordered_tip_names]
    # get the distance matrix defined by the tips of the tree
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    #print >> out, 'the Laplacian obtained from the full tree by Schur complementation:'
    #print >> out, MatrixUtil.m_to_string(L)
    #print >> out
    print >> out, 'the Schur complement in the Laplacian of the full tree scaled by', fs.scaling_factor
    print >> out, m_to_string(fs.scaling_factor * L)
    print >> out
    #L_merged = SchurAlgebra.mmerge(L, set([3,4,5]))
    #print >> out, 'the merged Laplacian:'
    #print >> out, MatrixUtil.m_to_string(L_merged)
    #print >> out
    # get the Fiedler cut of the Schur Laplacian
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    print >> out, 'the Fiedler split of the Schur complement of the full tree:'
    for name, value in zip(ordered_tip_names, v):
        print >> out, name, ':', value
    print >> out
    # get the Fiedler cuts of Schur complements of child trees
    print >> out, get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, fs.scaling_factor)
    print >> out
    # get the Fiedler cuts of Schur complements of subtrees
    print >> out, get_subtree_messages(D, eigensplit, ordered_tip_names)
    # return the response
    return out.getvalue()
Example #22
def process(npoints, nseconds):
    """
    @param npoints: attempt to form each counterexample from this many points
    @param nseconds: allow this many seconds to run
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    best_result = None
    nchecked = 0
    while time.time() - start_time < nseconds:
        # look for a counterexample
        points = sample_points(npoints)
        D = points_to_edm(points)
        L = Euclid.edm_to_laplacian(D)
        L_small = SchurAlgebra.mmerge(L, set([0, 1]))
        w = np.linalg.eigvalsh(L_small)
        D_small = Euclid.laplacian_to_edm(L_small)
        result = Counterexample(points, D, w, D_small)
        # see if the counterexample is interesting
        if best_result is None:
            best_result = result
        elif min(result.L_eigenvalues) < min(best_result.L_eigenvalues):
            best_result = result
        nchecked += 1
    out = StringIO()
    print >> out, 'checked', nchecked, 'matrices each formed from', npoints, 'points'
    print >> out
    print >> out, 'eigenvalues of the induced matrix with lowest eigenvalue:'
    for value in reversed(sorted(best_result.L_eigenvalues)):
        print >> out, value
    print >> out
    print >> out, 'corresponding induced distance matrix:'
    print >> out, MatrixUtil.m_to_string(best_result.D_small)
    print >> out
    print >> out, 'the original distance matrix corresponding to this matrix:'
    print >> out, MatrixUtil.m_to_string(best_result.D)
    print >> out
    print >> out, 'the points that formed the original distance matrix:'
    for point in best_result.points:
        print >> out, '\t'.join(str(x) for x in point)
    return out.getvalue().strip()
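sample_points, points_to_edm, and the Counterexample record used by this search loop are not shown. The following are hypothetical stand-ins, consistent with how they are constructed and read above: the namedtuple field order follows the constructor call, and squared pairwise Euclidean distances are assumed for the EDM.

import collections
import numpy as np

Counterexample = collections.namedtuple(
    'Counterexample', ['points', 'D', 'L_eigenvalues', 'D_small'])

def sample_points(npoints, ndim=3):
    # random point coordinates; the true sampling scheme is not shown here
    return np.random.randn(npoints, ndim)

def points_to_edm(points):
    # matrix of pairwise squared Euclidean distances (assumed convention)
    diff = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    return np.sum(diff * diff, axis=-1)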
Example #23
def process(npoints, nseconds):
    """
    @param npoints: attempt to form each counterexample from this many points
    @param nseconds: allow this many seconds to run
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    best_result = None
    nchecked = 0
    while time.time() - start_time < nseconds:
        # look for a counterexample
        points = sample_points(npoints)
        D = points_to_edm(points)
        L = Euclid.edm_to_laplacian(D)
        L_small = SchurAlgebra.mmerge(L, set([0, 1]))
        w = np.linalg.eigvalsh(L_small)
        D_small = Euclid.laplacian_to_edm(L_small)
        result = Counterexample(points, D, w, D_small)
        # see if the counterexample is interesting
        if best_result is None:
            best_result = result
        elif min(result.L_eigenvalues) < min(best_result.L_eigenvalues):
            best_result = result
        nchecked += 1
    out = StringIO()
    print >> out, 'checked', nchecked, 'matrices each formed from', npoints, 'points'
    print >> out
    print >> out, 'eigenvalues of the induced matrix with lowest eigenvalue:'
    for value in reversed(sorted(best_result.L_eigenvalues)):
        print >> out, value
    print >> out
    print >> out, 'corresponding induced distance matrix:'
    print >> out, MatrixUtil.m_to_string(best_result.D_small)
    print >> out
    print >> out, 'the original distance matrix corresponding to this matrix:'
    print >> out, MatrixUtil.m_to_string(best_result.D)
    print >> out
    print >> out, 'the points that formed the original distance matrix:'
    for point in best_result.points:
        print >> out, '\t'.join(str(x) for x in point)
    return out.getvalue().strip()
Example #24
def get_response_content(fs):
    out = StringIO()
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    n = len(tree.preorder())
    # get the ids ordered so that the leaf ids are first
    leaf_name_id_pairs = [(node.get_name(), id(node))
            for node in tree.gen_tips()]
    ordered_leaf_ids = [node_id
            for name, node_id in sorted(leaf_name_id_pairs)]
    all_ids = [id(node) for node in tree.preorder()]
    ordered_non_leaf_ids = list(set(all_ids) - set(ordered_leaf_ids))
    ordered_ids = ordered_leaf_ids + ordered_non_leaf_ids
    # get the full laplacian matrix with small values set to zero
    D_full = tree.get_full_distance_matrix(ordered_ids)
    L_full = Euclid.edm_to_laplacian(np.array(D_full))
    epsilon = 1e-10
    L_full[np.abs(L_full) < epsilon] = 0
    # get the partial distance matrix
    D_partial = tree.get_partial_distance_matrix(ordered_leaf_ids)
    # show the partial distance matrix
    print >> out, 'partial distance matrix (leaves only):'
    print >> out, MatrixUtil.m_to_string(D_partial)
    print >> out
    # show the full laplacian matrix
    print >> out, 'full laplacian matrix (leaves first):'
    print >> out, MatrixUtil.m_to_string(L_full)
    print >> out
    # show the output matrices
    names = ('first', 'second', 'third')
    functions = (transform_a, transform_b, transform_c)
    for name, fn in zip(names, functions):
        S = fn(L_full, D_partial)
        print >> out, name + ' transformation:'
        print >> out, MatrixUtil.m_to_string(S)
        print >> out
    # write the response
    return out.getvalue()
Example #25
def get_standard_response(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    # begin the response
    out = StringIO()
    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    print >> out, len(archaea_names), 'archaea names in the original tree'
    print >> out, len(bacteria_names), 'bacteria names in the original tree'
    print >> out, len(eukaryota_names), 'eukaryota names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # get the pruned full tree
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = list(node.get_name() for node in pruned_full_tree.gen_tips())
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    formatted_tree_string = NewickIO.get_narrow_newick_string(pruned_full_tree, 120) 
    print >> out, formatted_tree_string
    print >> out
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report the clade intersections of sides of the split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    clade_name_pairs = ((archaea_names, 'archaea'), (bacteria_names, 'bacteria'), (eukaryota_names, 'eukaryota'))
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # prepare to do the secondary splits
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    # get a secondary split
    for index_selection, index_complement in ((left_indices, right_indices), (right_indices, left_indices)):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
        left_sublabels = set()
        for i in left_subindices:
            left_sublabels.update(next_label_sets[i])
        right_sublabels = set()
        for i in right_subindices:
            right_sublabels.update(next_label_sets[i])
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        print >> out, 'clade intersections with a subsplit:'
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                print >> out, 'the left side intersects', clade_name
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                print >> out, 'the right side intersects', clade_name
        print >> out
    # show debug info
    print >> out, 'archaea names:'
    print >> out, '\n'.join(x for x in sorted(archaea_names))
    print >> out
    print >> out, 'bacteria names:'
    print >> out, '\n'.join(x for x in sorted(bacteria_names))
    print >> out
    print >> out, 'eukaryota names:'
    print >> out, '\n'.join(x for x in sorted(eukaryota_names))
    print >> out
    # return the response
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
Example #26
def get_response_content(fs):
    D = fs.matrix
    L = Euclid.edm_to_laplacian(D)
    return MatrixUtil.m_to_string(L) + '\n'
Example #27
def get_response_content(fs):
    D = fs.matrix
    L = Euclid.edm_to_laplacian(D)
    return MatrixUtil.m_to_string(L) + '\n'
Example #28
class TreeSearch:
    """
    This is a virtual base class.
    """

    def __init__(self):
        # boolean requirements defined by the user
        self.informative_children = None
        self.force_difference = None
        self.informative_full_split = None
        self.invalid_dendrogram = None
        # search options defined by the subclass
        self.tree = None
        self.desired_primary_split = None
        self.id_to_index = None
        # initialize the counts that are tracked for bookkeeping
        self.aug_split_collision_count = 0
        self.aug_split_degenerate_count = 0
        self.error_primary_split_count = 0
        self.invalid_primary_split_count = 0
        self.degenerate_primary_split_count = 0
        self.undesired_primary_split_count = 0
        self.desired_primary_split_count = 0
        self.uninformative_child_count = 0
        self.informative_child_count = 0
        self.valid_dendrogram_count = 0
        self.success_count = 0

    def is_initialized(self):
        required_data = [
                self.informative_children,
                self.force_difference,
                self.informative_full_split,
                self.invalid_dendrogram,
                self.tree,
                self.desired_primary_split,
                self.id_to_index]
        return not (None in required_data)

    def get_result_text(self):
        """
        @return: a multi-line string of text
        """
        out = StringIO()
        if self.force_difference or self.informative_full_split:
            print >> out, 'full graph split stats:'
            print >> out, self.aug_split_collision_count,
            print >> out, 'full graph splits collided with the desired primary split'
            print >> out, self.aug_split_degenerate_count,
            print >> out, 'full graph splits were degenerate'
            print >> out
        print >> out, 'primary split stats:'
        print >> out, self.error_primary_split_count,
        print >> out, 'errors in finding the primary split (should be 0)'
        print >> out, self.invalid_primary_split_count,
        print >> out, 'invalid primary splits (should be 0)'
        print >> out, self.degenerate_primary_split_count,
        print >> out, 'degenerate primary splits'
        print >> out, self.undesired_primary_split_count,
        print >> out, 'primary splits were not the target split'
        print >> out, self.desired_primary_split_count,
        print >> out, 'primary splits were the target split'
        print >> out
        if self.informative_children:
            print >> out, 'secondary split stats:'
            print >> out, self.uninformative_child_count,
            print >> out, 'samples had at least one uninformative child tree'
            print >> out, self.informative_child_count,
            print >> out, 'samples had two informative child trees'
            print >> out
        if self.invalid_dendrogram:
            print >> out, 'naive dendrogram stats:'
            print >> out, self.valid_dendrogram_count,
            print >> out, 'naive dendrograms were valid'
            print >> out
        return out.getvalue().strip()

    def do_search(self, nseconds, sampling_function):
        """
        @param nseconds: allowed search time or None
        @param sampling_function: a function that samples a branch length
        @return: True if a tree was found that met the criteria
        """
        if not self.is_initialized():
            raise RuntimeError('the search was not sufficiently initialized')
        true_splits = self.tree.get_nontrivial_splits()
        start_time = time.time()
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                return False
            # assign new sampled branch lengths
            for branch in self.tree.get_branches():
                branch.length = sampling_function()
            # get the distance matrix so we can use a library function to get the split
            D = np.array(self.tree.get_distance_matrix())
            ntips = len(D)
            # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
            if self.force_difference or self.informative_full_split:
                A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
                L_aug = Euclid.adjacency_to_laplacian(A_aug)
                v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
                left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
                left = [x for x in left_aug if x in range(ntips)]
                right = [x for x in right_aug if x in range(ntips)]
                leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
                if self.force_difference:
                    if leaf_eigensplit_aug == self.desired_primary_split:
                        self.aug_split_collision_count += 1
                        continue
                if self.informative_full_split:
                    if min(len(s) for s in leaf_eigensplit_aug) < 2:
                        self.aug_split_degenerate_count += 1
                        continue
            # get the eigensplit
            try:
                eigensplit = BuildTreeTopology.split_using_eigenvector(D)
            except BuildTreeTopology.DegenerateSplitException, e:
                self.degenerate_primary_split_count += 1
                continue
            except BuildTreeTopology.InvalidSpectralSplitException, e:
                self.error_primary_split_count += 1
                continue
            if eigensplit not in true_splits:
                raise RuntimeError('INVALID SPLIT: ' + self.tree.get_newick_string())
            if eigensplit != self.desired_primary_split:
                self.undesired_primary_split_count += 1
                continue
            self.desired_primary_split_count += 1
            # check the splits of the two child trees
            degenerate_subsplit_count = 0
            L = Euclid.edm_to_laplacian(D)
            for side in eigensplit:
                L_child = SchurAlgebra.mmerge(L, side)
                v = BuildTreeTopology.laplacian_to_fiedler(L_child)
                child_eigensplit = BuildTreeTopology.eigenvector_to_split(v)
                if min(len(s) for s in child_eigensplit) < 2:
                    degenerate_subsplit_count += 1
            if degenerate_subsplit_count:
                self.uninformative_child_count += 1
            else:
                self.informative_child_count += 1
            if self.informative_children:
                if degenerate_subsplit_count:
                    continue
            # check the dendrogram
            if self.invalid_dendrogram:
                labels = range(len(D))
                hierarchy = Dendrogram.get_hierarchy(D, Dendrogram.spectral_split, labels)
                dendrogram_splits = set(Dendrogram.hierarchy_to_nontrivial_splits(hierarchy))
                if dendrogram_splits == true_splits:
                    self.valid_dendrogram_count += 1
                    continue
            # the tree has met all of the requirements
            return True
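A hypothetical driver for do_search, assuming a concrete TreeSearch subclass (named here only for illustration) has filled in the tree, desired_primary_split, and id_to_index attributes checked by is_initialized; the exponential branch-length sampler is just an illustrative choice.

import random

search = SomeConcreteTreeSearch()   # hypothetical subclass that sets tree, etc.
search.informative_children = True
search.force_difference = False
search.informative_full_split = False
search.invalid_dendrogram = False

def sample_branch_length():
    # sample a random branch length (illustrative distribution)
    return random.expovariate(10.0)

if search.do_search(10, sample_branch_length):
    print(search.get_result_text())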
Example #29
 def _do_analysis(self, use_generalized_nj):
     """
     Perform the primary and secondary spectral splits of the tree.
     @param use_generalized_nj: True if we use an old method of outgrouping
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # assert that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains(
         [self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains(
         [self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(
         self.pruned_names, L, v)
     # define the bacteria indices vs the non-bacteria indices for the second split
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
         non_bacteria_indices = right_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
         non_bacteria_indices = left_indices
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(
             D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                           bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(
         v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # assert that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
         raise HandlingError('eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(
         pruned_names_secondary, L_secondary, v_secondary)
Example #30
def get_standard_response(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    # begin the response
    out = StringIO()
    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    print >> out, len(archaea_names), 'archaea names in the original tree'
    print >> out, len(bacteria_names), 'bacteria names in the original tree'
    print >> out, len(eukaryota_names), 'eukaryota names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # get the pruned full tree
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = list(node.get_name()
                         for node in pruned_full_tree.gen_tips())
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    formatted_tree_string = NewickIO.get_narrow_newick_string(
        pruned_full_tree, 120)
    print >> out, formatted_tree_string
    print >> out
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report the clade intersections of sides of the split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    clade_name_pairs = ((archaea_names, 'archaea'), (bacteria_names, 'bacteria'), (eukaryota_names, 'eukaryota'))
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # prepare to do the secondary splits
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    # get a secondary split
    for index_selection, index_complement in ((left_indices, right_indices),
                                              (right_indices, left_indices)):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                              index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
        left_sublabels = set()
        for i in left_subindices:
            left_sublabels.update(next_label_sets[i])
        right_sublabels = set()
        for i in right_subindices:
            right_sublabels.update(next_label_sets[i])
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        print >> out, 'clade intersections with a subsplit:'
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                print >> out, 'the left side intersects', clade_name
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                print >> out, 'the right side intersects', clade_name
        print >> out
    # show debug info
    print >> out, 'archaea names:'
    print >> out, '\n'.join(x for x in sorted(archaea_names))
    print >> out
    print >> out, 'bacteria names:'
    print >> out, '\n'.join(x for x in sorted(bacteria_names))
    print >> out
    print >> out, 'eukaryota names:'
    print >> out, '\n'.join(x for x in sorted(eukaryota_names))
    print >> out
    # return the response
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
Example #31
 def get_verbose_summary(self):
     """
     @return: a multiline string
     """
     # begin the response
     out = StringIO()
     # show the number of taxa in various domains
     print >> out, self._get_name_summary()
     print >> out
     # show the pruned full tree
     formatted_tree_string = NewickIO.get_narrow_newick_string(
         self.pruned_tree, 120)
     print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> out, formatted_tree_string
     print >> out
     # split the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # report the eigendecomposition
     print >> out, get_eigendecomposition_report(D)
     print >> out
     # report the clade intersections of sides of the split
     side_names = [
         set(self.pruned_names[i] for i in side) for side in eigensplit
     ]
     print >> out, 'domains represented by each side of the primary split:'
     print >> out, 'the left side has:\t', ', '.join(
         self._get_domains(side_names[0]))
     print >> out, 'the right side has:\t', ', '.join(
         self._get_domains(side_names[1]))
     print >> out
     # prepare to do the secondary splits
     left_indices, right_indices = eigensplit
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     # do the secondary splits
      for index_selection, index_complement in ((left_indices, right_indices), (right_indices, left_indices)):
         L_secondary = SchurAlgebra.mmerge(L, index_complement)
         next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                               index_complement)
         v = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
          left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
         left_sublabels = set()
         for i in left_subindices:
             left_sublabels.update(next_label_sets[i])
         right_sublabels = set()
         for i in right_subindices:
             right_sublabels.update(next_label_sets[i])
         left_subnames = set(self.pruned_names[i] for i in left_sublabels)
         right_subnames = set(self.pruned_names[i] for i in right_sublabels)
         print >> out, 'domains represented by a subsplit:'
         print >> out, 'the left side has:\t', ', '.join(
             self._get_domains(left_subnames))
         print >> out, 'the right side has:\t', ', '.join(
             self._get_domains(right_subnames))
         print >> out
     # return the multiline string
     return out.getvalue().strip()