Esempio n. 1
0
 def _do_analysis(self, use_generalized_nj):
     """
     Compute the primary and the secondary spectral split of the pruned tree.
     The primary split is required to put all bacteria on one side.
     The bacteria indices are then merged and the secondary split is
     required to put all eukaryota on one side.
     The results are stored in self.first_split_object and
     self.second_split_object.
     @param use_generalized_nj: True if we use an old method of outgrouping
     (BuildTreeTopology.update_generalized_nj on the distance matrix),
     False to merge the bacteria directly in the Laplacian
     @raise HandlingError: if the bacteria are missing from both sides of the
     first split or present on both sides, or if the eukaryota are present
     on both sides of the second split
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # assert that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains([self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains([self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(self.pruned_names, L, v)
     # define the bacteria indices vs the non-bacteria indices for the second split
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
         non_bacteria_indices = right_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
         non_bacteria_indices = left_indices
     else:
         # without this branch the code below would fail with an
         # UnboundLocalError because bacteria_indices was never assigned
         raise HandlingError('bacteria were not found on either side of the first split')
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     # track which original indices correspond to each row of the merged matrix
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     # a label set with a single element keeps its original name;
     # any other label set is reported under the name 'all-bacteria'
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # assert that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains([pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains([pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
         raise HandlingError('eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(pruned_names_secondary, L_secondary, v_secondary)
Esempio n. 2
0
 def get_verbose_summary(self):
     """
     Describe the pruned tree, its eigendecomposition, and the domains
     found on each side of the primary split and of the two secondary splits.
     @return: a multiline string
     """
     report = StringIO()
     # summarize the taxa names
     print >> report, self._get_name_summary()
     print >> report
     # draw the pruned full tree, wrapped using a width argument of 120
     newick_text = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
     print >> report, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> report, newick_text
     print >> report
     # compute the primary split from the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     L = Euclid.edm_to_laplacian(D)
     fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
     primary_split = BuildTreeTopology.eigenvector_to_split(fiedler)
     # report the eigendecomposition
     print >> report, get_eigendecomposition_report(D)
     print >> report
     # report the domains found on each side of the primary split
     names_per_side = []
     for side in primary_split:
         names_per_side.append(set(self.pruned_names[i] for i in side))
     print >> report, 'domains represented by each side of the primary split:'
     print >> report, 'the left side has:\t', ', '.join(self._get_domains(names_per_side[0]))
     print >> report, 'the right side has:\t', ', '.join(self._get_domains(names_per_side[1]))
     print >> report
     # each secondary split merges one side of the primary split;
     # the right side is merged first and then the left side
     left_indices, right_indices = primary_split
     singleton_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     for merged_indices in (right_indices, left_indices):
         L_merged = SchurAlgebra.mmerge(L, merged_indices)
         merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_indices)
         sub_fiedler = BuildTreeTopology.laplacian_to_fiedler(L_merged)
         left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(sub_fiedler)
         # expand the indices of the merged matrix back to the original labels
         left_sublabels = set().union(*[merged_label_sets[i] for i in left_subindices])
         right_sublabels = set().union(*[merged_label_sets[i] for i in right_subindices])
         left_subnames = set(self.pruned_names[label] for label in left_sublabels)
         right_subnames = set(self.pruned_names[label] for label in right_sublabels)
         print >> report, 'domains represented by a subsplit:'
         print >> report, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
         print >> report, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
         print >> report
     # the surrounding whitespace is removed from the result
     return report.getvalue().strip()
Esempio n. 3
0
def get_eigendecomposition_report(D):
    """
    Show the eigenvalues and the fiedler vectors obtained two ways:
    from the Laplacian and directly from the distance matrix.
    @param D: a distance matrix
    @return: a multi-line string
    """
    report = StringIO()
    # derive the Laplacian and both versions of the fiedler vector
    L = Euclid.edm_to_laplacian(D)
    fiedler_from_laplacian = BuildTreeTopology.laplacian_to_fiedler(L)
    fiedler_from_distances = BuildTreeTopology.edm_to_fiedler(D)
    # the split is not reported; the call is kept as in the original code
    BuildTreeTopology.eigenvector_to_split(fiedler_from_laplacian)
    # only the eigenvalues are used from each decomposition
    eigenvalues_from_distances = np.linalg.eigh(-0.5 * MatrixUtil.double_centered(D))[0]
    print >> report, 'the laplacian-derived and distance-derived eigenvalues:'
    eigenvalues_from_laplacian = np.linalg.eigh(L)[0]
    # report the two eigenvalue lists that should be the same
    eigenvalue_pairs = zip(sorted(eigenvalues_from_laplacian), sorted(eigenvalues_from_distances))
    for from_laplacian, from_distances in eigenvalue_pairs:
        print >> report, from_laplacian, '\t', from_distances
    print >> report
    # report the two fiedler vectors that should be the same
    print >> report, 'the laplacian-derived and distance-derived fiedler vectors:'
    for from_laplacian, from_distances in zip(fiedler_from_laplacian, fiedler_from_distances):
        print >> report, from_laplacian, '\t', from_distances
    return report.getvalue().strip()
Esempio n. 4
0
def get_full_tree_message(tree, m_to_string):
    """
    Report the reciprocal adjacency matrix of the full tree and the
    Fiedler valuation of each of its named nodes.
    @param tree: each node in this tree must have a name
    @param m_to_string: a function that converts a matrix to a string
    @return: a multi-line message about the fiedler vector of the full tree
    """
    report = StringIO()
    # order the node names alphabetically
    names = sorted(node.get_name() for node in tree.preorder())
    # look up the id of the node that carries each name
    id_by_name = {}
    for node in tree.preorder():
        id_by_name[node.get_name()] = id(node)
    ids = [id_by_name[name] for name in names]
    # show the weighted adjacency matrix of the full tree
    A = np.array(tree.get_affinity_matrix(ids))
    print >> report, 'the weighted reciprocal adjacency matrix of the full tree:'
    print >> report, m_to_string(get_reciprocal_matrix(A))
    print >> report
    # the fiedler vector is computed from the Laplacian of the full tree
    fiedler = BuildTreeTopology.laplacian_to_fiedler(Euclid.adjacency_to_laplacian(A))
    print >> report, 'the Fiedler split of the full tree:'
    for name, valuation in zip(names, fiedler):
        print >> report, name, ':', valuation
    return report.getvalue().strip()
Esempio n. 5
0
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string,
                       scaling_factor):
    """
    For each side of the split, merge the other side in the Laplacian
    and report the resulting matrix and its Fiedler valuations.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    report = StringIO()
    ntips = len(L)
    singleton_label_sets = [set([i]) for i in range(ntips)]
    every_label = set(range(ntips))
    # the child trees are numbered from one in the report
    for child_number, child in enumerate(eigensplit, 1):
        # everything outside of the child is merged
        merged_labels = every_label - child
        L_child = SchurAlgebra.mmerge(L, merged_labels)
        print >> report, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> report, m_to_string(scaling_factor * L_child)
        print >> report
        child_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_labels)
        child_fiedler = BuildTreeTopology.laplacian_to_fiedler(L_child)
        print >> report, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for label_set, valuation in zip(child_label_sets, child_fiedler):
            print >> report, label_set_to_string(label_set, ordered_tip_names), ':', valuation
        print >> report
    return report.getvalue().strip()
Esempio n. 6
0
def get_eigendecomposition_report(D):
    """
    Show the eigenvalues and the fiedler vectors obtained two ways:
    from the Laplacian and directly from the distance matrix.
    @param D: a distance matrix
    @return: a multi-line string
    """
    report = StringIO()
    # derive the Laplacian and both versions of the fiedler vector
    L = Euclid.edm_to_laplacian(D)
    fiedler_from_laplacian = BuildTreeTopology.laplacian_to_fiedler(L)
    fiedler_from_distances = BuildTreeTopology.edm_to_fiedler(D)
    # the split is not reported; the call is kept as in the original code
    BuildTreeTopology.eigenvector_to_split(fiedler_from_laplacian)
    # only the eigenvalues are used from each decomposition
    eigenvalues_from_distances = np.linalg.eigh(-0.5 * MatrixUtil.double_centered(D))[0]
    print >> report, 'the laplacian-derived and distance-derived eigenvalues:'
    eigenvalues_from_laplacian = np.linalg.eigh(L)[0]
    # report the two eigenvalue lists that should be the same
    eigenvalue_pairs = zip(sorted(eigenvalues_from_laplacian), sorted(eigenvalues_from_distances))
    for from_laplacian, from_distances in eigenvalue_pairs:
        print >> report, from_laplacian, '\t', from_distances
    print >> report
    # report the two fiedler vectors that should be the same
    print >> report, 'the laplacian-derived and distance-derived fiedler vectors:'
    for from_laplacian, from_distances in zip(fiedler_from_laplacian, fiedler_from_distances):
        print >> report, from_laplacian, '\t', from_distances
    return report.getvalue().strip()
Esempio n. 7
0
def get_full_tree_message(tree, m_to_string):
    """
    Report the reciprocal adjacency matrix of the full tree and the
    Fiedler valuation of each of its named nodes.
    @param tree: each node in this tree must have a name
    @param m_to_string: a function that converts a matrix to a string
    @return: a multi-line message about the fiedler vector of the full tree
    """
    report = StringIO()
    # order the node names alphabetically
    names = sorted(node.get_name() for node in tree.preorder())
    # look up the id of the node that carries each name
    id_by_name = {}
    for node in tree.preorder():
        id_by_name[node.get_name()] = id(node)
    ids = [id_by_name[name] for name in names]
    # show the weighted adjacency matrix of the full tree
    A = np.array(tree.get_affinity_matrix(ids))
    print >> report, 'the weighted reciprocal adjacency matrix of the full tree:'
    print >> report, m_to_string(get_reciprocal_matrix(A))
    print >> report
    # the fiedler vector is computed from the Laplacian of the full tree
    fiedler = BuildTreeTopology.laplacian_to_fiedler(Euclid.adjacency_to_laplacian(A))
    print >> report, 'the Fiedler split of the full tree:'
    for name, valuation in zip(names, fiedler):
        print >> report, name, ':', valuation
    return report.getvalue().strip()
Esempio n. 8
0
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, scaling_factor):
    """
    For each side of the split, merge the other side in the Laplacian
    and report the resulting matrix and its Fiedler valuations.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    report = StringIO()
    ntips = len(L)
    singleton_label_sets = [set([i]) for i in range(ntips)]
    every_label = set(range(ntips))
    # the child trees are numbered from one in the report
    for child_number, child in enumerate(eigensplit, 1):
        # everything outside of the child is merged
        merged_labels = every_label - child
        L_child = SchurAlgebra.mmerge(L, merged_labels)
        print >> report, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> report, m_to_string(scaling_factor * L_child)
        print >> report
        child_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_labels)
        child_fiedler = BuildTreeTopology.laplacian_to_fiedler(L_child)
        print >> report, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for label_set, valuation in zip(child_label_sets, child_fiedler):
            print >> report, label_set_to_string(label_set, ordered_tip_names), ':', valuation
        print >> report
    return report.getvalue().strip()
Esempio n. 9
0
def get_response_content(fs):
    """
    Sample random graphs and, for each connected one, check whether a cut of
    the Schur complement taken with respect to each single vertex can be
    extended to a split of the whole graph into two connected subgraphs.
    @param fs: provides ngraphs, nvertices, pedge, fiedler_cut and random_cut
    @return: a multi-line string reporting incompatible matrices and counts
    """
    out = StringIO()
    unconnected_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    for graph_index in range(fs.ngraphs):
        G = erdos_renyi(fs.nvertices, fs.pedge)
        # graphs that are not connected are only counted
        if not is_connected(G):
            unconnected_count += 1
            continue
        # add interesting edge weights
        add_exponential_weights(G)
        # turn the adjacency matrix into a laplacian matrix
        L = Euclid.adjacency_to_laplacian(G)
        for v in range(fs.nvertices):
            # map each index of the reduced matrix to its index in the full matrix
            small_index_to_big_index = dict(enumerate(
                i for i in range(fs.nvertices) if i != v))
            # take the schur complement with respect to the given vertex
            L_reduced = get_single_element_schur_complement(L, v)
            assert len(L_reduced) == len(L) - 1
            # get the loadings of the vertices of the reduced graph
            if fs.fiedler_cut:
                Y_reduced = BuildTreeTopology.laplacian_to_fiedler(L_reduced)
            elif fs.random_cut:
                Y_reduced = get_random_vector(L_reduced)
            assert len(Y_reduced) == len(L_reduced)
            # try both a negative and a positive valuation for the removed vertex
            found_valid_split = False
            for augmented_loading in (-1.0, 1.0):
                # get the augmented split vector for this assignment of the removed vertex
                Y_full = [0]*len(G)
                for i_reduced, loading in enumerate(Y_reduced):
                    Y_full[small_index_to_big_index[i_reduced]] = loading
                Y_full[v] = augmented_loading
                assert len(Y_full) == len(G)
                # the split is valid if both of the subgraphs it defines are connected
                subgraph_a, subgraph_b = list(gen_subgraphs(G, Y_full))
                if is_connected(subgraph_a) and is_connected(subgraph_b):
                    found_valid_split = True
            if found_valid_split:
                valid_split_count += 1
                continue
            # a valid split was not found so show the matrix
            print >> out, 'Found a matrix that was split incompatibly by a cut of its schur complement!'
            print >> out, 'matrix:'
            print >> out, MatrixUtil.m_to_string(G)
            print >> out, 'index that was removed:', v
            invalid_split_count += 1
    # show the number of connected and of unconnected graphs
    print >> out, 'this many random graphs were connected:', fs.ngraphs - unconnected_count
    print >> out, 'this many random graphs were not connected:', unconnected_count
    print >> out, 'this many splits were valid:', valid_split_count
    print >> out, 'this many splits were invalid:', invalid_split_count
    return out.getvalue()
Esempio n. 10
0
def get_response_content(fs):
    """
    Report Fiedler splits of a named tree: of the full tree, of the
    Laplacian restricted to the tips, of the child trees, and of the subtrees.
    @param fs: provides tree, plain_matrix, latex_matrix and scaling_factor
    @return: a multi-line string
    @raise HandlingError: if a node of the tree has no name
    """
    report = StringIO()
    # parse the tree and require a name for each of its nodes
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    if any(not node.name for node in tree.preorder()):
        raise HandlingError('each node in the tree must have a name')
    # choose the function that converts a matrix to a string
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # report the split of the full tree
    print >> report, get_full_tree_message(tree, m_to_string)
    print >> report
    # order the tip names alphabetically and look up the matching node ids
    ordered_tip_names = sorted(tip.get_name() for tip in tree.gen_tips())
    id_by_tip_name = {}
    for tip in tree.gen_tips():
        id_by_tip_name[tip.get_name()] = id(tip)
    ordered_tip_ids = [id_by_tip_name[name] for name in ordered_tip_names]
    # get the Laplacian from the distance matrix defined by the tips of the tree
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    print >> report, 'the Schur complement in the Laplacian of the full tree scaled by', fs.scaling_factor
    print >> report, m_to_string(fs.scaling_factor * L)
    print >> report
    # get the Fiedler cut of the Schur Laplacian
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    print >> report, 'the Fiedler split of the Schur complement of the full tree:'
    for tip_name, valuation in zip(ordered_tip_names, fiedler):
        print >> report, tip_name, ':', valuation
    print >> report
    # report the Fiedler cuts of Schur complements of child trees
    child_messages = get_child_messages(
        L, eigensplit, ordered_tip_names, m_to_string, fs.scaling_factor)
    print >> report, child_messages
    print >> report
    # report the Fiedler cuts of Schur complements of subtrees
    print >> report, get_subtree_messages(D, eigensplit, ordered_tip_names)
    return report.getvalue()
Esempio n. 11
0
def get_response_content(fs):
    """
    Report Fiedler splits of a named tree: of the full tree, of the
    Laplacian restricted to the tips, of the child trees, and of the subtrees.
    @param fs: provides tree, plain_matrix, latex_matrix and scaling_factor
    @return: a multi-line string
    @raise HandlingError: if a node of the tree has no name
    """
    report = StringIO()
    # parse the tree and require a name for each of its nodes
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    if any(not node.name for node in tree.preorder()):
        raise HandlingError('each node in the tree must have a name')
    # choose the function that converts a matrix to a string
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # report the split of the full tree
    print >> report, get_full_tree_message(tree, m_to_string)
    print >> report
    # order the tip names alphabetically and look up the matching node ids
    ordered_tip_names = sorted(tip.get_name() for tip in tree.gen_tips())
    id_by_tip_name = {}
    for tip in tree.gen_tips():
        id_by_tip_name[tip.get_name()] = id(tip)
    ordered_tip_ids = [id_by_tip_name[name] for name in ordered_tip_names]
    # get the Laplacian from the distance matrix defined by the tips of the tree
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    print >> report, 'the Schur complement in the Laplacian of the full tree scaled by', fs.scaling_factor
    print >> report, m_to_string(fs.scaling_factor * L)
    print >> report
    # get the Fiedler cut of the Schur Laplacian
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    print >> report, 'the Fiedler split of the Schur complement of the full tree:'
    for tip_name, valuation in zip(ordered_tip_names, fiedler):
        print >> report, tip_name, ':', valuation
    print >> report
    # report the Fiedler cuts of Schur complements of child trees
    child_messages = get_child_messages(
        L, eigensplit, ordered_tip_names, m_to_string, fs.scaling_factor)
    print >> report, child_messages
    print >> report
    # report the Fiedler cuts of Schur complements of subtrees
    print >> report, get_subtree_messages(D, eigensplit, ordered_tip_names)
    return report.getvalue()
Esempio n. 12
0
 def do_search(self, nseconds, sampling_function):
     """
     Repeatedly sample branch lengths for the tree and compute spectral
     splits of the result, counting the samples that are rejected.
     @param nseconds: allowed search time or None
     @param sampling_function: a function that samples a branch length
     @return: True if a tree was found that met the criteria;
     NOTE(review): only the False return on timeout is visible in this
     portion of the function -- confirm where True is returned
     @raise RuntimeError: if the search was not sufficiently initialized
     """
     if not self.is_initialized():
         raise RuntimeError("the search was not sufficiently initialized")
     # NOTE(review): not used in the visible portion of this function
     true_splits = self.tree.get_nontrivial_splits()
     start_time = time.time()
     while True:
         elapsed_time = time.time() - start_time
         # a false value of nseconds disables the time limit
         if nseconds and elapsed_time > nseconds:
             return False
         # assign new sampled branch lengths
         for branch in self.tree.get_branches():
             branch.length = sampling_function()
         # get the distance matrix so we can use a library function to get the split
         D = np.array(self.tree.get_distance_matrix())
         ntips = len(D)
         # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
         if self.force_difference or self.informative_full_split:
             A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
             L_aug = Euclid.adjacency_to_laplacian(A_aug)
             v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
             left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
             # keep only the indices below ntips; the set is built once
             # instead of rebuilding range(ntips) for every membership test
             tip_indices = set(range(ntips))
             left = [x for x in left_aug if x in tip_indices]
             right = [x for x in right_aug if x in tip_indices]
             leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
             # reject the sample if the split of the leaves is the desired primary split
             if self.force_difference:
                 if leaf_eigensplit_aug == self.desired_primary_split:
                     self.aug_split_collision_count += 1
                     continue
             # reject the sample if a side of the split has fewer than two leaves
             if self.informative_full_split:
                 if min(len(s) for s in leaf_eigensplit_aug) < 2:
                     self.aug_split_degenerate_count += 1
                     continue
         # get the eigensplit
         try:
             eigensplit = BuildTreeTopology.split_using_eigenvector(D)
         except BuildTreeTopology.DegenerateSplitException:
             self.degenerate_primary_split_count += 1
             continue
         except BuildTreeTopology.InvalidSpectralSplitException:
             self.error_primary_split_count += 1
             continue
Esempio n. 13
0
 def do_search(self, nseconds, sampling_function):
     """
     Repeatedly sample branch lengths for the tree and compute spectral
     splits of the result, counting the samples that are rejected.
     @param nseconds: allowed search time or None
     @param sampling_function: a function that samples a branch length
     @return: True if a tree was found that met the criteria;
     NOTE(review): only the False return on timeout is visible in this
     portion of the function -- confirm where True is returned
     @raise RuntimeError: if the search was not sufficiently initialized
     """
     if not self.is_initialized():
         raise RuntimeError('the search was not sufficiently initialized')
     # NOTE(review): not used in the visible portion of this function
     true_splits = self.tree.get_nontrivial_splits()
     start_time = time.time()
     while True:
         elapsed_time = time.time() - start_time
         # a false value of nseconds disables the time limit
         if nseconds and elapsed_time > nseconds:
             return False
         # assign new sampled branch lengths
         for branch in self.tree.get_branches():
             branch.length = sampling_function()
         # get the distance matrix so we can use a library function to get the split
         D = np.array(self.tree.get_distance_matrix())
         ntips = len(D)
         # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
         if self.force_difference or self.informative_full_split:
             A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
             L_aug = Euclid.adjacency_to_laplacian(A_aug)
             v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
             left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
             # keep only the indices below ntips; the set is built once
             # instead of rebuilding range(ntips) for every membership test
             tip_indices = set(range(ntips))
             left = [x for x in left_aug if x in tip_indices]
             right = [x for x in right_aug if x in tip_indices]
             leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
             # reject the sample if the split of the leaves is the desired primary split
             if self.force_difference:
                 if leaf_eigensplit_aug == self.desired_primary_split:
                     self.aug_split_collision_count += 1
                     continue
             # reject the sample if a side of the split has fewer than two leaves
             if self.informative_full_split:
                 if min(len(s) for s in leaf_eigensplit_aug) < 2:
                     self.aug_split_degenerate_count += 1
                     continue
         # get the eigensplit
         try:
             eigensplit = BuildTreeTopology.split_using_eigenvector(D)
         except BuildTreeTopology.DegenerateSplitException:
             self.degenerate_primary_split_count += 1
             continue
         except BuildTreeTopology.InvalidSpectralSplitException:
             self.error_primary_split_count += 1
             continue
Esempio n. 14
0
def get_response_content(fs):
    """
    Draw a graph whose node colors are defined by per-node valuations.
    @param fs: provides graph_data, total_width, total_height, border,
    label_from_0, label_from_1, color_x, color_fiedler_weighted,
    color_fiedler_unweighted, flip, imageformat and black
    @return: the value returned by get_image_string
    @raise HandlingError: if the drawable area is too small
    or if a CairoUtil.CairoUtilError occurs while drawing
    """
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # the drawable area excludes the border on each side
    drawable_width = fs.total_width - 2*fs.border
    drawable_height = fs.total_height - 2*fs.border
    if drawable_width < 1 or drawable_height < 1:
        raise HandlingError(
            'the image dimensions do not allow for enough drawable area')
    # decide the number from which labels are counted, if any
    if fs.label_from_0:
        show_labels = 0
    elif fs.label_from_1:
        show_labels = 1
    else:
        show_labels = None
    # define the valuations which will define the node colors
    if fs.color_x:
        # use the first coordinate of each point
        valuations = [p[0] for p in points]
    elif fs.color_fiedler_weighted or fs.color_fiedler_unweighted:
        if fs.color_fiedler_weighted:
            # weight each edge by the reciprocal of its length
            X = [np.array(p) for p in points]
            weights = [1.0 / np.linalg.norm(X[j] - X[i]) for i, j in edges]
        else:
            weights = [1.0 for e in edges]
        valuations = BuildTreeTopology.laplacian_to_fiedler(
            edges_to_laplacian(edges, weights))
    else:
        valuations = [0 for p in points]
    # optionally negate the valuations before they are mapped to colors
    valuations = [-v if fs.flip else v for v in valuations]
    colors = valuations_to_colors(valuations)
    # draw the image
    info = ImageInfo(
        fs.total_width, fs.total_height, fs.black, show_labels,
        fs.border, Form.g_imageformat_to_ext[fs.imageformat])
    try:
        return get_image_string(points, edges, colors, info)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
Esempio n. 15
0
def get_response_content(fs):
    """
    Draw a graph whose node colors are defined by per-node valuations.
    @param fs: provides graph_data, total_width, total_height, border,
    label_from_0, label_from_1, color_x, color_fiedler_weighted,
    color_fiedler_unweighted, flip, imageformat and black
    @return: the value returned by get_image_string
    @raise HandlingError: if the drawable area is too small
    or if a CairoUtil.CairoUtilError occurs while drawing
    """
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # the drawable area excludes the border on each side
    drawable_width = fs.total_width - 2 * fs.border
    drawable_height = fs.total_height - 2 * fs.border
    if drawable_width < 1 or drawable_height < 1:
        raise HandlingError(
            'the image dimensions do not allow for enough drawable area')
    # decide the number from which labels are counted, if any
    if fs.label_from_0:
        show_labels = 0
    elif fs.label_from_1:
        show_labels = 1
    else:
        show_labels = None
    # define the valuations which will define the node colors
    if fs.color_x:
        # use the first coordinate of each point
        valuations = [p[0] for p in points]
    elif fs.color_fiedler_weighted or fs.color_fiedler_unweighted:
        if fs.color_fiedler_weighted:
            # weight each edge by the reciprocal of its length
            X = [np.array(p) for p in points]
            weights = [1.0 / np.linalg.norm(X[j] - X[i]) for i, j in edges]
        else:
            weights = [1.0 for e in edges]
        valuations = BuildTreeTopology.laplacian_to_fiedler(
            edges_to_laplacian(edges, weights))
    else:
        valuations = [0 for p in points]
    # optionally negate the valuations before they are mapped to colors
    valuations = [-v if fs.flip else v for v in valuations]
    colors = valuations_to_colors(valuations)
    # draw the image
    info = ImageInfo(
        fs.total_width, fs.total_height, fs.black, show_labels,
        fs.border, Form.g_imageformat_to_ext[fs.imageformat])
    try:
        return get_image_string(points, edges, colors, info)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
Esempio n. 16
0
def get_standard_response(fs):
    """
    Report how spectral splits of the pruned full tree relate to the clades.
    The report covers the primary split of the tree and, for each side of
    that split, the split obtained after merging the other side.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    out = StringIO()
    # pair each module-level clade name collection with its display name
    clade_name_pairs = (
            (archaea_names, 'archaea'),
            (bacteria_names, 'bacteria'),
            (eukaryota_names, 'eukaryota'))
    # summarize the original data
    print >> out, 'data summary before removing branches with zero length:'
    for names, description in (
            (archaea_names, 'archaea names in the original tree'),
            (bacteria_names, 'bacteria names in the original tree'),
            (eukaryota_names, 'eukaryota names in the original tree'),
            (all_names, 'total names in the original tree')):
        print >> out, len(names), description
    print >> out
    # prune the tree and summarize what is left
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # show the pruned tree itself
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out
    # compute the primary split from the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    primary_fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(primary_fiedler)
    print >> out, get_eigendecomposition_report(D)
    # report which clades appear on each side of the primary split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    print >> out, 'clade intersections with each side of the split:'
    for side_name, side in zip(('left', 'right'), side_names):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out

    def get_subnames(subindices, label_sets):
        # union the label sets of the given vertices and look up their names
        labels = set()
        for i in subindices:
            labels.update(label_sets[i])
        return set(ordered_names[i] for i in labels)

    # each secondary split merges one side of the primary split,
    # starting with the right side
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(ordered_names))]
    for merged_indices in (right_indices, left_indices):
        L_merged = SchurAlgebra.mmerge(L, merged_indices)
        merged_label_sets = SchurAlgebra.vmerge(
                singleton_label_sets, merged_indices)
        secondary_fiedler = BuildTreeTopology.laplacian_to_fiedler(L_merged)
        left_subindices, right_subindices = (
                BuildTreeTopology.eigenvector_to_split(secondary_fiedler))
        left_subnames = get_subnames(left_subindices, merged_label_sets)
        right_subnames = get_subnames(right_subindices, merged_label_sets)
        print >> out, 'clade intersections with a subsplit:'
        for subnames, message in (
                (left_subnames, 'the left side intersects'),
                (right_subnames, 'the right side intersects')):
            for clade, clade_name in clade_name_pairs:
                if clade & subnames:
                    print >> out, message, clade_name
        print >> out
    # show debug info
    for title, names in (
            ('archaea names:', archaea_names),
            ('bacteria names:', bacteria_names),
            ('eukaryota names:', eukaryota_names)):
        print >> out, title
        print >> out, '\n'.join(sorted(names))
        print >> out
    # the surrounding whitespace of the report is stripped
    return [('Content-Type', 'text/plain')], out.getvalue().strip()
Esempio n. 17
0
 def get_verbose_summary(self):
     """
     Describe the pruned tree, its primary spectral split, and the two
     secondary splits obtained by merging each side of the primary split.
     @return: a multiline string
     """
     out = StringIO()
     names = self.pruned_names
     # show the number of taxa in various domains
     print >> out, self._get_name_summary()
     print >> out
     # show the pruned full tree
     tree_string = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
     print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> out, tree_string
     print >> out
     # compute the primary split from the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(names))
     L = Euclid.edm_to_laplacian(D)
     primary_fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(primary_fiedler)
     # report the eigendecomposition
     print >> out, get_eigendecomposition_report(D)
     print >> out
     # report the domains found on each side of the primary split
     side_names = [set(names[i] for i in side) for side in eigensplit]
     print >> out, 'domains represented by each side of the primary split:'
     print >> out, 'the left side has:\t', ', '.join(
             self._get_domains(side_names[0]))
     print >> out, 'the right side has:\t', ', '.join(
             self._get_domains(side_names[1]))
     print >> out

     def get_subnames(subindices, label_sets):
         # union the label sets of the given vertices and look up their names
         labels = set()
         for i in subindices:
             labels.update(label_sets[i])
         return set(names[i] for i in labels)

     # each secondary split merges one side of the primary split,
     # starting with the right side
     left_indices, right_indices = eigensplit
     singleton_label_sets = [set([i]) for i in range(len(names))]
     for merged_indices in (right_indices, left_indices):
         L_secondary = SchurAlgebra.mmerge(L, merged_indices)
         merged_label_sets = SchurAlgebra.vmerge(
                 singleton_label_sets, merged_indices)
         secondary_fiedler = BuildTreeTopology.laplacian_to_fiedler(
                 L_secondary)
         left_subindices, right_subindices = (
                 BuildTreeTopology.eigenvector_to_split(secondary_fiedler))
         left_subnames = get_subnames(left_subindices, merged_label_sets)
         right_subnames = get_subnames(right_subindices, merged_label_sets)
         print >> out, 'domains represented by a subsplit:'
         for heading, subnames in (
                 ('the left side has:\t', left_subnames),
                 ('the right side has:\t', right_subnames)):
             print >> out, heading, ', '.join(self._get_domains(subnames))
         print >> out
     # the surrounding whitespace of the summary is stripped
     return out.getvalue().strip()
Esempio n. 18
0
 def _do_analysis(self, use_generalized_nj):
     """
     Do some splits of the tree.
     The primary spectral split must put all bacteria on one side.
     The bacteria are then merged and a secondary split is computed,
     which must put all eukaryota on one side.
     The results are stored in self.first_split_object and
     self.second_split_object.
     @param use_generalized_nj: True if we use an old method of outgrouping
     @raise HandlingError: if bacteria appear on both sides or on neither
     side of the first split, or if eukaryota appear on both sides of the
     second split
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # check that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains(
         [self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains(
         [self.pruned_names[x] for x in right_indices])
     bacteria_on_left = 'bacteria' in left_domains
     bacteria_on_right = 'bacteria' in right_domains
     if bacteria_on_left and bacteria_on_right:
         raise HandlingError('bacteria were not defined by the first split')
     if not (bacteria_on_left or bacteria_on_right):
         # without this check the bacteria indices would never be bound
         # and the secondary split would fail with an UnboundLocalError
         raise HandlingError(
             'no bacteria were found on either side of the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(
         self.pruned_names, L, v)
     # the side holding the bacteria is the one merged for the second split
     if bacteria_on_left:
         bacteria_indices = left_indices
     else:
         bacteria_indices = right_indices
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(
             D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                           bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(
         v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     # name each vertex of the secondary problem;
     # a label set with more than one label is the merged bacteria vertex
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = next(iter(label_set))
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # check that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota'
                                              in right_subdomains):
         raise HandlingError(
             'eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(
         pruned_names_secondary, L_secondary, v_secondary)
Esempio n. 19
0
def get_standard_response(fs):
    """
    Build a plain text report about spectral splits of the pruned full tree.
    The report lists which clades intersect each side of the primary split
    and each side of the two secondary splits.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    out = StringIO()
    # each clade is a module-level name collection with a display name
    clade_name_pairs = (
            (archaea_names, 'archaea'),
            (bacteria_names, 'bacteria'),
            (eukaryota_names, 'eukaryota'))

    def report_intersections(names, prefix_words):
        # print one line for each clade that shares a name with the given set
        for clade, clade_name in clade_name_pairs:
            if clade & names:
                for word in prefix_words:
                    print >> out, word,
                print >> out, clade_name

    def expand_to_names(subindices, label_sets):
        # union the label sets of the given vertices and look up their names
        labels = set()
        for i in subindices:
            labels.update(label_sets[i])
        return set(ordered_names[i] for i in labels)

    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    original_counts = (
            (archaea_names, 'archaea names in the original tree'),
            (bacteria_names, 'bacteria names in the original tree'),
            (eukaryota_names, 'eukaryota names in the original tree'),
            (all_names, 'total names in the original tree'))
    for names, description in original_counts:
        print >> out, len(names), description
    print >> out
    # get the pruned full tree and its tip names
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    eigensplit = BuildTreeTopology.eigenvector_to_split(
            BuildTreeTopology.laplacian_to_fiedler(L))
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report the clade intersections of sides of the primary split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    print >> out, 'clade intersections with each side of the split:'
    for side_name, side in zip(('left', 'right'), side_names):
        report_intersections(side, ('the', side_name, 'side intersects'))
    print >> out
    # each secondary split merges one side of the primary split,
    # starting with the right side
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(ordered_names))]
    for merged_indices in (right_indices, left_indices):
        L_merged = SchurAlgebra.mmerge(L, merged_indices)
        merged_label_sets = SchurAlgebra.vmerge(
                singleton_label_sets, merged_indices)
        left_subindices, right_subindices = (
                BuildTreeTopology.eigenvector_to_split(
                    BuildTreeTopology.laplacian_to_fiedler(L_merged)))
        left_subnames = expand_to_names(left_subindices, merged_label_sets)
        right_subnames = expand_to_names(right_subindices, merged_label_sets)
        print >> out, 'clade intersections with a subsplit:'
        report_intersections(left_subnames, ('the left side intersects',))
        report_intersections(right_subnames, ('the right side intersects',))
        print >> out
    # show debug info
    debug_sections = (
            ('archaea names:', archaea_names),
            ('bacteria names:', bacteria_names),
            ('eukaryota names:', eukaryota_names))
    for title, names in debug_sections:
        print >> out, title
        print >> out, '\n'.join(sorted(names))
        print >> out
    # return the response with surrounding whitespace stripped
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
Esempio n. 20
0
class TreeSearch:
    """
    This is a virtual base class.
    A search repeatedly samples branch lengths for a tree until the spectral
    splits of the sampled tree meet the requirements chosen by the user.
    The user-defined requirements and the subclass-defined search options
    must all be set to something other than None before do_search is called.
    """

    def __init__(self):
        # boolean requirements defined by the user
        self.informative_children = None
        self.force_difference = None
        self.informative_full_split = None
        self.invalid_dendrogram = None
        # search options defined by the subclass
        self.tree = None
        self.desired_primary_split = None
        self.id_to_index = None
        # initialize the counts that are tracked for bookkeeping
        self.aug_split_collision_count = 0
        self.aug_split_degenerate_count = 0
        self.error_primary_split_count = 0
        self.invalid_primary_split_count = 0
        self.degenerate_primary_split_count = 0
        self.undesired_primary_split_count = 0
        self.desired_primary_split_count = 0
        self.uninformative_child_count = 0
        self.informative_child_count = 0
        self.valid_dendrogram_count = 0
        self.success_count = 0

    def is_initialized(self):
        """
        @return: True if every requirement and search option has been set
        """
        required_data = (
                self.informative_children,
                self.force_difference,
                self.informative_full_split,
                self.invalid_dendrogram,
                self.tree,
                self.desired_primary_split,
                self.id_to_index)
        # use an identity test so that falsy values such as False count as set
        # and so that no user-defined equality comparison is invoked
        return all(x is not None for x in required_data)

    def get_result_text(self):
        """
        Summarize the bookkeeping counts, showing only the sections that are
        relevant to the requirements that were enabled.
        @return: a multi-line string of text
        """
        out = StringIO()
        if self.force_difference or self.informative_full_split:
            print >> out, 'full graph split stats:'
            print >> out, self.aug_split_collision_count, 'full graph splits collided with the desired primary split'
            print >> out, self.aug_split_degenerate_count, 'full graph splits were degenerate'
            print >> out
        print >> out, 'primary split stats:'
        print >> out, self.error_primary_split_count, 'errors in finding the primary split (should be 0)'
        print >> out, self.invalid_primary_split_count, 'invalid primary splits (should be 0)'
        print >> out, self.degenerate_primary_split_count, 'degenerate primary splits'
        print >> out, self.undesired_primary_split_count, 'primary splits were not the target split'
        print >> out, self.desired_primary_split_count, 'primary splits were the target split'
        print >> out
        if self.informative_children:
            print >> out, 'secondary split stats:'
            print >> out, self.uninformative_child_count, 'samples had at least one uninformative child tree'
            print >> out, self.informative_child_count, 'samples had two informative child trees'
            print >> out
        if self.invalid_dendrogram:
            print >> out, 'naive dendrogram stats:'
            print >> out, self.valid_dendrogram_count, 'naive dendrograms were valid'
            print >> out
        return out.getvalue().strip()

    def do_search(self, nseconds, sampling_function):
        """
        Sample branch lengths until a sample meets every enabled requirement.
        The bookkeeping counts are updated as samples are rejected.
        @param nseconds: allowed search time or None
        @param sampling_function: a function that samples a branch length
        @return: True if a tree was found that met the criteria
        @raise RuntimeError: if the search is not initialized, or if the
        primary split of a sample is not a nontrivial split of the tree
        """
        if not self.is_initialized():
            raise RuntimeError('the search was not sufficiently initialized')
        true_splits = self.tree.get_nontrivial_splits()
        start_time = time.time()
        while True:
            # a falsy time limit (None or 0) means that the search is unbounded
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                return False
            # assign new sampled branch lengths
            for branch in self.tree.get_branches():
                branch.length = sampling_function()
            # get the distance matrix so we can use a library function to get the split
            D = np.array(self.tree.get_distance_matrix())
            ntips = len(D)
            # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
            if self.force_difference or self.informative_full_split:
                A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
                L_aug = Euclid.adjacency_to_laplacian(A_aug)
                v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
                left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
                # keep only the indices below ntips on each side of the split;
                # the index set is built once instead of once per element
                tip_indices = set(range(ntips))
                left = [x for x in left_aug if x in tip_indices]
                right = [x for x in right_aug if x in tip_indices]
                leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
                if self.force_difference:
                    if leaf_eigensplit_aug == self.desired_primary_split:
                        self.aug_split_collision_count += 1
                        continue
                if self.informative_full_split:
                    if min(len(s) for s in leaf_eigensplit_aug) < 2:
                        self.aug_split_degenerate_count += 1
                        continue
            # get the eigensplit
            try:
                eigensplit = BuildTreeTopology.split_using_eigenvector(D)
            except BuildTreeTopology.DegenerateSplitException:
                self.degenerate_primary_split_count += 1
                continue
            except BuildTreeTopology.InvalidSpectralSplitException:
                self.error_primary_split_count += 1
                continue
            if eigensplit not in true_splits:
                # record the invalid split before reporting the offending tree
                self.invalid_primary_split_count += 1
                raise RuntimeError('INVALID SPLIT:' + self.tree.get_newick_string())
            if eigensplit != self.desired_primary_split:
                self.undesired_primary_split_count += 1
                continue
            self.desired_primary_split_count += 1
            # check the splits of the two child trees;
            # a child split is degenerate when one of its sides has fewer than two elements
            degenerate_subsplit_count = 0
            L = Euclid.edm_to_laplacian(D)
            for side in eigensplit:
                L_child = SchurAlgebra.mmerge(L, side)
                v = BuildTreeTopology.laplacian_to_fiedler(L_child)
                child_eigensplit = BuildTreeTopology.eigenvector_to_split(v)
                if min(len(s) for s in child_eigensplit) < 2:
                    degenerate_subsplit_count += 1
            if degenerate_subsplit_count:
                self.uninformative_child_count += 1
            else:
                self.informative_child_count += 1
            if self.informative_children and degenerate_subsplit_count:
                continue
            # check the dendrogram;
            # a sample is rejected when the naive dendrogram recovers the true splits
            if self.invalid_dendrogram:
                labels = range(len(D))
                hierarchy = Dendrogram.get_hierarchy(D, Dendrogram.spectral_split, labels)
                dendrogram_splits = set(Dendrogram.hierarchy_to_nontrivial_splits(hierarchy))
                if dendrogram_splits == true_splits:
                    self.valid_dendrogram_count += 1
                    continue
            # the tree has met all of the requirements
            return True
Esempio n. 21
0
def get_response_content(fs):
    """
    Look for random graphs that are split incompatibly by a cut of a
    Schur complement.
    For each connected random graph, each vertex in turn is removed by a
    Schur complement, the reduced graph is cut, and the removed vertex is
    tried on both sides of the cut. The cut counts as valid when at least
    one of the two assignments gives two connected subgraphs.
    @param fs: form data providing ngraphs, nvertices, pedge,
    and the fiedler_cut and random_cut options
    @return: a multi-line string reporting the invalid splits and the counts
    """
    out = StringIO()
    n_unconnected = 0
    n_invalid_splits = 0
    n_valid_splits = 0
    for _ in range(fs.ngraphs):
        G = erdos_renyi(fs.nvertices, fs.pedge)
        if not is_connected(G):
            n_unconnected += 1
            continue
        # add interesting edge weights
        add_exponential_weights(G)
        # turn the adjacency matrix into a laplacian matrix
        L = Euclid.adjacency_to_laplacian(G)
        for v in range(fs.nvertices):
            # map each index of the reduced graph to its index in the full graph
            small_index_to_big_index = dict(enumerate(
                    i for i in range(fs.nvertices) if i != v))
            # take the schur complement with respect to the given vertex
            L_reduced = get_single_element_schur_complement(L, v)
            assert len(L_reduced) == len(L) - 1
            # get the loadings of the vertices of the reduced graph
            if fs.fiedler_cut:
                Y_reduced = BuildTreeTopology.laplacian_to_fiedler(L_reduced)
            elif fs.random_cut:
                Y_reduced = get_random_vector(L_reduced)
            assert len(Y_reduced) == len(L_reduced)
            # try the removed vertex with a negative and a positive valuation
            found_valid_split = False
            for augmented_loading in (-1.0, 1.0):
                # get the augmented split vector for this assignment of the removed vertex
                Y_full = [0] * len(G)
                for i_small, loading in enumerate(Y_reduced):
                    Y_full[small_index_to_big_index[i_small]] = loading
                Y_full[v] = augmented_loading
                assert len(Y_full) == len(G)
                # get the two graphs defined by the split
                subgraph_a, subgraph_b = list(gen_subgraphs(G, Y_full))
                # if the subgraphs are both connected then the split is valid
                if is_connected(subgraph_a) and is_connected(subgraph_b):
                    found_valid_split = True
            if found_valid_split:
                n_valid_splits += 1
                continue
            # a valid split was not found so show the matrix
            print >> out, 'Found a matrix that was split incompatibly by a cut of its schur complement!'
            print >> out, 'matrix:'
            print >> out, MatrixUtil.m_to_string(G)
            print >> out, 'index that was removed:', v
            n_invalid_splits += 1
    # show the number of connected and of unconnected graphs
    print >> out, 'this many random graphs were connected:', fs.ngraphs - n_unconnected
    print >> out, 'this many random graphs were not connected:', n_unconnected
    print >> out, 'this many splits were valid:', n_valid_splits
    print >> out, 'this many splits were invalid:', n_invalid_splits
    # return the result
    return out.getvalue()