Exemple #1
0
 def get_verbose_summary(self):
     """
     @return: a multiline string
     """
     # begin the response
     out = StringIO()
     # show the number of taxa in various domains
     print >> out, self._get_name_summary()
     print >> out
     # show the pruned full tree
     formatted_tree_string = NewickIO.get_narrow_newick_string(self.pruned_tree, 120) 
     print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> out, formatted_tree_string
     print >> out
     # split the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # report the eigendecomposition
     print >> out, get_eigendecomposition_report(D)
     print >> out
     # report the clade intersections of sides of the split
     side_names = [set(self.pruned_names[i] for i in side) for side in eigensplit]
     print >> out, 'domains represented by each side of the primary split:'
     print >> out, 'the left side has:\t', ', '.join(self._get_domains(side_names[0]))
     print >> out, 'the right side has:\t', ', '.join(self._get_domains(side_names[1]))
     print >> out
     # prepare to do the secondary splits
     left_indices, right_indices = eigensplit
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     # do the secondary splits
     for index_selection, index_complement in ((left_indices, right_indices), (right_indices, left_indices)):
         L_secondary = SchurAlgebra.mmerge(L, index_complement)
         next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
         v = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
         left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
         left_sublabels = set()
         for i in left_subindices:
             left_sublabels.update(next_label_sets[i])
         right_sublabels = set()
         for i in right_subindices:
             right_sublabels.update(next_label_sets[i])
         left_subnames = set(self.pruned_names[i] for i in left_sublabels)
         right_subnames = set(self.pruned_names[i] for i in right_sublabels)
         print >> out, 'domains represented by a subsplit:'
         print >> out, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
         print >> out, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
         print >> out
     # return the multiline string
     return out.getvalue().strip()
Exemple #2
0
 def _do_analysis(self, use_generalized_nj):
     """
     Do some splits of the tree.
     @param use_generalized_nj: True if we use an old method of outgrouping
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # assert that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains([self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains([self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(self.pruned_names, L, v)
     # define the bacteria indices vs the non-bacteria indices for the second split
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
         non_bacteria_indices = right_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
         non_bacteria_indices = left_indices
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # assert that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains([pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains([pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
         raise HandlingError('eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(pruned_names_secondary, L_secondary, v_secondary)
Exemple #3
0
def get_eigendecomposition_report(D):
    """
    @param D: a distance matrix
    @return: a multi-line string
    """
    out = StringIO()
    # get some intermediate matrices and vectors
    L = Euclid.edm_to_laplacian(D)
    laplacian_fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    distance_fiedler = BuildTreeTopology.edm_to_fiedler(D)
    eigensplit = BuildTreeTopology.eigenvector_to_split(laplacian_fiedler)
    # report the two eigenvalue lists that should be the same
    HDH = MatrixUtil.double_centered(D)
    HSH = -0.5 * HDH
    w_distance, vt_distance = np.linalg.eigh(HSH)
    print >> out, 'the laplacian-derived and distance-derived eigenvalues:'
    w_laplacian, vt_laplacian = np.linalg.eigh(L)
    for a, b in zip(sorted(w_laplacian), sorted(w_distance)):
        print >> out, a, '\t', b
    print >> out
    # report the two fiedler vectors that should be the same
    print >> out, 'the laplacian-derived and distance-derived fiedler vectors:'
    for a, b in zip(laplacian_fiedler, distance_fiedler):
        print >> out, a, '\t', b
    return out.getvalue().strip()
Exemple #4
0
def get_eigendecomposition_report(D):
    """
    @param D: a distance matrix
    @return: a multi-line string
    """
    out = StringIO()
    # get some intermediate matrices and vectors
    L = Euclid.edm_to_laplacian(D)
    laplacian_fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    distance_fiedler = BuildTreeTopology.edm_to_fiedler(D)
    eigensplit = BuildTreeTopology.eigenvector_to_split(laplacian_fiedler)
    # report the two eigenvalue lists that should be the same
    HDH = MatrixUtil.double_centered(D)
    HSH = -0.5 * HDH
    w_distance, vt_distance = np.linalg.eigh(HSH)
    print >> out, 'the laplacian-derived and distance-derived eigenvalues:'
    w_laplacian, vt_laplacian = np.linalg.eigh(L)
    for a, b in zip(sorted(w_laplacian), sorted(w_distance)):
        print >> out, a, '\t', b
    print >> out
    # report the two fiedler vectors that should be the same
    print >> out, 'the laplacian-derived and distance-derived fiedler vectors:'
    for a, b in zip(laplacian_fiedler, distance_fiedler):
        print >> out, a, '\t', b
    return out.getvalue().strip()
Exemple #5
0
def spectral_split(D):
    """
    @param D: numpy distance matrix
    @return: a split of the indices of the distance matrix
    """
    v = BuildTreeTopology.edm_to_fiedler(D)
    split = BuildTreeTopology.eigenvector_to_split(v)
    # if one side of the split is empty then there is a failure
    if frozenset() in split:
        raise ValueError('failed split')
    return split
Exemple #6
0
def get_response_content(fs):
    out = StringIO()
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # assert that each node is named
    for node in tree.preorder():
        if not node.name:
            raise HandlingError('each node in the tree must have a name')
    # get the function that converts a matrix to a string
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # print the results for the split of the full tree
    print >> out, get_full_tree_message(tree, m_to_string)
    print >> out
    # get the alphabetically ordered names of the tips
    ordered_tip_names = list(sorted(tip.get_name() for tip in tree.gen_tips()))
    # get the corresponding ordered ids
    tip_name_to_id = dict((tip.get_name(), id(tip)) for tip in tree.gen_tips())
    ordered_tip_ids = [tip_name_to_id[name] for name in ordered_tip_names]
    # get the distance matrix defined by the tips of the tree
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    #print >> out, 'the Laplacian obtained from the full tree by Schur complementation:'
    #print >> out, MatrixUtil.m_to_string(L)
    #print >> out
    print >> out, 'the Schur complement in the Laplacian of the full tree scaled by', fs.scaling_factor
    print >> out, m_to_string(fs.scaling_factor * L)
    print >> out
    #L_merged = SchurAlgebra.mmerge(L, set([3,4,5]))
    #print >> out, 'the merged Laplacian:'
    #print >> out, MatrixUtil.m_to_string(L_merged)
    #print >> out
    # get the Fiedler cut of the Schur Laplacian
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    print >> out, 'the Fiedler split of the Schur complement of the full tree:'
    for name, value in zip(ordered_tip_names, v):
        print >> out, name, ':', value
    print >> out
    # get the Fiedler cuts of Schur complements of child trees
    print >> out, get_child_messages(L, eigensplit, ordered_tip_names,
                                     m_to_string, fs.scaling_factor)
    print >> out
    # get the Fiedler cuts of Schur complements of subtrees
    print >> out, get_subtree_messages(D, eigensplit, ordered_tip_names)
    # return the response
    return out.getvalue()
Exemple #7
0
def get_response_content(fs):
    out = StringIO()
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # assert that each node is named
    for node in tree.preorder():
        if not node.name:
            raise HandlingError('each node in the tree must have a name')
    # get the function that converts a matrix to a string
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # print the results for the split of the full tree
    print >> out, get_full_tree_message(tree, m_to_string)
    print >> out
    # get the alphabetically ordered names of the tips
    ordered_tip_names = list(sorted(tip.get_name() for tip in tree.gen_tips()))
    # get the corresponding ordered ids
    tip_name_to_id = dict((tip.get_name(), id(tip)) for tip in tree.gen_tips())
    ordered_tip_ids = [tip_name_to_id[name] for name in ordered_tip_names]
    # get the distance matrix defined by the tips of the tree
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    #print >> out, 'the Laplacian obtained from the full tree by Schur complementation:'
    #print >> out, MatrixUtil.m_to_string(L)
    #print >> out
    print >> out, 'the Schur complement in the Laplacian of the full tree scaled by', fs.scaling_factor
    print >> out, m_to_string(fs.scaling_factor * L)
    print >> out
    #L_merged = SchurAlgebra.mmerge(L, set([3,4,5]))
    #print >> out, 'the merged Laplacian:'
    #print >> out, MatrixUtil.m_to_string(L_merged)
    #print >> out
    # get the Fiedler cut of the Schur Laplacian
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    print >> out, 'the Fiedler split of the Schur complement of the full tree:'
    for name, value in zip(ordered_tip_names, v):
        print >> out, name, ':', value
    print >> out
    # get the Fiedler cuts of Schur complements of child trees
    print >> out, get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, fs.scaling_factor)
    print >> out
    # get the Fiedler cuts of Schur complements of subtrees
    print >> out, get_subtree_messages(D, eigensplit, ordered_tip_names)
    # return the response
    return out.getvalue()
Exemple #8
0
 def do_search(self, nseconds, sampling_function):
     """
     @param nseconds: allowed search time or None
     @param sampling_function: a function that samples a branch length
     @return: True if a tree was found that met the criteria
     """
     if not self.is_initialized():
         raise RuntimeError("the search was not sufficiently initialized")
     true_splits = self.tree.get_nontrivial_splits()
     start_time = time.time()
     while True:
         elapsed_time = time.time() - start_time
         if nseconds and elapsed_time > nseconds:
             return False
         # assign new sampled branch lengths
         for branch in self.tree.get_branches():
             branch.length = sampling_function()
         # get the distance matrix so we can use a library function to get the split
         D = np.array(self.tree.get_distance_matrix())
         ntips = len(D)
         # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
         if self.force_difference or self.informative_full_split:
             A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
             L_aug = Euclid.adjacency_to_laplacian(A_aug)
             v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
             left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
             left = [x for x in left_aug if x in range(ntips)]
             right = [x for x in right_aug if x in range(ntips)]
             leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
             if self.force_difference:
                 if leaf_eigensplit_aug == self.desired_primary_split:
                     self.aug_split_collision_count += 1
                     continue
             if self.informative_full_split:
                 if min(len(s) for s in leaf_eigensplit_aug) < 2:
                     self.aug_split_degenerate_count += 1
                     continue
         # get the eigensplit
         try:
             eigensplit = BuildTreeTopology.split_using_eigenvector(D)
         except BuildTreeTopology.DegenerateSplitException, e:
             self.degenerate_primary_split_count += 1
             continue
         except BuildTreeTopology.InvalidSpectralSplitException, e:
             self.error_primary_split_count += 1
             continue
Exemple #9
0
 def do_search(self, nseconds, sampling_function):
     """
     @param nseconds: allowed search time or None
     @param sampling_function: a function that samples a branch length
     @return: True if a tree was found that met the criteria
     """
     if not self.is_initialized():
         raise RuntimeError('the search was not sufficiently initialized')
     true_splits = self.tree.get_nontrivial_splits()
     start_time = time.time()
     while True:
         elapsed_time = time.time() - start_time
         if nseconds and elapsed_time > nseconds:
             return False
         # assign new sampled branch lengths
         for branch in self.tree.get_branches():
             branch.length = sampling_function()
         # get the distance matrix so we can use a library function to get the split
         D = np.array(self.tree.get_distance_matrix())
         ntips = len(D)
         # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
         if self.force_difference or self.informative_full_split:
             A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
             L_aug = Euclid.adjacency_to_laplacian(A_aug)
             v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
             left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
             left = [x for x in left_aug if x in range(ntips)]
             right = [x for x in right_aug if x in range(ntips)]
             leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
             if self.force_difference:
                 if leaf_eigensplit_aug == self.desired_primary_split:
                     self.aug_split_collision_count += 1
                     continue
             if self.informative_full_split:
                 if min(len(s) for s in leaf_eigensplit_aug) < 2:
                     self.aug_split_degenerate_count += 1
                     continue
         # get the eigensplit
         try:
             eigensplit = BuildTreeTopology.split_using_eigenvector(D)
         except BuildTreeTopology.DegenerateSplitException, e:
             self.degenerate_primary_split_count += 1
             continue
         except BuildTreeTopology.InvalidSpectralSplitException, e:
             self.error_primary_split_count += 1
             continue
Exemple #10
0
def get_standard_response(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    # begin the response
    out = StringIO()
    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    print >> out, len(archaea_names), 'archaea names in the original tree'
    print >> out, len(bacteria_names), 'bacteria names in the original tree'
    print >> out, len(eukaryota_names), 'eukaryota names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # get the pruned full tree
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = list(node.get_name()
                         for node in pruned_full_tree.gen_tips())
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(
        ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    formatted_tree_string = NewickIO.get_narrow_newick_string(
        pruned_full_tree, 120)
    print >> out, formatted_tree_string
    print >> out
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report the clade intersections of sides of the split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    clade_name_pairs = ((archaea_names, 'archaea'),
                        (bacteria_names, 'bacteria'), (eukaryota_names,
                                                       'eukaryota'))
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # prepare to do the secondary splits
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    # get a secondary split
    for index_selection, index_complement in ((left_indices, right_indices),
                                              (right_indices, left_indices)):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                              index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(
            v)
        left_sublabels = set()
        for i in left_subindices:
            left_sublabels.update(next_label_sets[i])
        right_sublabels = set()
        for i in right_subindices:
            right_sublabels.update(next_label_sets[i])
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        print >> out, 'clade intersections with a subsplit:'
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                print >> out, 'the left side intersects', clade_name
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                print >> out, 'the right side intersects', clade_name
        print >> out
    # show debug info
    print >> out, 'archaea names:'
    print >> out, '\n'.join(x for x in sorted(archaea_names))
    print >> out
    print >> out, 'bacteria names:'
    print >> out, '\n'.join(x for x in sorted(bacteria_names))
    print >> out
    print >> out, 'eukaryota names:'
    print >> out, '\n'.join(x for x in sorted(eukaryota_names))
    print >> out
    # return the response
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
Exemple #11
0
 def get_verbose_summary(self):
     """
     @return: a multiline string
     """
     # begin the response
     out = StringIO()
     # show the number of taxa in various domains
     print >> out, self._get_name_summary()
     print >> out
     # show the pruned full tree
     formatted_tree_string = NewickIO.get_narrow_newick_string(
         self.pruned_tree, 120)
     print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> out, formatted_tree_string
     print >> out
     # split the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # report the eigendecomposition
     print >> out, get_eigendecomposition_report(D)
     print >> out
     # report the clade intersections of sides of the split
     side_names = [
         set(self.pruned_names[i] for i in side) for side in eigensplit
     ]
     print >> out, 'domains represented by each side of the primary split:'
     print >> out, 'the left side has:\t', ', '.join(
         self._get_domains(side_names[0]))
     print >> out, 'the right side has:\t', ', '.join(
         self._get_domains(side_names[1]))
     print >> out
     # prepare to do the secondary splits
     left_indices, right_indices = eigensplit
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     # do the secondary splits
     for index_selection, index_complement in ((left_indices,
                                                right_indices),
                                               (right_indices,
                                                left_indices)):
         L_secondary = SchurAlgebra.mmerge(L, index_complement)
         next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                               index_complement)
         v = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
         left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(
             v)
         left_sublabels = set()
         for i in left_subindices:
             left_sublabels.update(next_label_sets[i])
         right_sublabels = set()
         for i in right_subindices:
             right_sublabels.update(next_label_sets[i])
         left_subnames = set(self.pruned_names[i] for i in left_sublabels)
         right_subnames = set(self.pruned_names[i] for i in right_sublabels)
         print >> out, 'domains represented by a subsplit:'
         print >> out, 'the left side has:\t', ', '.join(
             self._get_domains(left_subnames))
         print >> out, 'the right side has:\t', ', '.join(
             self._get_domains(right_subnames))
         print >> out
     # return the multiline string
     return out.getvalue().strip()
Exemple #12
0
 def _do_analysis(self, use_generalized_nj):
     """
     Do some splits of the tree.
     @param use_generalized_nj: True if we use an old method of outgrouping
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # assert that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains(
         [self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains(
         [self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(
         self.pruned_names, L, v)
     # define the bacteria indices vs the non-bacteria indices for the second split
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
         non_bacteria_indices = right_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
         non_bacteria_indices = left_indices
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(
             D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                           bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(
         v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # assert that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota'
                                              in right_subdomains):
         raise HandlingError(
             'eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(
         pruned_names_secondary, L_secondary, v_secondary)
Exemple #13
0
class TreeSearch:
    """
    This is a virtual base class.
    """

    def __init__(self):
        # boolean requirements defined by the user
        self.informative_children = None
        self.force_difference = None
        self.informative_full_split = None
        self.invalid_dendrogram = None
        # search options defined by the subclass
        self.tree = None
        self.desired_primary_split = None
        self.id_to_index = None
        # initialize the counts that are tracked for bookkeeping
        self.aug_split_collision_count = 0
        self.aug_split_degenerate_count = 0
        self.error_primary_split_count = 0
        self.invalid_primary_split_count = 0
        self.degenerate_primary_split_count = 0
        self.undesired_primary_split_count = 0
        self.desired_primary_split_count = 0
        self.uninformative_child_count = 0
        self.informative_child_count = 0
        self.valid_dendrogram_count = 0
        self.success_count = 0

    def is_initialized(self):
        required_data = [
                self.informative_children,
                self.force_difference,
                self.informative_full_split,
                self.invalid_dendrogram,
                self.tree,
                self.desired_primary_split,
                self.id_to_index]
        return not (None in required_data)

    def get_result_text(self):
        """
        @return: a multi-line string of text
        """
        out = StringIO()
        if self.force_difference or self.informative_full_split:
            print >> out, 'full graph split stats:'
            print >> out, self.aug_split_collision_count,
            print >> out, 'full graph splits collided with the desired primary split'
            print >> out, self.aug_split_degenerate_count,
            print >> out, 'full graph splits were degenerate'
            print >> out
        print >> out, 'primary split stats:'
        print >> out, self.error_primary_split_count,
        print >> out, 'errors in finding the primary split (should be 0)'
        print >> out, self.invalid_primary_split_count,
        print >> out, 'invalid primary splits (should be 0)'
        print >> out, self.degenerate_primary_split_count,
        print >> out, 'degenerate primary splits'
        print >> out, self.undesired_primary_split_count,
        print >> out, 'primary splits were not the target split'
        print >> out, self.desired_primary_split_count,
        print >> out, 'primary splits were the target split'
        print >> out
        if self.informative_children:
            print >> out, 'secondary split stats:'
            print >> out, self.uninformative_child_count,
            print >> out, 'samples had at least one uninformative child tree'
            print >> out, self.informative_child_count,
            print>> out, 'samples had two informative child trees'
            print >> out
        if self.invalid_dendrogram:
            print >> out, 'naive dendrogram stats:'
            print >> out, self.valid_dendrogram_count,
            print >> out, 'naive dendrograms were valid'
            print >> out
        return out.getvalue().strip()

    def do_search(self, nseconds, sampling_function):
        """
        @param nseconds: allowed search time or None
        @param sampling_function: a function that samples a branch length
        @return: True if a tree was found that met the criteria
        """
        if not self.is_initialized():
            raise RuntimeError('the search was not sufficiently initialized')
        true_splits = self.tree.get_nontrivial_splits()
        start_time = time.time()
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                return False
            # assign new sampled branch lengths
            for branch in self.tree.get_branches():
                branch.length = sampling_function()
            # get the distance matrix so we can use a library function to get the split
            D = np.array(self.tree.get_distance_matrix())
            ntips = len(D)
            # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
            if self.force_difference or self.informative_full_split:
                A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
                L_aug = Euclid.adjacency_to_laplacian(A_aug)
                v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
                left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
                left = [x for x in left_aug if x in range(ntips)]
                right = [x for x in right_aug if x in range(ntips)]
                leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
                if self.force_difference:
                    if leaf_eigensplit_aug == self.desired_primary_split:
                        self.aug_split_collision_count += 1
                        continue
                if self.informative_full_split:
                    if min(len(s) for s in leaf_eigensplit_aug) < 2:
                        self.aug_split_degenerate_count += 1
                        continue
            # get the eigensplit
            try:
                eigensplit = BuildTreeTopology.split_using_eigenvector(D)
            except BuildTreeTopology.DegenerateSplitException, e:
                self.degenerate_primary_split_count += 1
                continue
            except BuildTreeTopology.InvalidSpectralSplitException, e:
                self.error_primary_split_count += 1
                continue
            if eigensplit not in true_splits:
                raise RuntimeError('INVALID SPLIT:' + tree.get_newick_string())
            if eigensplit != self.desired_primary_split:
                self.undesired_primary_split_count += 1
                continue
            self.desired_primary_split_count += 1
            # check the splits of the two child trees
            degenerate_subsplit_count = 0
            L = Euclid.edm_to_laplacian(D)
            for side in eigensplit:
                L_child = SchurAlgebra.mmerge(L, side)
                v = BuildTreeTopology.laplacian_to_fiedler(L_child)
                child_eigensplit = BuildTreeTopology.eigenvector_to_split(v)
                if min(len(s) for s in child_eigensplit) < 2:
                    degenerate_subsplit_count += 1
            if degenerate_subsplit_count:
                self.uninformative_child_count += 1
            else:
                self.informative_child_count += 1
            if self.informative_children:
                if degenerate_subsplit_count:
                    continue
            # check the dendrogram
            if self.invalid_dendrogram:
                labels = range(len(D))
                hierarchy = Dendrogram.get_hierarchy(D, Dendrogram.spectral_split, labels)
                dendrogram_splits = set(Dendrogram.hierarchy_to_nontrivial_splits(hierarchy))
                if dendrogram_splits == true_splits:
                    self.valid_dendrogram_count += 1
                    continue
            # the tree has met all of the requirements
            return True
Exemple #14
0
def get_standard_response(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    # begin the response
    out = StringIO()
    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    print >> out, len(archaea_names), 'archaea names in the original tree'
    print >> out, len(bacteria_names), 'bacteria names in the original tree'
    print >> out, len(eukaryota_names), 'eukaryota names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # get the pruned full tree
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = list(node.get_name() for node in pruned_full_tree.gen_tips())
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    formatted_tree_string = NewickIO.get_narrow_newick_string(pruned_full_tree, 120) 
    print >> out, formatted_tree_string
    print >> out
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report the clade intersections of sides of the split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    clade_name_pairs = ((archaea_names, 'archaea'), (bacteria_names, 'bacteria'), (eukaryota_names, 'eukaryota'))
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # prepare to do the secondary splits
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    # get a secondary split
    for index_selection, index_complement in ((left_indices, right_indices), (right_indices, left_indices)):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
        left_sublabels = set()
        for i in left_subindices:
            left_sublabels.update(next_label_sets[i])
        right_sublabels = set()
        for i in right_subindices:
            right_sublabels.update(next_label_sets[i])
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        print >> out, 'clade intersections with a subsplit:'
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                print >> out, 'the left side intersects', clade_name
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                print >> out, 'the right side intersects', clade_name
        print >> out
    # show debug info
    print >> out, 'archaea names:'
    print >> out, '\n'.join(x for x in sorted(archaea_names))
    print >> out
    print >> out, 'bacteria names:'
    print >> out, '\n'.join(x for x in sorted(bacteria_names))
    print >> out
    print >> out, 'eukaryota names:'
    print >> out, '\n'.join(x for x in sorted(eukaryota_names))
    print >> out
    # return the response
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text