Example #1
0
 def evaluate(self, true_splits, D_estimated):
     """
     Store the true splits on this object and run the topology builder.
     The value returned by the builder is discarded;
     self.on_label_split is handed to the builder as its fourth argument.
     @param true_splits: the set of all full splits implied by the true tree
     @param D_estimated: the estimated distance matrix
     """
     # keep the reference splits on the instance
     self.true_splits = true_splits
     # name the pieces that are passed to the builder
     splitter = self.split_function
     updater = BuildTreeTopology.update_using_laplacian
     callback = self.on_label_split
     BuildTreeTopology.get_splits(D_estimated, splitter, updater, callback)
Example #2
0
 def evaluate(self, true_splits, D_estimated, atteson, use_nj, use_modified_nj, use_all_spectral, use_one_spectral):
     """
     Reconstruct splits with each enabled method and record one scatter point.
     The error of a method that is not enabled is recorded as None.
     @param true_splits: the set of all full splits implied by the true tree
     @param D_estimated: the estimated distance matrix
     @param atteson: True iff the distance matrix is Atteson
     @param use_nj: if true then use split_nj with update_nj
     @param use_modified_nj: if true then use split_nj with update_using_laplacian
     @param use_all_spectral: if true then use the eigenvector split with nj fallback
     @param use_one_spectral: if true then use a SplitFunctor and an UpdateFunctor
     """
     def get_error(split_function, update_function):
         # build the splits and measure their distance to the true splits
         splits = BuildTreeTopology.get_splits(D_estimated, split_function, update_function)
         return Xtree.splits_to_rf_distance(splits, true_splits)
     # plain neighbor joining
     nj_error = None
     if use_nj:
         nj_error = get_error(BuildTreeTopology.split_nj, BuildTreeTopology.update_nj)
     # neighbor joining splits with the laplacian based update
     modified_nj_error = None
     if use_modified_nj:
         modified_nj_error = get_error(BuildTreeTopology.split_nj, BuildTreeTopology.update_using_laplacian)
     # spectral splits at every step
     all_spectral_error = None
     if use_all_spectral:
         all_spectral_error = get_error(
                 BuildTreeTopology.split_using_eigenvector_with_nj_fallback,
                 BuildTreeTopology.update_using_laplacian)
     # functors that are constructed from the size of the matrix
     one_spectral_error = None
     if use_one_spectral:
         one_spectral_error = get_error(
                 SplitFunctor(len(D_estimated)),
                 UpdateFunctor(len(D_estimated)))
     # save the data point
     point = ScatterPoint(atteson, nj_error, modified_nj_error, all_spectral_error, one_spectral_error)
     self.scatter_points.append(point)
Example #3
0
def get_eigendecomposition_report(D):
    """
    Tabulate eigenvalues and fiedler vectors derived in two different ways.
    One set of values comes from the laplacian obtained from D
    and the other comes from -0.5 times the doubly centered D.
    @param D: a distance matrix
    @return: a multi-line string
    """
    report = StringIO()
    # derive the laplacian and both versions of the fiedler vector
    laplacian = Euclid.edm_to_laplacian(D)
    fiedler_from_laplacian = BuildTreeTopology.laplacian_to_fiedler(laplacian)
    fiedler_from_distance = BuildTreeTopology.edm_to_fiedler(D)
    # the split is not reported but the call is still made
    BuildTreeTopology.eigenvector_to_split(fiedler_from_laplacian)
    # only the eigenvalues of each decomposition are used
    distance_eigenvalues = np.linalg.eigh(-0.5 * MatrixUtil.double_centered(D))[0]
    print >> report, 'the laplacian-derived and distance-derived eigenvalues:'
    laplacian_eigenvalues = np.linalg.eigh(laplacian)[0]
    # show the two sorted eigenvalue lists side by side
    for pair in zip(sorted(laplacian_eigenvalues), sorted(distance_eigenvalues)):
        print >> report, pair[0], '\t', pair[1]
    print >> report
    # show the two fiedler vectors side by side
    print >> report, 'the laplacian-derived and distance-derived fiedler vectors:'
    for pair in zip(fiedler_from_laplacian, fiedler_from_distance):
        print >> report, pair[0], '\t', pair[1]
    return report.getvalue().strip()
Example #4
0
 def __call__(self, D):
     """
     Split with neighbor joining when the matrix is small and spectrally otherwise.
     A matrix is small when it has fewer than self.large_matrix_size rows.
     @param D: the distance matrix
     @return: a set of two index sets defining a split of the indices
     """
     # pick the split function by matrix size and then apply it
     is_small = len(D) < self.large_matrix_size
     if is_small:
         splitter = BuildTreeTopology.split_nj
     else:
         splitter = BuildTreeTopology.split_using_eigenvector_with_nj_fallback
     return splitter(D)
Example #5
0
 def __call__(self, D, index_set):
     """
     Update with neighbor joining when the matrix is small and with the laplacian otherwise.
     A matrix is small when it has fewer than self.large_matrix_size rows.
     @param D: the distance matrix
     @param index_set: the subset of indices that will be removed from the updated distance matrix
     @return: an updated distance matrix
     """
     # pick the update function by matrix size and then apply it
     is_small = len(D) < self.large_matrix_size
     if is_small:
         updater = BuildTreeTopology.update_nj
     else:
         updater = BuildTreeTopology.update_using_laplacian
     return updater(D, index_set)
Example #6
0
def do_it_right(D):
    """
    Run neighbor joining and collect the splits in the order they are made.
    @param D: distance matrix
    @return: a sequence of splits
    """
    # the saver is handed to the builder as its fourth argument
    recorder = SplitSaver()
    nj_splitter = BuildTreeTopology.split_nj
    nj_updater = BuildTreeTopology.update_nj
    BuildTreeTopology.get_splits(D, nj_splitter, nj_updater, recorder)
    # the value returned by get_splits is ignored in favor of the saved splits
    return recorder.splits
Example #7
0
 def _do_analysis(self, use_generalized_nj):
     """
     Do the primary spectral split of the tree and then a secondary split.
     The first split is required to put all bacteria on a single side,
     and the second split is required to put all eukaryota on a single side.
     The results are saved as self.first_split_object and self.second_split_object.
     @param use_generalized_nj: True if we use an old method of outgrouping
     @raise HandlingError: if bacteria are on both sides or on neither side
     of the first split, or if eukaryota are on both sides of the second split
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # require that the first split does not put bacteria on both sides
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains([self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains([self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(self.pruned_names, L, v)
     # find the side of the first split that holds the bacteria
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
     else:
         # without this branch the name would be unbound in the code below
         raise HandlingError('no bacteria were found on either side of the first split')
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     # merge the labels of the bacteria so they are conformant to the secondary laplacian
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     # a label set of size one keeps its taxon name and any other set is named all-bacteria
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # require that the second split does not put eukaryota on both sides
     left_subdomains = self._get_domains([pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains([pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
         raise HandlingError('eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(pruned_names_secondary, L_secondary, v_secondary)
Example #8
0
 def get_verbose_summary(self):
     """
     Describe the taxa, the pruned tree, the primary split, and both secondary splits.
     @return: a multiline string
     """
     buf = StringIO()
     # summarize the taxon names
     print >> buf, self._get_name_summary()
     print >> buf
     # show the pruned tree as a narrow newick string
     newick_text = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
     print >> buf, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> buf, newick_text
     print >> buf
     # get the primary split from the fiedler vector of the laplacian
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     laplacian = Euclid.edm_to_laplacian(D)
     fiedler = BuildTreeTopology.laplacian_to_fiedler(laplacian)
     primary_split = BuildTreeTopology.eigenvector_to_split(fiedler)
     # show the eigendecomposition
     print >> buf, get_eigendecomposition_report(D)
     print >> buf
     # show the domains found on each side of the primary split
     names_per_side = []
     for side in primary_split:
         names_per_side.append(set(self.pruned_names[i] for i in side))
     print >> buf, 'domains represented by each side of the primary split:'
     print >> buf, 'the left side has:\t', ', '.join(self._get_domains(names_per_side[0]))
     print >> buf, 'the right side has:\t', ', '.join(self._get_domains(names_per_side[1]))
     print >> buf
     # each secondary split merges one side of the primary split
     left_indices, right_indices = primary_split
     singleton_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     for merged_side in (right_indices, left_indices):
         sub_laplacian = SchurAlgebra.mmerge(laplacian, merged_side)
         merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
         sub_fiedler = BuildTreeTopology.laplacian_to_fiedler(sub_laplacian)
         left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(sub_fiedler)
         # expand each side of the subsplit back to names of the original taxa
         subnames_per_side = []
         for subindices in (left_subindices, right_subindices):
             sublabels = set()
             for i in subindices:
                 sublabels.update(merged_label_sets[i])
             subnames_per_side.append(set(self.pruned_names[i] for i in sublabels))
         left_subnames, right_subnames = subnames_per_side
         print >> buf, 'domains represented by a subsplit:'
         print >> buf, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
         print >> buf, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
         print >> buf
     # the surrounding whitespace is removed from the summary
     return buf.getvalue().strip()
Example #9
0
def process(ntaxa, nseconds, branch_length_sampler):
    """
    Sample trees and check the eigenvector split of each sampled distance matrix.
    For each sampled tree whose matrix passes the is_atteson check,
    the eigenvector split is tested for membership in the nontrivial splits of the tree.
    The loop stops at the first split that is not a true split,
    and the matrix and tree of that sample are kept as the counterexample.
    NOTE(review): this block looks truncated; the handler of the outer try
    statement and the code that builds the returned summary are not visible here.
    @param ntaxa: the number of taxa in the sampled trees
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # initialize some state that will be tracked over the entire run
    degenerate_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    spectral_error_count = 0
    atteson_error_count = 0
    counterexample_D = None
    counterexample_tree = None
    # do a bunch of reconstructions from sampled distance matrices
    try:
        while True:
            # a false value of nseconds (None or zero) disables the time limit
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            # sample the atteson distance matrix
            D = sample_atteson_distance_matrix(tree)
            # a matrix that fails the atteson check is counted and otherwise skipped
            if not BuildTreeTopology.is_atteson(tree, D):
                atteson_error_count += 1
            else:
                try:
                    # see if the eigensplit is in the set of true splits
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    if eigensplit in true_splits:
                        valid_split_count += 1
                    else:
                        # save the first counterexample and stop sampling
                        invalid_split_count += 1
                        counterexample_D = D
                        counterexample_tree = tree
                        break
                # each kind of failed split is counted and sampling continues
                except BuildTreeTopology.DegenerateSplitException, e:
                    degenerate_count += 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    spectral_error_count += 1
Example #10
0
def get_response_content(fs):
    """
    Report Fiedler splits of a named tree and of Schur complements derived from it.
    @param fs: provides tree, plain_matrix, latex_matrix, and scaling_factor
    @return: a multi-line string
    """
    response = StringIO()
    # read the tree and require a name for every node
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    if not all(node.name for node in tree.preorder()):
        raise HandlingError('each node in the tree must have a name')
    # choose the matrix formatter
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # describe the split of the full tree
    print >> response, get_full_tree_message(tree, m_to_string)
    print >> response
    # order the tip names alphabetically and look up the matching node ids
    tips = list(tree.gen_tips())
    ordered_tip_names = sorted(tip.get_name() for tip in tips)
    tip_name_to_id = dict((tip.get_name(), id(tip)) for tip in tips)
    ordered_tip_ids = [tip_name_to_id[name] for name in ordered_tip_names]
    # get the laplacian from the distance matrix of the tips
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    factor = fs.scaling_factor
    print >> response, 'the Schur complement in the Laplacian of the full tree scaled by', factor
    print >> response, m_to_string(factor * L)
    print >> response
    # show the fiedler vector of this laplacian one tip per line
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    print >> response, 'the Fiedler split of the Schur complement of the full tree:'
    for name_and_value in zip(ordered_tip_names, fiedler):
        print >> response, name_and_value[0], ':', name_and_value[1]
    print >> response
    # describe the child trees on each side of the split
    print >> response, get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, factor)
    print >> response
    # describe the subtrees
    print >> response, get_subtree_messages(D, eigensplit, ordered_tip_names)
    return response.getvalue()
Example #11
0
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, scaling_factor):
    """
    Describe the merged Laplacian and its Fiedler vector for each side of a split.
    For each side of the split the complementary indices are merged.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    buf = StringIO()
    ntips = len(L)
    universe = set(range(ntips))
    singleton_label_sets = [set([i]) for i in range(ntips)]
    for index, child in enumerate(eigensplit):
        child_number = index + 1
        # merge every index that is not in this child
        rest = universe - child
        L_child = SchurAlgebra.mmerge(L, rest)
        print >> buf, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> buf, m_to_string(scaling_factor * L_child)
        print >> buf
        # label each entry of the fiedler vector of the child
        child_label_sets = SchurAlgebra.vmerge(singleton_label_sets, rest)
        v_child = BuildTreeTopology.laplacian_to_fiedler(L_child)
        print >> buf, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for labels_and_value in zip(child_label_sets, v_child):
            description = label_set_to_string(labels_and_value[0], ordered_tip_names)
            print >> buf, description, ':', labels_and_value[1]
        print >> buf
    return buf.getvalue().strip()
Example #12
0
 def split_function(self, D):
     """
     Split the indices by the signs of the eigenvector of -HDH/2 with the largest eigenvalue.
     The eigenvalues are saved as self.eigenvalues.
     If one side of the split has a single index then self.eigenvalues is reset to None
     and the neighbor joining split is returned instead.
     @param D: the distance matrix
     @return: a set of two index sets defining a split of the indices
     """
     try:
         # decompose the matrix of interest
         HSH = Euclid.edm_to_dccov(D)
         eigenvalues, V = np.linalg.eigh(HSH)
         # save the eigenvalues for reporting
         self.eigenvalues = eigenvalues
         # pair each eigenvalue with its eigenvector and take the greatest pair
         pairs = zip(eigenvalues, V.T.tolist())
         dominant_value, dominant_vector = max(pairs)
         # partition the indices by the sign of the eigenvector entry
         positive = frozenset(i for i, x in enumerate(dominant_vector) if x > 0)
         nonpositive = frozenset(i for i in range(len(D)) if i not in positive)
         sides = (positive, nonpositive)
         # neither side may be empty
         assert all(len(side) > 0 for side in sides)
         # a side with a single index makes the split degenerate
         for side in sides:
             if len(side) == 1:
                 raise BuildTreeTopology.DegenerateSplitException(list(side)[0])
         return frozenset(sides)
     except BuildTreeTopology.DegenerateSplitException:
         # fall back to the neighbor joining split
         self.eigenvalues = None
         return BuildTreeTopology.split_nj(D)
Example #13
0
def get_response_content(fs):
    """
    Build splits from a distance matrix and show each split using taxon labels.
    @param fs: provides matrix, labels, and the options option_a through option_d
    @return: a multi-line string with one split per line
    """
    D = fs.matrix
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    # the labels must be conformant to the rows of the matrix
    if len(ordered_labels) != len(D):
        raise HandlingError('the number of taxon labels should match the number of rows in the distance matrix')
    # the first selected option decides the pair of split and update functions
    if fs.option_a:
        methods = (BuildTreeTopology.split_nj, BuildTreeTopology.update_nj)
    elif fs.option_b:
        methods = (BuildTreeTopology.split_nj, BuildTreeTopology.update_using_laplacian)
    elif fs.option_c:
        methods = (BuildTreeTopology.split_using_eigenvector_with_nj_fallback, BuildTreeTopology.update_using_laplacian)
    elif fs.option_d:
        methods = (BuildTreeTopology.split_using_eigenvector, BuildTreeTopology.update_using_laplacian)
    split_function, update_function = methods
    # translate each index split to a split of taxon labels
    response = StringIO()
    for index_split in BuildTreeTopology.get_splits(D, split_function, update_function):
        taxon_split = []
        for group in index_split:
            taxon_split.append([ordered_labels[i] for i in group])
        print >> response, split_to_string(taxon_split)
    return response.getvalue()
Example #14
0
def get_full_tree_message(tree, m_to_string):
    """
    Show the reciprocal adjacency matrix and the Fiedler vector of the full tree.
    All nodes of the tree are used, in alphabetical order of their names.
    @param tree: each node in this tree must have a name
    @param m_to_string: a function that converts a matrix to a string
    @return: a message about the split of the tips of the tree induced by the fiedler vector
    """
    buf = StringIO()
    # order the node names alphabetically and look up the matching node ids
    nodes = list(tree.preorder())
    ordered_names = sorted(node.get_name() for node in nodes)
    name_to_id = dict((node.get_name(), id(node)) for node in nodes)
    ordered_ids = [name_to_id[name] for name in ordered_names]
    # show the reciprocal of the weighted adjacency matrix
    A = np.array(tree.get_affinity_matrix(ordered_ids))
    print >> buf, 'the weighted reciprocal adjacency matrix of the full tree:'
    print >> buf, m_to_string(get_reciprocal_matrix(A))
    print >> buf
    # show the fiedler vector of the laplacian one node per line
    fiedler = BuildTreeTopology.laplacian_to_fiedler(Euclid.adjacency_to_laplacian(A))
    print >> buf, 'the Fiedler split of the full tree:'
    for name_and_value in zip(ordered_names, fiedler):
        print >> buf, name_and_value[0], ':', name_and_value[1]
    return buf.getvalue().strip()
Example #15
0
def get_response_content(fs):
    """
    Check cuts of single vertex Schur complements of random connected graphs.
    For each vertex of each connected graph, the vertex is removed by Schur complementation
    and the reduced graph is cut; the cut is counted as valid if the removed vertex can be
    assigned to a side so that both induced subgraphs are connected.
    @param fs: provides ngraphs, nvertices, pedge, fiedler_cut, and random_cut
    @return: a multi-line string
    """
    out = StringIO()
    n_unconnected = 0
    n_invalid = 0
    n_valid = 0
    for _ in range(fs.ngraphs):
        G = erdos_renyi(fs.nvertices, fs.pedge)
        # graphs that are not connected are only counted
        if not is_connected(G):
            n_unconnected += 1
            continue
        # weight the edges and get the laplacian
        add_exponential_weights(G)
        L = Euclid.adjacency_to_laplacian(G)
        for v in range(fs.nvertices):
            # map indices of the reduced graph to indices of the full graph
            kept_indices = [i for i in range(fs.nvertices) if i != v]
            small_index_to_big_index = dict(enumerate(kept_indices))
            # remove the vertex by schur complementation
            L_reduced = get_single_element_schur_complement(L, v)
            assert len(L_reduced) == len(L) - 1
            # get the loadings of the vertices of the reduced graph
            if fs.fiedler_cut:
                Y_reduced = BuildTreeTopology.laplacian_to_fiedler(L_reduced)
            elif fs.random_cut:
                Y_reduced = get_random_vector(L_reduced)
            assert len(Y_reduced) == len(L_reduced)
            # try the removed vertex on each side of the cut
            any_valid = False
            for removed_loading in (-1.0, 1.0):
                Y_full = [0]*len(G)
                for i_small, loading in enumerate(Y_reduced):
                    Y_full[small_index_to_big_index[i_small]] = loading
                Y_full[v] = removed_loading
                assert len(Y_full) == len(G)
                # the split is valid if both induced subgraphs are connected
                subgraph_a, subgraph_b = list(gen_subgraphs(G, Y_full))
                if is_connected(subgraph_a) and is_connected(subgraph_b):
                    any_valid = True
            # report the graph when neither assignment was valid
            if not any_valid:
                print >> out, 'Found a matrix that was split incompatibly by a cut of its schur complement!'
                print >> out, 'matrix:'
                print >> out, MatrixUtil.m_to_string(G)
                print >> out, 'index that was removed:', v
                n_invalid += 1
            else:
                n_valid += 1
    # show the totals
    print >> out, 'this many random graphs were connected:', fs.ngraphs - n_unconnected
    print >> out, 'this many random graphs were not connected:', n_unconnected
    print >> out, 'this many splits were valid:', n_valid
    print >> out, 'this many splits were invalid:', n_invalid
    return out.getvalue()
Example #16
0
 def do_search(self, nseconds, sampling_function):
     """
     Repeatedly resample the branch lengths of self.tree and examine the resulting splits.
     Each rejected sample increments one of the counters on this object and is skipped.
     NOTE(review): this block looks truncated; the visible code never returns True,
     so the acceptance test that follows the eigensplit is not shown here.
     @param nseconds: allowed search time or None
     @param sampling_function: a function that samples a branch length
     @return: True if a tree was found that met the criteria
     @raise RuntimeError: if self.is_initialized() is false
     """
     if not self.is_initialized():
         raise RuntimeError("the search was not sufficiently initialized")
     true_splits = self.tree.get_nontrivial_splits()
     start_time = time.time()
     while True:
         # give up when the time limit is exceeded; a false nseconds disables the limit
         elapsed_time = time.time() - start_time
         if nseconds and elapsed_time > nseconds:
             return False
         # assign new sampled branch lengths
         for branch in self.tree.get_branches():
             branch.length = sampling_function()
         # get the distance matrix so we can use a library function to get the split
         D = np.array(self.tree.get_distance_matrix())
         ntips = len(D)
         # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
         if self.force_difference or self.informative_full_split:
             A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
             L_aug = Euclid.adjacency_to_laplacian(A_aug)
             v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
             left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
             # keep only the indices below ntips
             # presumably the tips come first in self.id_to_index -- TODO confirm
             left = [x for x in left_aug if x in range(ntips)]
             right = [x for x in right_aug if x in range(ntips)]
             leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
             # skip the sample if this split equals the desired primary split
             if self.force_difference:
                 if leaf_eigensplit_aug == self.desired_primary_split:
                     self.aug_split_collision_count += 1
                     continue
             # skip the sample if a side of this split has fewer than two leaves
             if self.informative_full_split:
                 if min(len(s) for s in leaf_eigensplit_aug) < 2:
                     self.aug_split_degenerate_count += 1
                     continue
         # get the eigensplit
         try:
             eigensplit = BuildTreeTopology.split_using_eigenvector(D)
         except BuildTreeTopology.DegenerateSplitException, e:
             self.degenerate_primary_split_count += 1
             continue
         except BuildTreeTopology.InvalidSpectralSplitException, e:
             self.error_primary_split_count += 1
             continue
Example #17
0
def get_response_content(fs):
    """
    Show the two distance matrices obtained by merging each side of a selected split.
    @param fs: provides matrix, labels, and selection
    @return: a multi-line string
    """
    D = np.array(fs.matrix)
    n = len(D)
    # read the labels of all taxa and the labels of the selected taxa
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    selected_labels = Util.get_stripped_lines(StringIO(fs.selection))
    # the labels must be conformant to the rows of the matrix
    if len(ordered_labels) != n:
        raise HandlingError("the number of taxon labels should match the number of rows in the distance matrix")
    # the selected indices are in A and the remaining indices are in B
    index_set_A = set()
    for i, label in enumerate(ordered_labels):
        if label in selected_labels:
            index_set_A.add(i)
    index_set_B = set(range(n)) - index_set_A
    # the first returned value is not shown
    unused_R, alpha, beta, gamma = get_R_alpha_beta_gamma(D, index_set_B)
    # each new matrix merges the indices of the other side
    D_A = BuildTreeTopology.update_generalized_nj(D, index_set_B)
    D_B = BuildTreeTopology.update_generalized_nj(D, index_set_A)
    # get the names that are conformant to the new matrices
    singleton_names = [set([name]) for name in ordered_labels]
    D_A_names = [set_to_string(x) for x in SchurAlgebra.vmerge(singleton_names, index_set_B)]
    D_B_names = [set_to_string(x) for x in SchurAlgebra.vmerge(singleton_names, index_set_A)]
    # show the internal values
    out = StringIO()
    print >> out, "alpha:", alpha
    print >> out, "beta:", beta
    print >> out, "gamma:", gamma
    # show each matrix with its labels, preceded by an empty line
    sections = (
            ("new distance matrix corresponding to the selected names:", D_A, D_A_names),
            ("new distance matrix corresponding to the non-selected names:", D_B, D_B_names))
    for title, matrix, names in sections:
        print >> out
        print >> out, title
        print >> out, MatrixUtil.m_to_string(matrix)
        print >> out
        print >> out, "ordered labels corresponding to this matrix:"
        for name in names:
            print >> out, name
    return out.getvalue()
Example #18
0
 def evaluate(self, true_splits, D_estimated):
     """
     Check whether the splits built from the estimate equal the true splits.
     The splits are built using self.split_function and self.update_function.
     @param true_splits: a set of full splits that defines the true tree topology
     @param D_estimated: an estimated distance matrix conformant to the split labels
     @return: 1 if success, 0 if failure
     """
     splitter = self.split_function
     updater = self.update_function
     estimated_splits = BuildTreeTopology.get_splits(D_estimated, splitter, updater)
     # success means exact equality of the two collections of splits
     return 1 if estimated_splits == true_splits else 0
Example #19
0
def process(ntaxa, length, nseconds, builders, branch_length_sampler):
    """
    Sample trees and distance matrices and tabulate the results of two builders.
    The pair of values returned by the builders indexes a two by two table,
    with one table for Atteson matrices and one table for the other matrices.
    NOTE(review): this block looks truncated; no code that builds the
    returned summary is visible here.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run; a false value disables the limit
    @param builders: exactly two tree builder objects, each with an evaluate method
    @param branch_length_sampler: called with no arguments to get the length of each branch
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # define the number of attempts that fall into each of the four categories
    non_atteson_results = [[0, 0], [0, 0]]
    atteson_results = [[0, 0], [0, 0]]
    #pachter_results = [[0, 0], [0, 0]]
    # evaluate the quality of reconstructions from a bunch of different samples
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # the unpacking fails unless there are exactly two builders
                a, b = [
                    builder.evaluate(true_splits, D) for builder in builders
                ]
                # each result is used as an index into a two by two table
                if BuildTreeTopology.is_atteson(tree, D):
                    atteson_results[a][b] += 1
                #elif BuildTreeTopology.is_quartet_additive(tree, D) and BuildTreeTopology.is_quartet_consistent(tree, D):
                #pachter_results[a][b] += 1
                else:
                    non_atteson_results[a][b] += 1
            # each kind of failed sample is counted and sampling continues
            except InfiniteDistanceError as e:
                n_infinite_errors += 1
            except ZeroDistanceError as e:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException, e:
                n_failed_spectral_splits += 1
    # an interrupt from the keyboard ends the sampling without an error
    except KeyboardInterrupt, e:
        pass
Example #20
0
def process(ntaxa, length, nseconds, builders, branch_length_sampler):
    """
    Sample trees and distance matrices and tabulate the results of two builders.
    The pair of values returned by the builders indexes a two by two table,
    with one table for Atteson matrices and one table for the other matrices.
    NOTE(review): this block looks truncated; no code that builds the
    returned summary is visible here.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run; a false value disables the limit
    @param builders: exactly two tree builder objects, each with an evaluate method
    @param branch_length_sampler: called with no arguments to get the length of each branch
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # define the number of attempts that fall into each of the four categories
    non_atteson_results = [[0, 0], [0, 0]]
    atteson_results = [[0, 0], [0, 0]]
    #pachter_results = [[0, 0], [0, 0]]
    # evaluate the quality of reconstructions from a bunch of different samples
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # the unpacking fails unless there are exactly two builders
                a, b = [builder.evaluate(true_splits, D) for builder in builders]
                # each result is used as an index into a two by two table
                if BuildTreeTopology.is_atteson(tree, D):
                    atteson_results[a][b] += 1
                #elif BuildTreeTopology.is_quartet_additive(tree, D) and BuildTreeTopology.is_quartet_consistent(tree, D):
                    #pachter_results[a][b] += 1
                else:
                    non_atteson_results[a][b] += 1
            # each kind of failed sample is counted and sampling continues
            except InfiniteDistanceError as e:
                n_infinite_errors += 1
            except ZeroDistanceError as e:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException, e:
                n_failed_spectral_splits += 1
    # an interrupt from the keyboard ends the sampling without an error
    except KeyboardInterrupt, e:
        pass
Example #21
0
def process(ntaxa, length, nseconds, branch_length_sampler, use_nj,
            use_modified_nj, use_all_spectral, use_one_spectral):
    """
    Repeatedly sample a tree and a distance matrix and evaluate the builder.
    Sampling continues until the time limit is exceeded or ctrl-c is pressed.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @param use_nj: forwarded unchanged to the builder evaluation
    @param use_modified_nj: forwarded unchanged to the builder evaluation
    @param use_all_spectral: forwarded unchanged to the builder evaluation
    @param use_one_spectral: forwarded unchanged to the builder evaluation
    @return: a multi-line string that summarizes the results
    """
    # NOTE(review): the code that turns the counters below into the summary
    # string is not part of this excerpt.
    start_time = time.time()
    # initialize the builder object
    builder = Builder()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # do a bunch of reconstructions of sampled distance matrices
    try:
        while True:
            # a falsy time limit (None or 0) disables the timeout
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # determine whether or not the distance matrix is Atteson with respect to the tree
                atteson = BuildTreeTopology.is_atteson(tree, D)
                # record information about the splits
                builder.evaluate(true_splits, D, atteson, use_nj,
                                 use_modified_nj, use_all_spectral,
                                 use_one_spectral)
            # a rejected sample is only counted; sampling carries on
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # ctrl-c just ends the sampling loop early
        pass
Example #22
0
def process(ntaxa, length, nseconds, branch_length_sampler, use_nj, use_modified_nj, use_all_spectral, use_one_spectral):
    """
    Sample trees and distance matrices in a loop and evaluate each with a Builder.
    The loop ends when the time limit is exceeded or when ctrl-c is pressed.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @param use_nj: passed through to builder.evaluate
    @param use_modified_nj: passed through to builder.evaluate
    @param use_all_spectral: passed through to builder.evaluate
    @param use_one_spectral: passed through to builder.evaluate
    @return: a multi-line string that summarizes the results
    """
    # NOTE(review): building the summary string from the counters is not
    # shown in this excerpt.
    start_time = time.time()
    # initialize the builder object
    builder = Builder()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # do a bunch of reconstructions of sampled distance matrices
    try:
        while True:
            # None or 0 seconds means there is no time limit
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # determine whether or not the distance matrix is Atteson with respect to the tree
                atteson = BuildTreeTopology.is_atteson(tree, D)
                # record information about the splits
                builder.evaluate(true_splits, D, atteson, use_nj, use_modified_nj, use_all_spectral, use_one_spectral)
            # each kind of rejected sample is tallied and the loop continues
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # stop sampling quietly when the user interrupts
        pass
Example #23
0
def get_subtree_messages(D, eigensplit, ordered_tip_names):
    """
    Report, for each side of a split, the vector that
    BuildTreeTopology.edm_to_fiedler gives for that side's distances.
    @param D: the matrix of pairwise distances among tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and D
    @return: a multi-line string
    """
    out = StringIO()
    n = len(D)
    ordered_label_sets = [set([i]) for i in range(n)]
    all_labels = set(range(n))
    for i, child in enumerate(eigensplit):
        complement = all_labels - child
        # restrict the distances and the label sets to this side of the split
        D_child = MatrixUtil.get_principal_submatrix(D, sorted(child))
        child_label_sets = SchurAlgebra.vdelete(ordered_label_sets, complement)
        v_child = BuildTreeTopology.edm_to_fiedler(D_child)
        # subtrees are numbered from one in the report
        out.write('the Fiedler split of Schur complements of subtree %s\n' % (i + 1,))
        for label_set, value in zip(child_label_sets, v_child):
            s = label_set_to_string(label_set, ordered_tip_names)
            out.write('%s : %s\n' % (s, value))
        # a blank line separates the subtree reports
        out.write('\n')
    return out.getvalue().strip()
Example #24
0
def get_response_content(fs):
    """
    Draw the graph given in the form data with nodes colored by a valuation.
    @param fs: the form data; read for the graph, image size, labeling and coloring options
    @return: whatever get_image_string returns for the colored graph
    """
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # the drawable area is what remains after removing the border on both sides
    drawable_width = fs.total_width - 2 * fs.border
    drawable_height = fs.total_height - 2 * fs.border
    if drawable_width < 1 or drawable_height < 1:
        raise HandlingError(
            'the image dimensions do not allow for enough drawable area')
    # labels start at 0 or at 1, or are not shown at all
    if fs.label_from_0:
        show_labels = 0
    elif fs.label_from_1:
        show_labels = 1
    else:
        show_labels = None
    # choose the per-node values that drive the colors
    if fs.color_x:
        valuations = [point[0] for point in points]
    elif fs.color_fiedler_weighted:
        # weight each edge by the reciprocal of its Euclidean length
        coords = [np.array(point) for point in points]
        lengths = [np.linalg.norm(coords[j] - coords[i]) for i, j in edges]
        edge_weights = [1.0 / length for length in lengths]
        laplacian = edges_to_laplacian(edges, edge_weights)
        valuations = BuildTreeTopology.laplacian_to_fiedler(laplacian)
    elif fs.color_fiedler_unweighted:
        edge_weights = [1.0 for _ in edges]
        laplacian = edges_to_laplacian(edges, edge_weights)
        valuations = BuildTreeTopology.laplacian_to_fiedler(laplacian)
    else:
        valuations = [0 for _ in points]
    # optionally negate every valuation
    if fs.flip:
        valuations = [-value for value in valuations]
    else:
        valuations = list(valuations)
    colors = valuations_to_colors(valuations)
    # draw the image
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    info = ImageInfo(
        fs.total_width, fs.total_height, fs.black, show_labels, fs.border, ext)
    try:
        image_string = get_image_string(points, edges, colors, info)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
    return image_string
Example #25
0
def get_response_content(fs):
    """
    Render the submitted graph as an image whose node colors follow a valuation.
    @param fs: the form data; supplies the graph text plus size, label and color options
    @return: the value returned by get_image_string
    """
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # check that the border leaves room to draw
    border_total = 2*fs.border
    inner_width = fs.total_width - border_total
    inner_height = fs.total_height - border_total
    if inner_width < 1 or inner_height < 1:
        raise HandlingError(
                'the image dimensions do not allow for enough drawable area')
    # decide whether labels are shown and where their numbering starts
    show_labels = None
    if fs.label_from_0:
        show_labels = 0
    elif fs.label_from_1:
        show_labels = 1
    # define the valuations which will define the node colors
    if fs.color_x:
        valuations = [pt[0] for pt in points]
    elif fs.color_fiedler_weighted or fs.color_fiedler_unweighted:
        if fs.color_fiedler_weighted:
            # edge weight is the inverse of the distance between endpoints
            vectors = [np.array(pt) for pt in points]
            gaps = [np.linalg.norm(vectors[b] - vectors[a]) for a, b in edges]
            weights = [1.0 / gap for gap in gaps]
        else:
            weights = [1.0 for _ in edges]
        valuations = BuildTreeTopology.laplacian_to_fiedler(
                edges_to_laplacian(edges, weights))
    else:
        valuations = [0 for _ in points]
    # flipping negates the sign of every valuation
    if fs.flip:
        valuations = [-val for val in valuations]
    else:
        valuations = [val for val in valuations]
    colors = valuations_to_colors(valuations)
    # draw the image
    info = ImageInfo(
            fs.total_width, fs.total_height, fs.black, show_labels,
            fs.border, Form.g_imageformat_to_ext[fs.imageformat])
    try:
        return get_image_string(points, edges, colors, info)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
Example #26
0
def get_subtree_messages(D, eigensplit, ordered_tip_names):
    """
    For each side of the split, list the values that
    BuildTreeTopology.edm_to_fiedler assigns to that side's label sets.
    @param D: the matrix of pairwise distances among tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and D
    @return: a multi-line string
    """
    out = StringIO()
    n = len(D)
    ordered_label_sets = [set([i]) for i in range(n)]
    all_labels = set(range(n))
    for i, child in enumerate(eigensplit):
        complement = all_labels - child
        # keep only the rows, columns and label sets of this side
        D_child = MatrixUtil.get_principal_submatrix(D, sorted(child))
        child_label_sets = SchurAlgebra.vdelete(ordered_label_sets, complement)
        v_child = BuildTreeTopology.edm_to_fiedler(D_child)
        # the report numbers the subtrees starting at one
        out.write('the Fiedler split of Schur complements of subtree %s\n' % (i+1,))
        for label_set, value in zip(child_label_sets, v_child):
            s = label_set_to_string(label_set, ordered_tip_names)
            out.write('%s : %s\n' % (s, value))
        # blank line between subtree reports
        out.write('\n')
    return out.getvalue().strip()
Example #27
0
 # NOTE(review): this is an excerpt from the middle of a function; the
 # enclosing def and the definitions of tree, true_splits, nj_like and
 # attribute_array are not visible here.
 # Sample a distance matrix; a rejected sample is tallied under the
 # matching 'nsamples.rejected.*' attribute and the function returns.
 try:
     D = sample_distance_matrix(tree, sequence_length)
 except InfiniteDistanceError as e:
     return incr_attribute(attribute_array, 'nsamples.rejected.inf')
 except ZeroDistanceError as e:
     return incr_attribute(attribute_array, 'nsamples.rejected.zero')
 except BuildTreeTopology.InvalidSpectralSplitException, e:
     return incr_attribute(attribute_array, 'nsamples.rejected.fail')
 # see if the top down reconstruction was successful
 try:
     splitter = BuildTreeTopology.split_using_eigenvector_with_nj_fallback
     # the updater is the only thing that depends on nj_like
     if nj_like:
         updater = BuildTreeTopology.update_generalized_nj
     else:
         updater = BuildTreeTopology.update_using_laplacian
     all_spectral_splits = BuildTreeTopology.get_splits(
         D, splitter, updater)
     # success means the reconstructed splits equal the true splits exactly
     top_down_success = (all_spectral_splits == true_splits)
 except BuildTreeTopology.InvalidSpectralSplitException, e:
     return incr_attribute(attribute_array, 'nsamples.rejected.fail')
 # at this point the sample is accepted
 incr_attribute(attribute_array, 'nsamples.accepted')
 # determine whether or not the distance matrix is Atteson with respect to the tree
 if BuildTreeTopology.is_atteson(tree, D):
     incr_attribute(attribute_array, 'nsamples.accepted.atteson')
 # see if the bottom up reconstruction was successful
 nj_splits = BuildTreeTopology.get_splits(D, BuildTreeTopology.split_nj,
                                          BuildTreeTopology.update_nj)
 nj_success = (nj_splits == true_splits)
 # note the joint results of the two reconstruction methods
 if top_down_success and nj_success:
     incr_attribute(attribute_array, 'nsuccesses.both')
Example #28
0
def get_standard_response(fs):
    """
    Build a plain-text report about spectral splits of the pruned full tree.
    The report reads the module-level names full_tree, all_names,
    archaea_names, bacteria_names and eukaryota_names.
    @param fs: a FieldStorage object containing the cgi arguments (not read by this function)
    @return: a (response_headers, response_text) pair
    """
    # begin the response
    out = StringIO()
    # show a summary of the original data
    out.write('data summary before removing branches with zero length:\n')
    out.write('%s archaea names in the original tree\n' % len(archaea_names))
    out.write('%s bacteria names in the original tree\n' % len(bacteria_names))
    out.write('%s eukaryota names in the original tree\n' % len(eukaryota_names))
    out.write('%s total names in the original tree\n' % len(all_names))
    out.write('\n')
    # get the pruned full tree
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [node.get_name() for node in pruned_full_tree.gen_tips()]
    # show a summary of the processed data
    out.write('data summary after removing branches with zero length:\n')
    out.write('%s total names in the processed non-degenerate tree\n' % len(ordered_names))
    out.write('\n')
    # draw the pruned full tree
    out.write('this is the tree that represents all clades but for which redundant nodes have been pruned:\n')
    formatted_tree_string = NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    out.write('%s\n' % (formatted_tree_string,))
    out.write('\n')
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    out.write('%s\n' % (get_eigendecomposition_report(D),))
    # report the clade intersections of sides of the split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    clade_name_pairs = ((archaea_names, 'archaea'), (bacteria_names, 'bacteria'), (eukaryota_names, 'eukaryota'))
    out.write('clade intersections with each side of the split:\n')
    for side, side_name in zip(side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                out.write('the %s side intersects %s\n' % (side_name, clade_name))
    out.write('\n')
    # prepare to do the secondary splits
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    # get a secondary split on each side by merging the opposite side
    # (the right side is merged first, then the left side)
    for index_complement in (right_indices, left_indices):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
        # map the indices of the reduced problem back to the original labels
        left_sublabels = set().union(*(next_label_sets[i] for i in left_subindices))
        right_sublabels = set().union(*(next_label_sets[i] for i in right_subindices))
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        out.write('clade intersections with a subsplit:\n')
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                out.write('the left side intersects %s\n' % (clade_name,))
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                out.write('the right side intersects %s\n' % (clade_name,))
        out.write('\n')
    # show debug info
    out.write('archaea names:\n')
    out.write('\n'.join(sorted(archaea_names)) + '\n')
    out.write('\n')
    out.write('bacteria names:\n')
    out.write('\n'.join(sorted(bacteria_names)) + '\n')
    out.write('\n')
    out.write('eukaryota names:\n')
    out.write('\n'.join(sorted(eukaryota_names)) + '\n')
    out.write('\n')
    # return the response
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
Example #29
0
def get_standard_response(fs):
    """
    Report spectral splits of the pruned full tree as plain text.
    The data comes from the module-level names full_tree, all_names,
    archaea_names, bacteria_names and eukaryota_names.
    @param fs: a FieldStorage object containing the cgi arguments (not read by this function)
    @return: a (response_headers, response_text) pair
    """
    # begin the response
    out = StringIO()
    # show a summary of the original data
    out.write('data summary before removing branches with zero length:\n')
    name_counts = (
        (archaea_names, 'archaea names in the original tree'),
        (bacteria_names, 'bacteria names in the original tree'),
        (eukaryota_names, 'eukaryota names in the original tree'),
        (all_names, 'total names in the original tree'),
    )
    for names, description in name_counts:
        out.write('%s %s\n' % (len(names), description))
    out.write('\n')
    # get the pruned full tree
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [node.get_name() for node in pruned_full_tree.gen_tips()]
    # show a summary of the processed data
    out.write('data summary after removing branches with zero length:\n')
    out.write('%s total names in the processed non-degenerate tree\n'
              % len(ordered_names))
    out.write('\n')
    # draw the pruned full tree
    out.write('this is the tree that represents all clades but for which redundant nodes have been pruned:\n')
    formatted_tree_string = NewickIO.get_narrow_newick_string(
        pruned_full_tree, 120)
    out.write('%s\n' % (formatted_tree_string,))
    out.write('\n')
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    out.write('%s\n' % (get_eigendecomposition_report(D),))
    # report the clade intersections of sides of the split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    clade_name_pairs = (
        (archaea_names, 'archaea'),
        (bacteria_names, 'bacteria'),
        (eukaryota_names, 'eukaryota'),
    )
    out.write('clade intersections with each side of the split:\n')
    for side, side_name in zip(side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                out.write('the %s side intersects %s\n'
                          % (side_name, clade_name))
    out.write('\n')
    # prepare to do the secondary splits
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    # get a secondary split of each side by merging the other side;
    # the right side is merged first and the left side second
    for index_complement in (right_indices, left_indices):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                              index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        subsplit = BuildTreeTopology.eigenvector_to_split(v)
        left_subindices, right_subindices = subsplit
        # translate reduced indices back into sets of original labels
        left_sublabels = set().union(
            *(next_label_sets[i] for i in left_subindices))
        right_sublabels = set().union(
            *(next_label_sets[i] for i in right_subindices))
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        out.write('clade intersections with a subsplit:\n')
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                out.write('the left side intersects %s\n' % (clade_name,))
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                out.write('the right side intersects %s\n' % (clade_name,))
        out.write('\n')
    # show debug info
    out.write('archaea names:\n')
    out.write('\n'.join(sorted(archaea_names)) + '\n')
    out.write('\n')
    out.write('bacteria names:\n')
    out.write('\n'.join(sorted(bacteria_names)) + '\n')
    out.write('\n')
    out.write('eukaryota names:\n')
    out.write('\n'.join(sorted(eukaryota_names)) + '\n')
    out.write('\n')
    # return the response
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
Example #30
0
 def get_verbose_summary(self):
     """
     Describe the primary spectral split of the pruned tree and one
     secondary split per side, naming the domains found on each side.
     @return: a multiline string
     """
     # begin the response
     out = StringIO()
     # show the number of taxa in various domains
     out.write('%s\n' % (self._get_name_summary(),))
     out.write('\n')
     # show the pruned full tree
     formatted_tree_string = NewickIO.get_narrow_newick_string(
         self.pruned_tree, 120)
     out.write('this is the tree that represents all clades but for which redundant nodes have been pruned:\n')
     out.write('%s\n' % (formatted_tree_string,))
     out.write('\n')
     # split the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # report the eigendecomposition
     out.write('%s\n' % (get_eigendecomposition_report(D),))
     out.write('\n')
     # report the clade intersections of sides of the split
     side_names = [
         set(self.pruned_names[i] for i in side) for side in eigensplit
     ]
     out.write('domains represented by each side of the primary split:\n')
     # the tab is followed directly by the list, with no extra space,
     # which is what the print statement emitted after a trailing tab
     out.write('the left side has:\t%s\n'
               % ', '.join(self._get_domains(side_names[0])))
     out.write('the right side has:\t%s\n'
               % ', '.join(self._get_domains(side_names[1])))
     out.write('\n')
     # prepare to do the secondary splits
     left_indices, right_indices = eigensplit
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     # do the secondary splits by merging one side at a time;
     # the right side is merged first and the left side second
     for index_complement in (right_indices, left_indices):
         L_secondary = SchurAlgebra.mmerge(L, index_complement)
         next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                               index_complement)
         v = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
         subsplit = BuildTreeTopology.eigenvector_to_split(v)
         left_subindices, right_subindices = subsplit
         # translate reduced indices back into sets of original labels
         left_sublabels = set().union(
             *(next_label_sets[i] for i in left_subindices))
         right_sublabels = set().union(
             *(next_label_sets[i] for i in right_subindices))
         left_subnames = set(self.pruned_names[i] for i in left_sublabels)
         right_subnames = set(self.pruned_names[i] for i in right_sublabels)
         out.write('domains represented by a subsplit:\n')
         out.write('the left side has:\t%s\n'
                   % ', '.join(self._get_domains(left_subnames)))
         out.write('the right side has:\t%s\n'
                   % ', '.join(self._get_domains(right_subnames)))
         out.write('\n')
     # return the multiline string
     return out.getvalue().strip()
Example #31
0
 def _do_analysis(self, use_generalized_nj):
     """
     Do some splits of the tree.
     The results are stored in self.first_split_object and
     self.second_split_object.
     @param use_generalized_nj: True if we use an old method of outgrouping
     @raise HandlingError: if bacteria are on both sides or on neither side
     of the first split, or eukaryota are on both sides of the second split
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # assert that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains(
         [self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains(
         [self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(
         self.pruned_names, L, v)
     # define the bacteria indices vs the non-bacteria indices for the second split
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
         non_bacteria_indices = right_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
         non_bacteria_indices = left_indices
     else:
         # without this branch the indices would be unbound further down
         raise HandlingError(
             'bacteria were not found on either side of the first split')
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(
             D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                           bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(
         v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     # name each merged label set: a singleton keeps its own name and
     # any larger set is reported as the merged bacteria
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # assert that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota'
                                              in right_subdomains):
         raise HandlingError(
             'eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(
         pruned_names_secondary, L_secondary, v_secondary)
Example #32
0
def process(ntaxa, nseconds, seqlen, nsamples, branch_length_sampler, use_pbar):
    """
    Sample trees and distance matrices and classify the primary spectral
    split of each accepted sample by its balance and compatibility.
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds
    @param seqlen: use this sequence length
    @param nsamples: stop after this many samples per sequence length
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    # NOTE(review): this excerpt ends inside the outer try block; the code
    # that fills accum, handles the timeout and writes the table is not shown.
    # initialize the global rejection counts
    nrejected_zero = 0
    nrejected_inf = 0
    nrejected_fail = 0
    naccepted = 0
    # Initialize the accumulation matrix.
    # The rows specify the size of the smaller side of the initial split.
    # The columns specify the compatibility status of the split.
    # NOTE(review): '/' is integer division only for ints under Python 2;
    # under Python 3 this would give a float array dimension.
    nsmall_sizes = (ntaxa / 2) + 1
    # NOTE(review): the np.int alias has been removed from recent NumPy
    # releases -- confirm the supported NumPy version.
    accum = np.zeros((nsmall_sizes, 2), dtype=np.int)
    # Repeatedly analyze samples.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = Progress.Bar(nsamples) if use_pbar else None
    try:
        for sample_index in range(nsamples):
            # keep trying to get an accepted sample
            while True:
                # check the time; a falsy nseconds disables the limit
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                # first sample a tree and get its set of informative splits
                tree = TreeSampler.sample_agglomerated_tree(ntaxa)
                true_splits = tree.get_nontrivial_splits()
                # sample the branch lengths
                for branch in tree.get_branches():
                    branch.length = branch_length_sampler()
                # Attempt to sample a distance matrix.
                # If the sample was rejected then note the reason and go back to the drawing board.
                try:
                    D = sample_distance_matrix(tree, seqlen)
                except InfiniteDistanceError as e:
                    nrejected_inf += 1
                    continue
                except ZeroDistanceError as e:
                    nrejected_zero += 1
                    continue
                # Attempt to estimate the primary split of the tree from the distance matrix.
                # If there was a technical failure then note it and go back to the drawing board.
                # Otherwise note the compatibility and balance of the split.
                try:
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    small_size = min(len(side) for side in eigensplit)
                    if eigensplit in true_splits:
                        compatibility = 1
                    else:
                        compatibility = 0
                # a degenerate split is recorded as compatible with small size zero
                except BuildTreeTopology.DegenerateSplitException, e:
                    small_size = 0
                    compatibility = 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    nrejected_fail += 1
                    continue
Example #33
0
def process(ntaxa, nseconds, seqlen, nsamples, branch_length_sampler,
            use_pbar):
    """
    Sample trees and distance matrices and classify the primary spectral
    split of each accepted sample by its balance and compatibility.
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds
    @param seqlen: use this sequence length
    @param nsamples: stop after this many samples per sequence length
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    # NOTE(review): this excerpt ends inside the outer try block; the code
    # that fills accum, handles the timeout and writes the table is not shown.
    # initialize the global rejection counts
    nrejected_zero = 0
    nrejected_inf = 0
    nrejected_fail = 0
    naccepted = 0
    # Initialize the accumulation matrix.
    # The rows specify the size of the smaller side of the initial split.
    # The columns specify the compatibility status of the split.
    # NOTE(review): '/' is integer division only for ints under Python 2;
    # under Python 3 this would give a float array dimension.
    nsmall_sizes = (ntaxa / 2) + 1
    # NOTE(review): the np.int alias has been removed from recent NumPy
    # releases -- confirm the supported NumPy version.
    accum = np.zeros((nsmall_sizes, 2), dtype=np.int)
    # Repeatedly analyze samples.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = Progress.Bar(nsamples) if use_pbar else None
    try:
        for sample_index in range(nsamples):
            # keep trying to get an accepted sample
            while True:
                # check the time; a falsy nseconds disables the limit
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                # first sample a tree and get its set of informative splits
                tree = TreeSampler.sample_agglomerated_tree(ntaxa)
                true_splits = tree.get_nontrivial_splits()
                # sample the branch lengths
                for branch in tree.get_branches():
                    branch.length = branch_length_sampler()
                # Attempt to sample a distance matrix.
                # If the sample was rejected then note the reason and go back to the drawing board.
                try:
                    D = sample_distance_matrix(tree, seqlen)
                except InfiniteDistanceError as e:
                    nrejected_inf += 1
                    continue
                except ZeroDistanceError as e:
                    nrejected_zero += 1
                    continue
                # Attempt to estimate the primary split of the tree from the distance matrix.
                # If there was a technical failure then note it and go back to the drawing board.
                # Otherwise note the compatibility and balance of the split.
                try:
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    small_size = min(len(side) for side in eigensplit)
                    if eigensplit in true_splits:
                        compatibility = 1
                    else:
                        compatibility = 0
                # a degenerate split is recorded as compatible with small size zero
                except BuildTreeTopology.DegenerateSplitException, e:
                    small_size = 0
                    compatibility = 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    nrejected_fail += 1
                    continue
Example #34
0
def get_response_content(fs):
    """
    Check cuts of single-vertex Schur complements of random graphs.

    For each connected random graph, every vertex is removed in turn by
    taking the Schur complement of the Laplacian with respect to it.
    The reduced graph is cut by a loading vector, and the removed vertex
    is then tried on each side of that cut.
    The cut counts as valid if at least one of the two assignments
    splits the full graph into two connected subgraphs.
    @param fs: an object with attributes ngraphs, nvertices, pedge,
    fiedler_cut and random_cut
    @return: a multi-line string report
    @raise ValueError: if a connected graph is reached and neither
    fs.fiedler_cut nor fs.random_cut is set
    """
    report_lines = []
    unconnected_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    # try to make some graphs
    for _graph_index in range(fs.ngraphs):
        G = erdos_renyi(fs.nvertices, fs.pedge)
        if not is_connected(G):
            unconnected_count += 1
            continue
        # add interesting edge weights
        add_exponential_weights(G)
        # turn the adjacency matrix into a laplacian matrix
        L = Euclid.adjacency_to_laplacian(G)
        for v in range(fs.nvertices):
            # Position i of the reduced graph corresponds
            # to vertex kept_vertices[i] of the full graph.
            kept_vertices = [i for i in range(fs.nvertices) if i != v]
            # take the schur complement with respect to the given vertex
            L_reduced = get_single_element_schur_complement(L, v)
            assert len(L_reduced) == len(L) - 1
            # get the loadings of the vertices of the reduced graph
            if fs.fiedler_cut:
                Y_reduced = BuildTreeTopology.laplacian_to_fiedler(L_reduced)
            elif fs.random_cut:
                Y_reduced = get_random_vector(L_reduced)
            else:
                # Without this branch the name Y_reduced would be unbound
                # and the failure would be an opaque UnboundLocalError.
                raise ValueError(
                        'neither the fiedler cut nor the random cut '
                        'was selected')
            assert len(Y_reduced) == len(L_reduced)
            # Try the removed vertex on each side of the cut.
            found_valid_split = False
            for augmented_loading in (-1.0, 1.0):
                # get the augmented split vector for this assignment
                Y_full = [0] * len(G)
                for i_reduced, loading in enumerate(Y_reduced):
                    Y_full[kept_vertices[i_reduced]] = loading
                Y_full[v] = augmented_loading
                assert len(Y_full) == len(G)
                # get the two graphs defined by the split
                subgraph_a, subgraph_b = gen_subgraphs(G, Y_full)
                # if both subgraphs are connected then the split is valid
                if is_connected(subgraph_a) and is_connected(subgraph_b):
                    found_valid_split = True
            if found_valid_split:
                valid_split_count += 1
            else:
                # if a valid split was not found then show the matrix
                report_lines.append(
                        'Found a matrix that was split incompatibly '
                        'by a cut of its schur complement!')
                report_lines.append('matrix:')
                report_lines.append('%s' % (MatrixUtil.m_to_string(G),))
                report_lines.append('index that was removed: %s' % (v,))
                invalid_split_count += 1
    # show the number of connected and of unconnected graphs
    connected_count = fs.ngraphs - unconnected_count
    report_lines.append(
            'this many random graphs were connected: %s' % (
                connected_count,))
    report_lines.append(
            'this many random graphs were not connected: %s' % (
                unconnected_count,))
    report_lines.append(
            'this many splits were valid: %s' % (valid_split_count,))
    report_lines.append(
            'this many splits were invalid: %s' % (invalid_split_count,))
    # every report line is newline terminated, including the last one
    return '\n'.join(report_lines) + '\n'
Example #35
0
class TreeSearch:
    """
    This is a virtual base class.

    The search repeatedly assigns sampled branch lengths to a fixed tree
    until the spectral splits of the tree meet the requirements.
    The four boolean requirements are supplied by the user and the
    search options (tree, desired primary split, id to index map)
    are supplied by the subclass.
    All of them must be set before do_search is called.
    """

    def __init__(self):
        # boolean requirements defined by the user
        self.informative_children = None
        self.force_difference = None
        self.informative_full_split = None
        self.invalid_dendrogram = None
        # search options defined by the subclass
        self.tree = None
        self.desired_primary_split = None
        self.id_to_index = None
        # initialize the counts that are tracked for bookkeeping
        self.aug_split_collision_count = 0
        self.aug_split_degenerate_count = 0
        self.error_primary_split_count = 0
        self.invalid_primary_split_count = 0
        self.degenerate_primary_split_count = 0
        self.undesired_primary_split_count = 0
        self.desired_primary_split_count = 0
        self.uninformative_child_count = 0
        self.informative_child_count = 0
        self.valid_dendrogram_count = 0
        self.success_count = 0

    def is_initialized(self):
        """
        @return: True iff no requirement or search option is still None
        """
        required_data = [
                self.informative_children,
                self.force_difference,
                self.informative_full_split,
                self.invalid_dendrogram,
                self.tree,
                self.desired_primary_split,
                self.id_to_index]
        return None not in required_data

    def get_result_text(self):
        """
        Summarize the bookkeeping counts.
        Sections that correspond to a disabled requirement are omitted.
        @return: a multi-line string of text
        """
        def stat_line(count, description):
            return '%s %s' % (count, description)
        # Each section ends with an empty line;
        # the trailing one is stripped from the final text.
        lines = []
        if self.force_difference or self.informative_full_split:
            lines.extend([
                'full graph split stats:',
                stat_line(
                    self.aug_split_collision_count,
                    'full graph splits collided '
                    'with the desired primary split'),
                stat_line(
                    self.aug_split_degenerate_count,
                    'full graph splits were degenerate'),
                ''])
        lines.extend([
            'primary split stats:',
            stat_line(
                self.error_primary_split_count,
                'errors in finding the primary split (should be 0)'),
            stat_line(
                self.invalid_primary_split_count,
                'invalid primary splits (should be 0)'),
            stat_line(
                self.degenerate_primary_split_count,
                'degenerate primary splits'),
            stat_line(
                self.undesired_primary_split_count,
                'primary splits were not the target split'),
            stat_line(
                self.desired_primary_split_count,
                'primary splits were the target split'),
            ''])
        if self.informative_children:
            lines.extend([
                'secondary split stats:',
                stat_line(
                    self.uninformative_child_count,
                    'samples had at least one uninformative child tree'),
                stat_line(
                    self.informative_child_count,
                    'samples had two informative child trees'),
                ''])
        if self.invalid_dendrogram:
            lines.extend([
                'naive dendrogram stats:',
                stat_line(
                    self.valid_dendrogram_count,
                    'naive dendrograms were valid'),
                ''])
        return '\n'.join(lines).strip()

    def do_search(self, nseconds, sampling_function):
        """
        Resample branch lengths until the tree meets every requirement.
        @param nseconds: allowed search time or None
        @param sampling_function: a function that samples a branch length
        @return: True if a tree was found that met the criteria,
        False if the allowed search time ran out first
        @raise RuntimeError: if the search was not sufficiently initialized
        or if the primary split is not a split of the tree
        """
        if not self.is_initialized():
            raise RuntimeError('the search was not sufficiently initialized')
        true_splits = self.tree.get_nontrivial_splits()
        start_time = time.time()
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                return False
            # assign new sampled branch lengths
            for branch in self.tree.get_branches():
                branch.length = sampling_function()
            # get the distance matrix
            # so we can use a library function to get the split
            D = np.array(self.tree.get_distance_matrix())
            ntips = len(D)
            # get the Laplacian matrix of the full tree
            # and the corresponding Fiedler split of the leaves
            if self.force_difference or self.informative_full_split:
                A_aug = np.array(
                        self.tree.get_weighted_adjacency_matrix(
                            self.id_to_index))
                L_aug = Euclid.adjacency_to_laplacian(A_aug)
                v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
                left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(
                        v_aug)
                # Keep only the indices below ntips; the set is built
                # once rather than scanning a range for every element.
                tip_indices = frozenset(range(ntips))
                left = [x for x in left_aug if x in tip_indices]
                right = [x for x in right_aug if x in tip_indices]
                leaf_eigensplit_aug = BuildTreeTopology.make_split(
                        left, right)
                if self.force_difference:
                    if leaf_eigensplit_aug == self.desired_primary_split:
                        self.aug_split_collision_count += 1
                        continue
                if self.informative_full_split:
                    if min(len(s) for s in leaf_eigensplit_aug) < 2:
                        self.aug_split_degenerate_count += 1
                        continue
            # get the eigensplit
            try:
                eigensplit = BuildTreeTopology.split_using_eigenvector(D)
            except BuildTreeTopology.DegenerateSplitException:
                self.degenerate_primary_split_count += 1
                continue
            except BuildTreeTopology.InvalidSpectralSplitException:
                self.error_primary_split_count += 1
                continue
            if eigensplit not in true_splits:
                # The tree lives on the instance; a bare name here
                # would mask this error with a NameError.
                raise RuntimeError(
                        'INVALID SPLIT:' + self.tree.get_newick_string())
            if eigensplit != self.desired_primary_split:
                self.undesired_primary_split_count += 1
                continue
            self.desired_primary_split_count += 1
            # check the splits of the two child trees
            degenerate_subsplit_count = 0
            L = Euclid.edm_to_laplacian(D)
            for side in eigensplit:
                L_child = SchurAlgebra.mmerge(L, side)
                v = BuildTreeTopology.laplacian_to_fiedler(L_child)
                child_eigensplit = BuildTreeTopology.eigenvector_to_split(v)
                if min(len(s) for s in child_eigensplit) < 2:
                    degenerate_subsplit_count += 1
            # The child statistics are tracked
            # even when informative children are not required.
            if degenerate_subsplit_count:
                self.uninformative_child_count += 1
            else:
                self.informative_child_count += 1
            if self.informative_children and degenerate_subsplit_count:
                continue
            # check the dendrogram
            if self.invalid_dendrogram:
                labels = list(range(len(D)))
                hierarchy = Dendrogram.get_hierarchy(
                        D, Dendrogram.spectral_split, labels)
                dendrogram_splits = set(
                        Dendrogram.hierarchy_to_nontrivial_splits(hierarchy))
                # A dendrogram that recovers the true splits
                # is not the invalid dendrogram that was requested.
                if dendrogram_splits == true_splits:
                    self.valid_dendrogram_count += 1
                    continue
            # the tree has met all of the requirements
            return True
Example #36
0
 try:
     D = sample_distance_matrix(tree, sequence_length)
 except InfiniteDistanceError as e:
     return incr_attribute(attribute_array, 'nsamples.rejected.inf')
 except ZeroDistanceError as e:
     return incr_attribute(attribute_array, 'nsamples.rejected.zero')
 except BuildTreeTopology.InvalidSpectralSplitException, e:
     return incr_attribute(attribute_array, 'nsamples.rejected.fail')
 # see if the top down reconstruction was successful
 try:
     splitter = BuildTreeTopology.split_using_eigenvector_with_nj_fallback
     if nj_like:
         updater = BuildTreeTopology.update_generalized_nj
     else:
         updater = BuildTreeTopology.update_using_laplacian
     all_spectral_splits = BuildTreeTopology.get_splits(D, splitter, updater)
     top_down_success = (all_spectral_splits == true_splits)
 except BuildTreeTopology.InvalidSpectralSplitException, e:
     return incr_attribute(attribute_array, 'nsamples.rejected.fail')
 # at this point the sample is accepted
 incr_attribute(attribute_array, 'nsamples.accepted')
 # determine whether or not the distance matrix is Atteson with respect to the tree
 if BuildTreeTopology.is_atteson(tree, D):
     incr_attribute(attribute_array, 'nsamples.accepted.atteson')
 # see if the bottom up reconstruction was successful
 nj_splits = BuildTreeTopology.get_splits(D, BuildTreeTopology.split_nj, BuildTreeTopology.update_nj)
 nj_success = (nj_splits == true_splits)
 # note the joint results of the two reconstruction methods
 if top_down_success and nj_success:
     incr_attribute(attribute_array, 'nsuccesses.both')
 elif (not top_down_success) and (not nj_success):