def evaluate(self, true_splits, D_estimated):
    """
    Remember the true splits and run the split search on an estimated matrix.
    Nothing is returned; self.on_label_split is handed to the search as its
    fourth argument (presumably a per-split callback -- confirm against
    BuildTreeTopology.get_splits).
    @param true_splits: the set of all full splits implied by the true tree
    @param D_estimated: the estimated distance matrix
    """
    # store the reference splits before the search starts
    self.true_splits = true_splits
    # splits come from this object, updates use the laplacian method
    splitter = self.split_function
    updater = BuildTreeTopology.update_using_laplacian
    BuildTreeTopology.get_splits(
            D_estimated, splitter, updater, self.on_label_split)
def evaluate(self, true_splits, D_estimated, atteson, use_nj, use_modified_nj, use_all_spectral, use_one_spectral):
    """
    Score every enabled reconstruction method and record one scatter point.
    A method that is not enabled contributes None as its error.
    @param true_splits: the set of all full splits implied by the true tree
    @param D_estimated: the estimated distance matrix
    @param atteson: True iff the distance matrix is Atteson
    @param use_nj: True to evaluate NJ splits with NJ updates
    @param use_modified_nj: True to evaluate NJ splits with laplacian updates
    @param use_all_spectral: True to evaluate eigenvector splits with NJ fallback
    @param use_one_spectral: True to evaluate the SplitFunctor and UpdateFunctor pair
    """
    def get_error(splitter, updater):
        # rebuild the splits and measure their distance to the true splits
        estimated_splits = BuildTreeTopology.get_splits(D_estimated, splitter, updater)
        return Xtree.splits_to_rf_distance(estimated_splits, true_splits)

    # each error stays None unless its method was requested
    nj_error = None
    if use_nj:
        nj_error = get_error(
                BuildTreeTopology.split_nj,
                BuildTreeTopology.update_nj)
    modified_nj_error = None
    if use_modified_nj:
        modified_nj_error = get_error(
                BuildTreeTopology.split_nj,
                BuildTreeTopology.update_using_laplacian)
    all_spectral_error = None
    if use_all_spectral:
        all_spectral_error = get_error(
                BuildTreeTopology.split_using_eigenvector_with_nj_fallback,
                BuildTreeTopology.update_using_laplacian)
    one_spectral_error = None
    if use_one_spectral:
        # the functors are sized by the full matrix
        one_spectral_error = get_error(
                SplitFunctor(len(D_estimated)),
                UpdateFunctor(len(D_estimated)))
    # add the data point
    point = ScatterPoint(
            atteson, nj_error, modified_nj_error,
            all_spectral_error, one_spectral_error)
    self.scatter_points.append(point)
def get_eigendecomposition_report(D):
    """
    Tabulate spectral quantities computed two ways from one distance matrix.
    The eigenvalues and the fiedler vector are derived once through the
    laplacian and once through the doubly centered distance matrix, and
    the two versions are printed side by side.
    @param D: a distance matrix
    @return: a multi-line string
    """
    out = StringIO()
    # the laplacian and the two fiedler vectors
    L = Euclid.edm_to_laplacian(D)
    laplacian_fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    distance_fiedler = BuildTreeTopology.edm_to_fiedler(D)
    # the split itself is not reported; the call is kept so that
    # anything it raises still propagates
    BuildTreeTopology.eigenvector_to_split(laplacian_fiedler)
    # the eigenvalues of -HDH/2 and of the laplacian
    HSH = -0.5 * MatrixUtil.double_centered(D)
    w_distance = np.linalg.eigh(HSH)[0]
    w_laplacian = np.linalg.eigh(L)[0]
    # report the two eigenvalue lists that should be the same
    print >> out, 'the laplacian-derived and distance-derived eigenvalues:'
    for a, b in zip(sorted(w_laplacian), sorted(w_distance)):
        print >> out, a, '\t', b
    print >> out
    # report the two fiedler vectors that should be the same
    print >> out, 'the laplacian-derived and distance-derived fiedler vectors:'
    for a, b in zip(laplacian_fiedler, distance_fiedler):
        print >> out, a, '\t', b
    report = out.getvalue()
    return report.strip()
def __call__(self, D):
    """
    Split with the eigenvector method unless the matrix is small.
    Matrices with fewer than self.large_matrix_size rows are split by
    neighbor joining instead.
    @param D: the distance matrix
    @return: a set of two index sets defining a split of the indices
    """
    if len(D) >= self.large_matrix_size:
        return BuildTreeTopology.split_using_eigenvector_with_nj_fallback(D)
    return BuildTreeTopology.split_nj(D)
def __call__(self, D, index_set):
    """
    Update with the laplacian method unless the matrix is small.
    Matrices with fewer than self.large_matrix_size rows are updated by
    the neighbor joining rule instead.
    @param D: the distance matrix
    @param index_set: the subset of indices that will be removed from the updated distance matrix
    @return: an updated distance matrix
    """
    if len(D) >= self.large_matrix_size:
        return BuildTreeTopology.update_using_laplacian(D, index_set)
    return BuildTreeTopology.update_nj(D, index_set)
def do_it_right(D):
    """
    Do neighbor joining correctly.
    A SplitSaver is passed to the split search and its splits
    attribute is returned afterwards.
    @param D: distance matrix
    @return: a sequence of splits
    """
    saver = SplitSaver()
    # both the split rule and the update rule are neighbor joining
    splitter = BuildTreeTopology.split_nj
    updater = BuildTreeTopology.update_nj
    BuildTreeTopology.get_splits(D, splitter, updater, saver)
    return saver.splits
def _do_analysis(self, use_generalized_nj):
    """
    Do some splits of the tree.
    The primary fiedler split of the pruned tree must put all bacteria
    on one side.  The bacteria are then merged into a single vertex
    and the secondary fiedler split must put all eukaryota on one side.
    The results are stored as self.first_split_object and
    self.second_split_object.
    @param use_generalized_nj: True if we use an old method of outgrouping
    @raise HandlingError: if bacteria appear on both sides or on neither
    side of the first split, or if eukaryota appear on both sides of the
    second split
    """
    names = self.pruned_names
    # define the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(names))
    # get the primary split of the criterion matrix
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # assert that the first split cleanly separates the bacteria from the rest
    left_indices, right_indices = eigensplit
    left_domains = self._get_domains([names[x] for x in left_indices])
    right_domains = self._get_domains([names[x] for x in right_indices])
    if ('bacteria' in left_domains) and ('bacteria' in right_domains):
        raise HandlingError('bacteria were not defined by the first split')
    # now we have enough info to define the first supplementary csv file
    self.first_split_object = SupplementarySpreadsheetObject(names, L, v)
    # define the bacteria indices for the second split
    if 'bacteria' in left_domains:
        bacteria_indices = left_indices
    elif 'bacteria' in right_domains:
        bacteria_indices = right_indices
    else:
        # without this branch the code below would fail with a NameError
        raise HandlingError(
                'no bacteria were found on either side of the first split')
    # get the laplacian in which the bacteria are merged into one vertex
    if use_generalized_nj:
        D_secondary = BuildTreeTopology.update_generalized_nj(
                D, bacteria_indices)
        L_secondary = Euclid.edm_to_laplacian(D_secondary)
    else:
        L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
    # the merged labels are needed by both reconstruction modes
    full_label_sets = [set([i]) for i in range(len(names))]
    next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
    # get the secondary split of interest
    v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
    eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(v_secondary)
    left_subindices, right_subindices = eigensplit_secondary
    # a singleton label set keeps its name and the merged set gets a fixed name
    pruned_names_secondary = []
    for label_set in next_label_sets:
        if len(label_set) == 1:
            label, = label_set
            pruned_names_secondary.append(names[label])
        else:
            pruned_names_secondary.append('all-bacteria')
    # assert that the second split cleanly separates the eukaryota from the rest
    left_subdomains = self._get_domains(
            [pruned_names_secondary[x] for x in left_subindices])
    right_subdomains = self._get_domains(
            [pruned_names_secondary[x] for x in right_subindices])
    if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
        raise HandlingError('eukaryota were not defined by the second split')
    # now we have enough info to define the second supplementary csv file
    self.second_split_object = SupplementarySpreadsheetObject(
            pruned_names_secondary, L_secondary, v_secondary)
def get_verbose_summary(self):
    """
    Describe the pruned tree and its primary and secondary fiedler splits.
    The report lists the name summary, the pruned tree, the
    eigendecomposition report, and the domains on each side of the
    primary split and of both secondary splits.
    @return: a multiline string
    """
    names = self.pruned_names

    def get_domain_string(indices, label_sets):
        # pool the labels of the selected vertices and name their domains
        labels = set()
        for i in indices:
            labels.update(label_sets[i])
        return ', '.join(self._get_domains(set(names[i] for i in labels)))

    # begin the response
    out = StringIO()
    # show the number of taxa in various domains
    print >> out, self._get_name_summary()
    print >> out
    # show the pruned full tree
    tree_string = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, tree_string
    print >> out
    # split the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(names))
    L = Euclid.edm_to_laplacian(D)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    print >> out
    # report the clade intersections of sides of the split
    side_names = [set(names[i] for i in side) for side in eigensplit]
    print >> out, 'domains represented by each side of the primary split:'
    print >> out, 'the left side has:\t', ', '.join(self._get_domains(side_names[0]))
    print >> out, 'the right side has:\t', ', '.join(self._get_domains(side_names[1]))
    print >> out
    # prepare to do the secondary splits
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(names))]
    # each secondary split merges one side of the primary split;
    # the right side is merged first and then the left side
    for index_complement in (right_indices, left_indices):
        L_secondary = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
        v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v_secondary)
        left_string = get_domain_string(left_subindices, next_label_sets)
        right_string = get_domain_string(right_subindices, next_label_sets)
        print >> out, 'domains represented by a subsplit:'
        print >> out, 'the left side has:\t', left_string
        print >> out, 'the right side has:\t', right_string
        print >> out
    # return the multiline string
    return out.getvalue().strip()
def process(ntaxa, nseconds, branch_length_sampler): """ @param ntaxa: the number of taxa in the sampled trees @param nseconds: allow this many seconds to run or None to run forever @param branch_length_sampler: a functor that returns a branch length and has a string cast @return: a multi-line string that summarizes the results """ start_time = time.time() # initialize some state that will be tracked over the entire run degenerate_count = 0 invalid_split_count = 0 valid_split_count = 0 spectral_error_count = 0 atteson_error_count = 0 counterexample_D = None counterexample_tree = None # do a bunch of reconstructions from sampled distance matrices try: while True: elapsed_time = time.time() - start_time if nseconds and elapsed_time > nseconds: break # sample the tree topology and get its set of implied full label splits tree = TreeSampler.sample_agglomerated_tree(ntaxa) true_splits = tree.get_nontrivial_splits() # sample the branch lengths for branch in tree.get_branches(): branch.length = branch_length_sampler() # sample the atteson distance matrix D = sample_atteson_distance_matrix(tree) # assert that the atteson condition is true if not BuildTreeTopology.is_atteson(tree, D): atteson_error_count += 1 else: try: # see if the eigensplit is in the set of true splits eigensplit = BuildTreeTopology.split_using_eigenvector(D) if eigensplit in true_splits: valid_split_count += 1 else: invalid_split_count += 1 counterexample_D = D counterexample_tree = tree break except BuildTreeTopology.DegenerateSplitException, e: degenerate_count += 1 except BuildTreeTopology.InvalidSpectralSplitException, e: spectral_error_count += 1
def get_response_content(fs):
    """
    Report fiedler splits of a named tree and of its Schur complements.
    @param fs: an object with the attributes tree, plain_matrix,
    latex_matrix and scaling_factor
    @return: a multi-line string
    @raise HandlingError: if some node of the tree has no name
    """
    out = StringIO()
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # assert that each node is named
    if not all(node.name for node in tree.preorder()):
        raise HandlingError('each node in the tree must have a name')
    # get the function that converts a matrix to a string
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # print the results for the split of the full tree
    print >> out, get_full_tree_message(tree, m_to_string)
    print >> out
    # get the alphabetically ordered names of the tips
    ordered_tip_names = sorted(tip.get_name() for tip in tree.gen_tips())
    # get the corresponding ordered ids
    tip_name_to_id = {}
    for tip in tree.gen_tips():
        tip_name_to_id[tip.get_name()] = id(tip)
    ordered_tip_ids = [tip_name_to_id[name] for name in ordered_tip_names]
    # get the distance matrix defined by the tips of the tree
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    scale = fs.scaling_factor
    print >> out, 'the Schur complement in the Laplacian of the full tree scaled by', scale
    print >> out, m_to_string(scale * L)
    print >> out
    # get the Fiedler cut of the Schur Laplacian
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    print >> out, 'the Fiedler split of the Schur complement of the full tree:'
    for name, value in zip(ordered_tip_names, v):
        print >> out, name, ':', value
    print >> out
    # get the Fiedler cuts of Schur complements of child trees
    child_report = get_child_messages(
            L, eigensplit, ordered_tip_names, m_to_string, fs.scaling_factor)
    print >> out, child_report
    print >> out
    # get the Fiedler cuts of Schur complements of subtrees
    print >> out, get_subtree_messages(D, eigensplit, ordered_tip_names)
    # return the response
    return out.getvalue()
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, scaling_factor):
    """
    Report the merged Laplacian and fiedler vector for each side of a split.
    For each side, the indices of the other side are merged before the
    scaled Laplacian and the fiedler valuations are printed.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    out = StringIO()
    ntips = len(L)
    all_labels = set(range(ntips))
    singleton_label_sets = [set([i]) for i in range(ntips)]
    for child_number, child in enumerate(eigensplit, 1):
        # everything outside the child is merged
        complement = all_labels - child
        L_child = SchurAlgebra.mmerge(L, complement)
        print >> out, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> out, m_to_string(scaling_factor * L_child)
        print >> out
        child_label_sets = SchurAlgebra.vmerge(singleton_label_sets, complement)
        v_child = BuildTreeTopology.laplacian_to_fiedler(L_child)
        print >> out, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for label_set, value in zip(child_label_sets, v_child):
            print >> out, label_set_to_string(label_set, ordered_tip_names), ':', value
        print >> out
    return out.getvalue().strip()
def split_function(self, D):
    """
    Split the distance matrix using signs of an eigenvector of -HDH/2.
    The eigenvector paired with the largest eigenvalue is used, and the
    eigenvalues are saved as self.eigenvalues.
    If one side of the split has a single index then the split is
    degenerate; in that case self.eigenvalues is reset to None and
    the neighbor joining split is returned instead.
    @param D: the distance matrix
    @return: a set of two index sets defining a split of the indices
    """
    try:
        # get the eigendecomposition of the matrix of interest
        eigenvalues, V_T = np.linalg.eigh(Euclid.edm_to_dccov(D))
        # save the eigenvalues for reporting
        self.eigenvalues = eigenvalues
        # get the eigenvector of interest
        _, v = max(zip(eigenvalues, V_T.T.tolist()))
        # partition the indices by the sign of the eigenvector entry
        positive = frozenset(i for i, x in enumerate(v) if x > 0)
        nonpositive = frozenset(range(len(D))) - positive
        sides = (positive, nonpositive)
        for side in sides:
            assert len(side) > 0
        # check for a degenerate split
        for side in sides:
            if len(side) == 1:
                index, = side
                raise BuildTreeTopology.DegenerateSplitException(index)
        return frozenset(sides)
    except BuildTreeTopology.DegenerateSplitException:
        # fall back to neighbor joining
        self.eigenvalues = None
        return BuildTreeTopology.split_nj(D)
def get_response_content(fs):
    """
    List the splits found by the selected split and update methods.
    @param fs: an object with the attributes matrix, labels and the
    flags option_a, option_b, option_c and option_d
    @return: a multi-line string with one split per line
    @raise HandlingError: if the label count does not match the matrix
    """
    # read the matrix and the ordered labels
    D = fs.matrix
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    # validate the input
    if len(ordered_labels) != len(D):
        raise HandlingError('the number of taxon labels should match the number of rows in the distance matrix')
    # get the split and update methods
    if fs.option_a:
        methods = (
                BuildTreeTopology.split_nj,
                BuildTreeTopology.update_nj)
    elif fs.option_b:
        methods = (
                BuildTreeTopology.split_nj,
                BuildTreeTopology.update_using_laplacian)
    elif fs.option_c:
        methods = (
                BuildTreeTopology.split_using_eigenvector_with_nj_fallback,
                BuildTreeTopology.update_using_laplacian)
    elif fs.option_d:
        methods = (
                BuildTreeTopology.split_using_eigenvector,
                BuildTreeTopology.update_using_laplacian)
    split_function, update_function = methods
    # get the splits
    index_splits = BuildTreeTopology.get_splits(D, split_function, update_function)
    # write one line per split, with indices replaced by labels
    out = StringIO()
    for index_split in index_splits:
        taxon_split = []
        for group in index_split:
            taxon_split.append([ordered_labels[i] for i in group])
        print >> out, split_to_string(taxon_split)
    # write the response
    return out.getvalue()
def get_full_tree_message(tree, m_to_string):
    """
    In this function we find the Fiedler split of the full tree.
    All nodes of the tree are used, ordered alphabetically by name.
    @param tree: each node in this tree must have a name
    @param m_to_string: a function that converts a matrix to a string
    @return: a message showing the reciprocal adjacency matrix and the fiedler valuation of each node
    """
    out = StringIO()
    # map each node name to the id of its node
    name_to_id = {}
    for node in tree.preorder():
        name_to_id[node.get_name()] = id(node)
    # get the alphabetically ordered names and the corresponding ids
    ordered_names = sorted(node.get_name() for node in tree.preorder())
    ordered_ids = [name_to_id[name] for name in ordered_names]
    # get the full weighted adjacency matrix
    A = np.array(tree.get_affinity_matrix(ordered_ids))
    print >> out, 'the weighted reciprocal adjacency matrix of the full tree:'
    print >> out, m_to_string(get_reciprocal_matrix(A))
    print >> out
    # get the fiedler vector of the full Laplacian matrix
    v = BuildTreeTopology.laplacian_to_fiedler(Euclid.adjacency_to_laplacian(A))
    print >> out, 'the Fiedler split of the full tree:'
    for name, value in zip(ordered_names, v):
        print >> out, name, ':', value
    return out.getvalue().strip()
def get_response_content(fs):
    """
    Look for graphs that are split incompatibly by a cut of a Schur complement.
    Random graphs are sampled, and for each connected graph and each
    vertex the vertex is removed by Schur complementation, the reduced
    graph is cut, and the removed vertex is tried on both sides of
    the cut.  A split is valid when some placement leaves both
    subgraphs connected.
    @param fs: an object with the attributes ngraphs, nvertices, pedge,
    fiedler_cut and random_cut
    @return: a multi-line string with the invalid cases and the counts
    """
    out = StringIO()
    unconnected_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    # try to make some graphs
    for graph_index in range(fs.ngraphs):
        G = erdos_renyi(fs.nvertices, fs.pedge)
        if not is_connected(G):
            unconnected_count += 1
            continue
        # add interesting edge weights
        add_exponential_weights(G)
        # turn the adjacency matrix into a laplacian matrix
        L = Euclid.adjacency_to_laplacian(G)
        for v in range(fs.nvertices):
            # map indices of the reduced graph to indices of the full graph
            kept_indices = [i for i in range(fs.nvertices) if i != v]
            small_index_to_big_index = dict(enumerate(kept_indices))
            # take the schur complement with respect to the given vertex
            L_reduced = get_single_element_schur_complement(L, v)
            assert len(L_reduced) == len(L) - 1
            # get the loadings of the vertices of the reduced graph
            if fs.fiedler_cut:
                Y_reduced = BuildTreeTopology.laplacian_to_fiedler(L_reduced)
            elif fs.random_cut:
                Y_reduced = get_random_vector(L_reduced)
            assert len(Y_reduced) == len(L_reduced)
            # try the removed vertex on each side of the cut
            found_valid_split = False
            for augmented_loading in (-1.0, 1.0):
                # get the augmented split vector for this assignment
                Y_full = [0]*len(G)
                for i_reduced, loading in enumerate(Y_reduced):
                    Y_full[small_index_to_big_index[i_reduced]] = loading
                Y_full[v] = augmented_loading
                assert len(Y_full) == len(G)
                # get the two graphs defined by the split
                subgraph_a, subgraph_b = list(gen_subgraphs(G, Y_full))
                # if the subgraphs are both connected then the split is valid
                if is_connected(subgraph_a) and is_connected(subgraph_b):
                    found_valid_split = True
            if found_valid_split:
                valid_split_count += 1
                continue
            # a valid split was not found so show the matrix
            print >> out, 'Found a matrix that was split incompatibly by a cut of its schur complement!'
            print >> out, 'matrix:'
            print >> out, MatrixUtil.m_to_string(G)
            print >> out, 'index that was removed:', v
            invalid_split_count += 1
    # show the number of connected and of unconnected graphs
    print >> out, 'this many random graphs were connected:', fs.ngraphs - unconnected_count
    print >> out, 'this many random graphs were not connected:', unconnected_count
    print >> out, 'this many splits were valid:', valid_split_count
    print >> out, 'this many splits were invalid:', invalid_split_count
    # return the result
    return out.getvalue()
def do_search(self, nseconds, sampling_function):
    """
    Repeatedly resample branch lengths and examine the resulting splits.
    Samples are rejected, and a counter attribute is incremented, when
    the leaf split of the full tree collides with the desired primary
    split, when that leaf split is degenerate, or when the eigensplit
    of the distance matrix fails.
    @param nseconds: allowed search time or None
    @param sampling_function: a function that samples a branch length
    @return: True if a tree was found that met the criteria
    NOTE(review): only the False return on timeout is visible in this
    portion of the function; confirm where True is returned.
    @raise RuntimeError: if the search was not sufficiently initialized
    """
    if not self.is_initialized():
        raise RuntimeError("the search was not sufficiently initialized")
    true_splits = self.tree.get_nontrivial_splits()
    start_time = time.time()
    while True:
        elapsed_time = time.time() - start_time
        if nseconds and elapsed_time > nseconds:
            return False
        # assign new sampled branch lengths
        for branch in self.tree.get_branches():
            branch.length = sampling_function()
        # get the distance matrix so we can use a library function to get the split
        D = np.array(self.tree.get_distance_matrix())
        ntips = len(D)
        # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
        if self.force_difference or self.informative_full_split:
            A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
            L_aug = Euclid.adjacency_to_laplacian(A_aug)
            v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
            left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
            # keep only the indices below ntips; a bounds comparison avoids
            # scanning a freshly built range for every index
            left = [x for x in left_aug if 0 <= x < ntips]
            right = [x for x in right_aug if 0 <= x < ntips]
            leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
            if self.force_difference:
                if leaf_eigensplit_aug == self.desired_primary_split:
                    self.aug_split_collision_count += 1
                    continue
            if self.informative_full_split:
                if min(len(s) for s in leaf_eigensplit_aug) < 2:
                    self.aug_split_degenerate_count += 1
                    continue
        # get the eigensplit
        try:
            eigensplit = BuildTreeTopology.split_using_eigenvector(D)
        except BuildTreeTopology.DegenerateSplitException:
            self.degenerate_primary_split_count += 1
            continue
        except BuildTreeTopology.InvalidSpectralSplitException:
            self.error_primary_split_count += 1
            continue
def get_response_content(fs):
    """
    Show the two distance matrices defined by a selection of taxon labels.
    The selected labels define one index set and the remaining labels
    define the other.  Each new matrix comes from the generalized
    neighbor joining update applied to the opposite index set.
    @param fs: an object with the attributes matrix, labels and selection
    @return: a multi-line string showing alpha, beta, gamma and both
    new matrices with their ordered labels
    @raise HandlingError: if the label count does not match the matrix
    """
    # read the matrix
    D = np.array(fs.matrix)
    n = len(D)
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    # the selection is only used for membership tests so build the set once
    selected_labels = set(Util.get_stripped_lines(StringIO(fs.selection)))
    # validate the input
    if n != len(ordered_labels):
        raise HandlingError("the number of taxon labels should match the number of rows in the distance matrix")
    # get the two sets of indices
    index_set_A = set(i for i, label in enumerate(ordered_labels) if label in selected_labels)
    index_set_B = set(range(n)) - index_set_A
    # get internal values related to the split; R is not reported
    R, alpha, beta, gamma = get_R_alpha_beta_gamma(D, index_set_B)
    # get the two new distance matrices
    D_A = BuildTreeTopology.update_generalized_nj(D, index_set_B)
    D_B = BuildTreeTopology.update_generalized_nj(D, index_set_A)
    # get the names associated with the indices of the new distance matrices
    all_names = [set([name]) for name in ordered_labels]
    D_A_names = [set_to_string(x) for x in SchurAlgebra.vmerge(all_names, index_set_B)]
    D_B_names = [set_to_string(x) for x in SchurAlgebra.vmerge(all_names, index_set_A)]
    # show the results
    out = StringIO()
    print >> out, "alpha:", alpha
    print >> out, "beta:", beta
    print >> out, "gamma:", gamma
    print >> out
    print >> out, "new distance matrix corresponding to the selected names:"
    print >> out, MatrixUtil.m_to_string(D_A)
    print >> out
    print >> out, "ordered labels corresponding to this matrix:"
    for name in D_A_names:
        print >> out, name
    print >> out
    print >> out, "new distance matrix corresponding to the non-selected names:"
    print >> out, MatrixUtil.m_to_string(D_B)
    print >> out
    print >> out, "ordered labels corresponding to this matrix:"
    for name in D_B_names:
        print >> out, name
    # return the response
    return out.getvalue()
def evaluate(self, true_splits, D_estimated):
    """
    Check whether the reconstruction recovers exactly the true splits.
    The splits are rebuilt with self.split_function and
    self.update_function.
    @param true_splits: a set of full splits that defines the true tree topology
    @param D_estimated: an estimated distance matrix conformant to the split labels
    @return: 1 if success, 0 if failure
    """
    splitter = self.split_function
    updater = self.update_function
    estimated_splits = BuildTreeTopology.get_splits(D_estimated, splitter, updater)
    return 1 if estimated_splits == true_splits else 0
def process(ntaxa, length, nseconds, builders, branch_length_sampler):
    """
    Tabulate the joint success of two tree builders on sampled matrices.
    Each sample is counted in a 2x2 table indexed by the results of
    the two builders, with separate tables for Atteson and non-Atteson
    distance matrices.  Sampling stops on timeout or keyboard interrupt.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run
    @param builders: tree builder objects; exactly two are unpacked
    @param branch_length_sampler: called with no arguments to get each branch length
    @return: a multi-line string that summarizes the results
    NOTE(review): no return statement is visible in this portion of the
    function; confirm where the summary is built.
    """
    start_time = time.time()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # define the number of attempts that fall into each of the four categories
    non_atteson_results = [[0, 0], [0, 0]]
    atteson_results = [[0, 0], [0, 0]]
    # evaluate the quality of reconstructions from a bunch of different samples
    try:
        while not (nseconds and time.time() - start_time > nseconds):
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                a, b = [builder.evaluate(true_splits, D) for builder in builders]
                # pick the table that matches the kind of distance matrix
                if BuildTreeTopology.is_atteson(tree, D):
                    results = atteson_results
                else:
                    results = non_atteson_results
                results[a][b] += 1
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # an interrupt only stops the sampling
        pass
def process(ntaxa, length, nseconds, builders, branch_length_sampler):
    """
    Tabulate the joint success of two tree builders on sampled matrices.
    Each sample is counted in a 2x2 table indexed by the results of
    the two builders, with separate tables for Atteson and non-Atteson
    distance matrices.  Sampling stops on timeout or keyboard interrupt.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run
    @param builders: tree builder objects; exactly two are unpacked
    @param branch_length_sampler: called with no arguments to get each branch length
    @return: a multi-line string that summarizes the results
    NOTE(review): no return statement is visible in this portion of the
    function; confirm where the summary is built.
    """
    start_time = time.time()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # define the number of attempts that fall into each of the four categories
    non_atteson_results = [[0, 0], [0, 0]]
    atteson_results = [[0, 0], [0, 0]]
    # evaluate the quality of reconstructions from a bunch of different samples
    try:
        while not (nseconds and time.time() - start_time > nseconds):
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                a, b = [builder.evaluate(true_splits, D) for builder in builders]
                # pick the table that matches the kind of distance matrix
                if BuildTreeTopology.is_atteson(tree, D):
                    results = atteson_results
                else:
                    results = non_atteson_results
                results[a][b] += 1
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # an interrupt only stops the sampling
        pass
def process(ntaxa, length, nseconds, branch_length_sampler, use_nj, use_modified_nj, use_all_spectral, use_one_spectral):
    """
    Feed sampled distance matrices to a Builder until time runs out.
    Sampling stops on timeout or keyboard interrupt.  Samples that
    raise a distance error or a spectral split error are only counted.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @param use_nj: forwarded to the builder
    @param use_modified_nj: forwarded to the builder
    @param use_all_spectral: forwarded to the builder
    @param use_one_spectral: forwarded to the builder
    @return: a multi-line string that summarizes the results
    NOTE(review): no return statement is visible in this portion of the
    function; confirm where the summary is built.
    """
    start_time = time.time()
    # initialize the builder object
    builder = Builder()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # do a bunch of reconstructions of sampled distance matrices
    try:
        while not (nseconds and time.time() - start_time > nseconds):
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # determine whether or not the distance matrix is Atteson with respect to the tree
                atteson = BuildTreeTopology.is_atteson(tree, D)
                # record information about the splits
                builder.evaluate(
                        true_splits, D, atteson,
                        use_nj, use_modified_nj,
                        use_all_spectral, use_one_spectral)
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # an interrupt only stops the sampling
        pass
def get_subtree_messages(D, eigensplit, ordered_tip_names):
    """
    Report the fiedler vector of the distance submatrix of each split side.
    For each side, the principal submatrix of D on the sorted indices
    of that side is used and the other indices are deleted from the
    label sets.
    @param D: the matrix of pairwise distances among tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and D
    @return: a multi-line string
    """
    out = StringIO()
    ntips = len(D)
    all_labels = set(range(ntips))
    singleton_label_sets = [set([i]) for i in range(ntips)]
    for subtree_number, child in enumerate(eigensplit, 1):
        complement = all_labels - child
        # restrict the distance matrix and the labels to this side
        D_child = MatrixUtil.get_principal_submatrix(D, sorted(child))
        child_label_sets = SchurAlgebra.vdelete(singleton_label_sets, complement)
        v_child = BuildTreeTopology.edm_to_fiedler(D_child)
        print >> out, 'the Fiedler split of Schur complements of subtree', subtree_number
        for label_set, value in zip(child_label_sets, v_child):
            print >> out, label_set_to_string(label_set, ordered_tip_names), ':', value
        print >> out
    return out.getvalue().strip()
def get_response_content(fs):
    """
    Draw a graph whose nodes are colored by a valuation.
    The valuation is the x coordinate, a fiedler vector of the weighted
    or unweighted graph, or zero for every node.
    @param fs: an object that holds the graph data and the image options
    @return: the image string
    @raise HandlingError: if the drawable area is empty or if the drawing fails
    """
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # the drawable area excludes the border on each side
    width = fs.total_width - 2 * fs.border
    height = fs.total_height - 2 * fs.border
    if min(width, height) < 1:
        raise HandlingError('the image dimensions do not allow for enough drawable area')
    # read the image info
    if fs.label_from_0:
        show_labels = 0
    elif fs.label_from_1:
        show_labels = 1
    else:
        show_labels = None
    # define the valuations which will define the node colors
    if fs.color_x:
        valuations = [p[0] for p in points]
    elif fs.color_fiedler_weighted or fs.color_fiedler_unweighted:
        if fs.color_fiedler_weighted:
            # each weight is the reciprocal of the edge length
            X = [np.array(p) for p in points]
            weights = [1.0 / np.linalg.norm(X[j] - X[i]) for i, j in edges]
        else:
            weights = [1.0 for e in edges]
        L = edges_to_laplacian(edges, weights)
        valuations = BuildTreeTopology.laplacian_to_fiedler(L)
    else:
        valuations = [0 for p in points]
    # optionally negate every valuation
    if fs.flip:
        valuations = [-v for v in valuations]
    else:
        valuations = list(valuations)
    colors = valuations_to_colors(valuations)
    # draw the image
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    info = ImageInfo(
            fs.total_width, fs.total_height,
            fs.black, show_labels, fs.border, ext)
    try:
        return get_image_string(points, edges, colors, info)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def get_response_content(fs):
    """
    Draw a graph whose nodes are colored by a valuation.
    The valuation is the x coordinate, a fiedler vector of the weighted
    or unweighted graph, or zero for every node.
    @param fs: an object that holds the graph data and the image options
    @return: the image string
    @raise HandlingError: if the drawable area is empty or if the drawing fails
    """
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # the drawable area excludes the border on each side
    width = fs.total_width - 2*fs.border
    height = fs.total_height - 2*fs.border
    if min(width, height) < 1:
        raise HandlingError('the image dimensions do not allow for enough drawable area')
    # read the image info
    if fs.label_from_0:
        show_labels = 0
    elif fs.label_from_1:
        show_labels = 1
    else:
        show_labels = None
    # define the valuations which will define the node colors
    if fs.color_x:
        valuations = [p[0] for p in points]
    elif fs.color_fiedler_weighted or fs.color_fiedler_unweighted:
        if fs.color_fiedler_weighted:
            # each weight is the reciprocal of the edge length
            X = [np.array(p) for p in points]
            weights = [1.0 / np.linalg.norm(X[j] - X[i]) for i, j in edges]
        else:
            weights = [1.0 for e in edges]
        L = edges_to_laplacian(edges, weights)
        valuations = BuildTreeTopology.laplacian_to_fiedler(L)
    else:
        valuations = [0 for p in points]
    # optionally negate every valuation
    if fs.flip:
        valuations = [-v for v in valuations]
    else:
        valuations = list(valuations)
    colors = valuations_to_colors(valuations)
    # draw the image
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    info = ImageInfo(
            fs.total_width, fs.total_height,
            fs.black, show_labels, fs.border, ext)
    try:
        return get_image_string(points, edges, colors, info)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def get_subtree_messages(D, eigensplit, ordered_tip_names):
    """
    Report the Fiedler vector of each side of a split, one section per side.
    @param D: the matrix of pairwise distances among tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to D
    @return: a multi-line string
    """
    out = StringIO()
    ntips = len(D)
    all_labels = set(range(ntips))
    # every tip starts out in its own singleton label set
    singleton_label_sets = [set([label]) for label in range(ntips)]
    for subtree_number, child in enumerate(eigensplit, 1):
        # restrict the distance matrix and the label sets to this side
        child_indices = sorted(child)
        D_child = MatrixUtil.get_principal_submatrix(D, child_indices)
        child_label_sets = SchurAlgebra.vdelete(
            singleton_label_sets, all_labels - child)
        v_child = BuildTreeTopology.edm_to_fiedler(D_child)
        print >> out, 'the Fiedler split of Schur complements of subtree', subtree_number
        for label_set, value in zip(child_label_sets, v_child):
            print >> out, label_set_to_string(label_set, ordered_tip_names), ':', value
        print >> out
    return out.getvalue().strip()
try: D = sample_distance_matrix(tree, sequence_length) except InfiniteDistanceError as e: return incr_attribute(attribute_array, 'nsamples.rejected.inf') except ZeroDistanceError as e: return incr_attribute(attribute_array, 'nsamples.rejected.zero') except BuildTreeTopology.InvalidSpectralSplitException, e: return incr_attribute(attribute_array, 'nsamples.rejected.fail') # see if the top down reconstruction was successful try: splitter = BuildTreeTopology.split_using_eigenvector_with_nj_fallback if nj_like: updater = BuildTreeTopology.update_generalized_nj else: updater = BuildTreeTopology.update_using_laplacian all_spectral_splits = BuildTreeTopology.get_splits( D, splitter, updater) top_down_success = (all_spectral_splits == true_splits) except BuildTreeTopology.InvalidSpectralSplitException, e: return incr_attribute(attribute_array, 'nsamples.rejected.fail') # at this point the sample is accepted incr_attribute(attribute_array, 'nsamples.accepted') # determine whether or not the distance matrix is Atteson with respect to the tree if BuildTreeTopology.is_atteson(tree, D): incr_attribute(attribute_array, 'nsamples.accepted.atteson') # see if the bottom up reconstruction was successful nj_splits = BuildTreeTopology.get_splits(D, BuildTreeTopology.split_nj, BuildTreeTopology.update_nj) nj_success = (nj_splits == true_splits) # note the joint results of the two reconstruction methods if top_down_success and nj_success: incr_attribute(attribute_array, 'nsuccesses.both')
def get_standard_response(fs):
    """
    Build the plain text report about the spectral splits of the pruned tree.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    out = StringIO()
    # pair each domain level set of names with its printable name
    clade_name_pairs = (
        (archaea_names, 'archaea'),
        (bacteria_names, 'bacteria'),
        (eukaryota_names, 'eukaryota'))
    # summarize the original data
    print >> out, 'data summary before removing branches with zero length:'
    for clade, clade_name in clade_name_pairs:
        print >> out, len(clade), clade_name, 'names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # prune the full tree and list its tips
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    # summarize the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # show the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out
    # get the primary split from the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report which clades touch each side of the primary split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # do one secondary split per side by merging the opposite side
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    for index_complement in (right_indices, left_indices):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
        # translate the indices of the reduced matrix back to tip names
        left_sublabels = set().union(
            *(next_label_sets[i] for i in left_subindices))
        right_sublabels = set().union(
            *(next_label_sets[i] for i in right_subindices))
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        print >> out, 'clade intersections with a subsplit:'
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                print >> out, 'the left side intersects', clade_name
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                print >> out, 'the right side intersects', clade_name
        print >> out
    # show debug info
    for clade, clade_name in clade_name_pairs:
        print >> out, clade_name, 'names:'
        print >> out, '\n'.join(sorted(clade))
        print >> out
    # return the response
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
def get_standard_response(fs):
    """
    Build the plain text report about the spectral splits of the pruned tree.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    out = StringIO()
    # pair each domain level set of names with its printable name
    clade_name_pairs = (
        (archaea_names, 'archaea'),
        (bacteria_names, 'bacteria'),
        (eukaryota_names, 'eukaryota'))
    # summarize the original data
    print >> out, 'data summary before removing branches with zero length:'
    for clade, clade_name in clade_name_pairs:
        print >> out, len(clade), clade_name, 'names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # prune the full tree and list its tips
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    # summarize the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # show the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out
    # get the primary split from the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report which clades touch each side of the primary split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # do one secondary split per side by merging the opposite side
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    for index_complement in (right_indices, left_indices):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
        # translate the indices of the reduced matrix back to tip names
        left_sublabels = set().union(
            *(next_label_sets[i] for i in left_subindices))
        right_sublabels = set().union(
            *(next_label_sets[i] for i in right_subindices))
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        print >> out, 'clade intersections with a subsplit:'
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                print >> out, 'the left side intersects', clade_name
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                print >> out, 'the right side intersects', clade_name
        print >> out
    # show debug info
    for clade, clade_name in clade_name_pairs:
        print >> out, clade_name, 'names:'
        print >> out, '\n'.join(sorted(clade))
        print >> out
    # return the response
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
def get_verbose_summary(self):
    """
    Describe the primary split of the pruned tree and both secondary splits.
    @return: a multiline string
    """
    out = StringIO()
    # show the number of taxa in various domains
    print >> out, self._get_name_summary()
    print >> out
    # show the pruned full tree
    formatted_tree_string = NewickIO.get_narrow_newick_string(
        self.pruned_tree, 120)
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, formatted_tree_string
    print >> out
    # get the primary split from the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    print >> out
    # report the domains found on each side of the primary split
    side_names = [
        set(self.pruned_names[i] for i in side) for side in eigensplit]
    print >> out, 'domains represented by each side of the primary split:'
    for position, side_label in enumerate(('left', 'right')):
        domains = self._get_domains(side_names[position])
        print >> out, 'the %s side has:\t' % side_label, ', '.join(domains)
    print >> out
    # do one secondary split per side by merging the opposite side
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
    for index_complement in (right_indices, left_indices):
        L_secondary = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v)
        # translate the indices of the reduced matrix back to tip names
        left_sublabels = set().union(
            *(next_label_sets[i] for i in left_subindices))
        right_sublabels = set().union(
            *(next_label_sets[i] for i in right_subindices))
        left_subnames = set(self.pruned_names[i] for i in left_sublabels)
        right_subnames = set(self.pruned_names[i] for i in right_sublabels)
        print >> out, 'domains represented by a subsplit:'
        print >> out, 'the left side has:\t', ', '.join(
            self._get_domains(left_subnames))
        print >> out, 'the right side has:\t', ', '.join(
            self._get_domains(right_subnames))
        print >> out
    # return the multiline string
    return out.getvalue().strip()
def _do_analysis(self, use_generalized_nj):
    """
    Do some splits of the tree.
    The primary split is expected to isolate the bacteria,
    and the secondary split is expected to isolate the eukaryota.
    The results are stored in self.first_split_object
    and in self.second_split_object.
    @param use_generalized_nj: True if we use an old method of outgrouping
    @raise HandlingError: if bacteria appear on both sides or on neither
    side of the first split, or if eukaryota appear on both sides of the
    second split
    """
    # define the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
    # get the primary split of the criterion matrix
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # assert that the first split cleanly separates the bacteria from the rest
    left_indices, right_indices = eigensplit
    left_domains = self._get_domains(
        [self.pruned_names[x] for x in left_indices])
    right_domains = self._get_domains(
        [self.pruned_names[x] for x in right_indices])
    if ('bacteria' in left_domains) and ('bacteria' in right_domains):
        raise HandlingError('bacteria were not defined by the first split')
    # now we have enough info to define the first supplementary csv file
    self.first_split_object = SupplementarySpreadsheetObject(
        self.pruned_names, L, v)
    # define the bacteria indices vs the non-bacteria indices for the second split
    if 'bacteria' in left_domains:
        bacteria_indices = left_indices
    elif 'bacteria' in right_domains:
        bacteria_indices = right_indices
    else:
        # Without this branch the name bacteria_indices would be unbound
        # and the method would fail later with an unrelated looking error.
        raise HandlingError(
            'bacteria were not found on either side of the first split')
    # get the secondary split of interest
    if use_generalized_nj:
        D_secondary = BuildTreeTopology.update_generalized_nj(
            D, bacteria_indices)
        L_secondary = Euclid.edm_to_laplacian(D_secondary)
    else:
        L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
    # the label sets are needed whichever way the matrix was reduced
    full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
    next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
    v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
    eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(v_secondary)
    left_subindices, right_subindices = eigensplit_secondary
    # name each row of the reduced matrix;
    # a label set with more than one label is the merged bacteria group
    pruned_names_secondary = []
    for label_set in next_label_sets:
        if len(label_set) == 1:
            (label,) = label_set
            pruned_names_secondary.append(self.pruned_names[label])
        else:
            pruned_names_secondary.append('all-bacteria')
    # assert that the second split cleanly separates the eukaryota from the rest
    left_subdomains = self._get_domains(
        [pruned_names_secondary[x] for x in left_subindices])
    right_subdomains = self._get_domains(
        [pruned_names_secondary[x] for x in right_subindices])
    if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
        raise HandlingError(
            'eukaryota were not defined by the second split')
    # now we have enough info to define the second supplementary csv file
    self.second_split_object = SupplementarySpreadsheetObject(
        pruned_names_secondary, L_secondary, v_secondary)
def process(ntaxa, nseconds, seqlen, nsamples, branch_length_sampler, use_pbar): """ @param ntaxa: the number of taxa per tree @param nseconds: stop after this many seconds @param seqlen: use this sequence length @param nsamples: stop after this many samples per sequence length @param branch_length_sampler: this function samples branch lengths independently @param use_pbar: True iff a progress bar should be used @return: a multi-line string of the contents of an R table """ # initialize the global rejection counts nrejected_zero = 0 nrejected_inf = 0 nrejected_fail = 0 naccepted = 0 # Initialize the accumulation matrix. # The rows specify the size of the smaller side of the initial split. # The columns specify the compatibility status of the split. nsmall_sizes = (ntaxa / 2) + 1 accum = np.zeros((nsmall_sizes, 2), dtype=np.int) # Repeatedly analyze samples. # We might have to stop early if we run out of time or if ctrl-c is pressed. # If we have to stop early, then show the results of the progress so far. termination_reason = 'no reason for termination was given' start_time = time.time() pbar = Progress.Bar(nsamples) if use_pbar else None try: for sample_index in range(nsamples): # keep trying to get an accepted sample while True: # check the time if nseconds and time.time() - start_time > nseconds: raise TimeoutError() # first sample a tree and get its set of informative splits tree = TreeSampler.sample_agglomerated_tree(ntaxa) true_splits = tree.get_nontrivial_splits() # sample the branch lengths for branch in tree.get_branches(): branch.length = branch_length_sampler() # Attempt to sample a distance matrix. # If the sample was rejected then note the reason and go back to the drawing board. try: D = sample_distance_matrix(tree, seqlen) except InfiniteDistanceError as e: nrejected_inf += 1 continue except ZeroDistanceError as e: nrejected_zero += 1 continue # Attempt to estimate the primary split of the tree from the distance matrix. 
# If there was a technical failure then note it and go back to the drawing board. # Otherwise note the compatibility and balance of the split. try: eigensplit = BuildTreeTopology.split_using_eigenvector(D) small_size = min(len(side) for side in eigensplit) if eigensplit in true_splits: compatibility = 1 else: compatibility = 0 except BuildTreeTopology.DegenerateSplitException, e: small_size = 0 compatibility = 1 except BuildTreeTopology.InvalidSpectralSplitException, e: nrejected_fail += 1 continue
def get_response_content(fs):
    """
    Look for random graphs that are split incompatibly through a Schur complement.
    For each vertex of each connected random graph, the vertex is removed
    by a Schur complement, the reduced graph is cut,
    and the cut is extended to the full graph in both possible ways.
    @param fs: an object whose attributes hold the user supplied options
    @return: a multi-line string
    """
    out = StringIO()
    unconnected_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    # try to make some graphs
    for graph_index in range(fs.ngraphs):
        G = erdos_renyi(fs.nvertices, fs.pedge)
        if not is_connected(G):
            unconnected_count += 1
            continue
        # add interesting edge weights
        add_exponential_weights(G)
        # turn the adjacency matrix into a laplacian matrix
        L = Euclid.adjacency_to_laplacian(G)
        for v in range(fs.nvertices):
            # entry k is the full graph index of vertex k of the reduced graph
            big_indices = [i for i in range(fs.nvertices) if i != v]
            # take the schur complement with respect to the given vertex
            L_reduced = get_single_element_schur_complement(L, v)
            assert len(L_reduced) == len(L) - 1
            # get the loadings of the vertices of the reduced graph
            if fs.fiedler_cut:
                Y_reduced = BuildTreeTopology.laplacian_to_fiedler(L_reduced)
            elif fs.random_cut:
                Y_reduced = get_random_vector(L_reduced)
            assert len(Y_reduced) == len(L_reduced)
            # try a negative and a positive valuation for the removed vertex
            found_valid_split = False
            for augmented_loading in (-1.0, 1.0):
                # get the augmented split vector for this assignment
                Y_full = [0] * len(G)
                for i_reduced, loading in enumerate(Y_reduced):
                    Y_full[big_indices[i_reduced]] = loading
                Y_full[v] = augmented_loading
                assert len(Y_full) == len(G)
                # get the two graphs defined by the split
                subgraph_a, subgraph_b = gen_subgraphs(G, Y_full)
                # if the subgraphs are both connected then the split is valid
                if is_connected(subgraph_a) and is_connected(subgraph_b):
                    found_valid_split = True
            # if a valid split was not found then show the matrix
            if found_valid_split:
                valid_split_count += 1
            else:
                print >> out, 'Found a matrix that was split incompatibly by a cut of its schur complement!'
                print >> out, 'matrix:'
                print >> out, MatrixUtil.m_to_string(G)
                print >> out, 'index that was removed:', v
                invalid_split_count += 1
    # show the number of connected and of unconnected graphs
    print >> out, 'this many random graphs were connected:', fs.ngraphs - unconnected_count
    print >> out, 'this many random graphs were not connected:', unconnected_count
    print >> out, 'this many splits were valid:', valid_split_count
    print >> out, 'this many splits were invalid:', invalid_split_count
    # return the result
    return out.getvalue()
class TreeSearch:
    """
    This is a virtual base class.
    A subclass is expected to set the tree, the desired primary split,
    and the map from node id to index before the search is started.
    """

    def __init__(self):
        # boolean requirements defined by the user
        self.informative_children = None
        self.force_difference = None
        self.informative_full_split = None
        self.invalid_dendrogram = None
        # search options defined by the subclass
        self.tree = None
        self.desired_primary_split = None
        self.id_to_index = None
        # initialize the counts that are tracked for bookkeeping
        self.aug_split_collision_count = 0
        self.aug_split_degenerate_count = 0
        self.error_primary_split_count = 0
        self.invalid_primary_split_count = 0
        self.degenerate_primary_split_count = 0
        self.undesired_primary_split_count = 0
        self.desired_primary_split_count = 0
        self.uninformative_child_count = 0
        self.informative_child_count = 0
        self.valid_dendrogram_count = 0
        self.success_count = 0

    def is_initialized(self):
        """
        @return: True iff every required option has been set
        """
        required_data = [
            self.informative_children,
            self.force_difference,
            self.informative_full_split,
            self.invalid_dendrogram,
            self.tree,
            self.desired_primary_split,
            self.id_to_index]
        # Use identity instead of equality so that members which
        # overload comparison (for example arrays) cannot break the check.
        return all(x is not None for x in required_data)

    def get_result_text(self):
        """
        @return: a multi-line string of text
        """
        lines = []
        if self.force_difference or self.informative_full_split:
            lines.extend([
                'full graph split stats:',
                '%s full graph splits collided with the desired primary split' % (
                    self.aug_split_collision_count),
                '%s full graph splits were degenerate' % (
                    self.aug_split_degenerate_count),
                ''])
        lines.extend([
            'primary split stats:',
            '%s errors in finding the primary split (should be 0)' % (
                self.error_primary_split_count),
            '%s invalid primary splits (should be 0)' % (
                self.invalid_primary_split_count),
            '%s degenerate primary splits' % (
                self.degenerate_primary_split_count),
            '%s primary splits were not the target split' % (
                self.undesired_primary_split_count),
            '%s primary splits were the target split' % (
                self.desired_primary_split_count),
            ''])
        if self.informative_children:
            lines.extend([
                'secondary split stats:',
                '%s samples had at least one uninformative child tree' % (
                    self.uninformative_child_count),
                '%s samples had two informative child trees' % (
                    self.informative_child_count),
                ''])
        if self.invalid_dendrogram:
            lines.extend([
                'naive dendrogram stats:',
                '%s naive dendrograms were valid' % (
                    self.valid_dendrogram_count),
                ''])
        return '\n'.join(lines).strip()

    def do_search(self, nseconds, sampling_function):
        """
        Repeatedly sample branch lengths until the tree meets the requirements.
        The bookkeeping counts are updated as samples are rejected.
        @param nseconds: allowed search time or None
        @param sampling_function: a function that samples a branch length
        @return: True if a tree was found that met the criteria
        @raise RuntimeError: if the search was not initialized
        or if the primary split is not a split of the tree
        """
        if not self.is_initialized():
            raise RuntimeError('the search was not sufficiently initialized')
        true_splits = self.tree.get_nontrivial_splits()
        start_time = time.time()
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                return False
            # assign new sampled branch lengths
            for branch in self.tree.get_branches():
                branch.length = sampling_function()
            # get the distance matrix so we can use a library function to get the split
            D = np.array(self.tree.get_distance_matrix())
            ntips = len(D)
            # get the Laplacian matrix of the full tree
            # and the corresponding Fiedler split of the leaves
            if self.force_difference or self.informative_full_split:
                A_aug = np.array(
                    self.tree.get_weighted_adjacency_matrix(self.id_to_index))
                L_aug = Euclid.adjacency_to_laplacian(A_aug)
                v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
                left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(
                    v_aug)
                # keep only the indices below ntips;
                # a set makes each membership test constant time
                tip_indices = frozenset(range(ntips))
                left = [x for x in left_aug if x in tip_indices]
                right = [x for x in right_aug if x in tip_indices]
                leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
                if self.force_difference:
                    if leaf_eigensplit_aug == self.desired_primary_split:
                        self.aug_split_collision_count += 1
                        continue
                if self.informative_full_split:
                    if min(len(s) for s in leaf_eigensplit_aug) < 2:
                        self.aug_split_degenerate_count += 1
                        continue
            # get the eigensplit
            try:
                eigensplit = BuildTreeTopology.split_using_eigenvector(D)
            except BuildTreeTopology.DegenerateSplitException:
                self.degenerate_primary_split_count += 1
                continue
            except BuildTreeTopology.InvalidSpectralSplitException:
                self.error_primary_split_count += 1
                continue
            if eigensplit not in true_splits:
                # Count the failure before reporting it,
                # and name the tree through self (there is no local 'tree').
                self.invalid_primary_split_count += 1
                raise RuntimeError(
                    'INVALID SPLIT:' + self.tree.get_newick_string())
            if eigensplit != self.desired_primary_split:
                self.undesired_primary_split_count += 1
                continue
            self.desired_primary_split_count += 1
            # check the splits of the two child trees
            degenerate_subsplit_count = 0
            L = Euclid.edm_to_laplacian(D)
            for side in eigensplit:
                L_child = SchurAlgebra.mmerge(L, side)
                v = BuildTreeTopology.laplacian_to_fiedler(L_child)
                child_eigensplit = BuildTreeTopology.eigenvector_to_split(v)
                if min(len(s) for s in child_eigensplit) < 2:
                    degenerate_subsplit_count += 1
            if degenerate_subsplit_count:
                self.uninformative_child_count += 1
            else:
                self.informative_child_count += 1
            if self.informative_children and degenerate_subsplit_count:
                continue
            # check the dendrogram
            if self.invalid_dendrogram:
                labels = range(len(D))
                hierarchy = Dendrogram.get_hierarchy(
                    D, Dendrogram.spectral_split, labels)
                dendrogram_splits = set(
                    Dendrogram.hierarchy_to_nontrivial_splits(hierarchy))
                if dendrogram_splits == true_splits:
                    self.valid_dendrogram_count += 1
                    continue
            # the tree has met all of the requirements
            return True
try: D = sample_distance_matrix(tree, sequence_length) except InfiniteDistanceError as e: return incr_attribute(attribute_array, 'nsamples.rejected.inf') except ZeroDistanceError as e: return incr_attribute(attribute_array, 'nsamples.rejected.zero') except BuildTreeTopology.InvalidSpectralSplitException, e: return incr_attribute(attribute_array, 'nsamples.rejected.fail') # see if the top down reconstruction was successful try: splitter = BuildTreeTopology.split_using_eigenvector_with_nj_fallback if nj_like: updater = BuildTreeTopology.update_generalized_nj else: updater = BuildTreeTopology.update_using_laplacian all_spectral_splits = BuildTreeTopology.get_splits(D, splitter, updater) top_down_success = (all_spectral_splits == true_splits) except BuildTreeTopology.InvalidSpectralSplitException, e: return incr_attribute(attribute_array, 'nsamples.rejected.fail') # at this point the sample is accepted incr_attribute(attribute_array, 'nsamples.accepted') # determine whether or not the distance matrix is Atteson with respect to the tree if BuildTreeTopology.is_atteson(tree, D): incr_attribute(attribute_array, 'nsamples.accepted.atteson') # see if the bottom up reconstruction was successful nj_splits = BuildTreeTopology.get_splits(D, BuildTreeTopology.split_nj, BuildTreeTopology.update_nj) nj_success = (nj_splits == true_splits) # note the joint results of the two reconstruction methods if top_down_success and nj_success: incr_attribute(attribute_array, 'nsuccesses.both') elif (not top_down_success) and (not nj_success):