def get_response_content(fs):
    """
    Compare a query tree to a reference tree and report the loss as text.
    @param fs: an object with the attributes query, reference,
    uniform, weighted, and normalize
    @return: the loss followed by a newline, or a failure notice
    when the normalizing constant is zero
    """
    # parse both newick strings
    query = NewickIO.parse(fs.query, FelTree.NewickTree)
    reference = NewickIO.parse(fs.reference, FelTree.NewickTree)
    # the numerator is the raw loss under the requested loss function
    if fs.uniform:
        numerator = TreeComparison.get_split_distance(query, reference)
    elif fs.weighted:
        numerator = TreeComparison.get_weighted_split_distance(
                query, reference)
    # the denominator is trivial unless normalization was requested
    if not fs.normalize:
        denominator = 1
    elif fs.uniform:
        denominator = float(
                TreeComparison.get_nontrivial_split_count(reference))
    elif fs.weighted:
        denominator = float(
                TreeComparison.get_weighted_split_count(reference))
    # a zero normalizing constant cannot be divided by
    if not denominator:
        return 'normalization failed\n'
    return str(numerator / denominator) + '\n'
def test_mito_matrix(self):
    """
    Check the Q matrix and the reconstructed tree for the mito data set.
    The observed Q matrix is compared entrywise to the expected Q matrix,
    and the tree built from the distance matrix is compared to the
    expected tree by its partitions and their branch lengths.
    """
    D = g_mito_matrix
    n = len(D)
    observed_Q = get_Q_matrix(D)
    expected_Q = g_mito_matrix_q
    # the diagonal elements of the observed Q matrix should be exactly zero
    for i, row in enumerate(observed_Q):
        self.assertEqual(row[i], 0)
    # the observed Q matrix should approximately equal the expected one
    abs_tol = .001
    for i in range(n):
        for j in range(n):
            abs_delta = abs(observed_Q[i][j] - expected_Q[i][j])
            # assertTrue replaces the deprecated failUnless alias
            self.assertTrue(abs_delta < abs_tol)
    # use neighbor joining to reconstruct the tree
    observed_tree = make_tree(D, g_mito_states)
    # load the expected tree
    expected_tree = NewickIO.parse(g_mito_tree_string, FelTree.NewickTree)
    # for each tree get the induced partitions and their branch lengths
    observed_pairs = TreeComparison.get_partitions_and_branch_lengths(
            observed_tree)
    expected_pairs = TreeComparison.get_partitions_and_branch_lengths(
            expected_tree)
    # the number of partitions should be the same
    self.assertEqual(len(observed_pairs), len(expected_pairs))
    # the partitions should be the same
    observed_partitions = set(part for part, length in observed_pairs)
    expected_partitions = set(part for part, length in expected_pairs)
    observed_only = observed_partitions - expected_partitions
    expected_only = expected_partitions - observed_partitions
    lines = [
            'observed partitions include: ' + str(observed_only),
            'expected partitions include: ' + str(expected_only)]
    self.assertEqual(
            observed_partitions, expected_partitions, '\n'.join(lines))
    # corresponding partitions should have the same lengths
    observed_part_to_length = dict(observed_pairs)
    expected_part_to_length = dict(expected_pairs)
    abs_tol = .00001
    lines = []
    for part in observed_partitions:
        observed_length = observed_part_to_length[part]
        expected_length = expected_part_to_length[part]
        abs_delta = abs(observed_length - expected_length)
        if abs_delta > abs_tol:
            lines.append('partition:' + str(part))
            lines.append('observed branch length:' + str(observed_length))
            lines.append('expected branch length:' + str(expected_length))
    # An empty message means that no branch length was out of tolerance.
    # assertFalse replaces the deprecated failIf alias.
    error_message = '\n'.join(lines)
    self.assertFalse(error_message, error_message)
def __init__(self, tree, epsilon):
    """
    Attempt to reconstruct the tree from the eigensystem of its Gower matrix.
    The outcome is recorded in the flags is_negligible, is_incomplete,
    and is_conflicting. The reconstructed_tree attribute stays None
    when NegligibleError or IncompleteError ends the reconstruction.
    @param tree: a newick tree in the felsenstein-inspired format
    @param epsilon: determines whether loadings are considered negligible
    """
    # clear some flags that describe events that occur during reconstruction
    self.is_negligible = False
    self.is_incomplete = False
    self.is_conflicting = False
    # define the trees
    self.tree = tree
    self.reconstructed_tree = None
    # set the threshold for loading negligibility
    self.epsilon = epsilon
    # define some arbitrary ordering of tip names
    self.ordered_names = [node.get_name() for node in tree.gen_tips()]
    # get the distance matrix with respect to this ordering
    D = tree.get_distance_matrix(self.ordered_names)
    # get the Gower doubly centered matrix
    G = MatrixUtil.double_centered(np.array(D))
    # get the eigendecomposition of the Gower matrix
    eigenvalues, eigenvector_transposes = np.linalg.eigh(G)
    eigenvectors = eigenvector_transposes.T
    # Order the eigensystem by decreasing eigenvalue magnitude.
    # Sort on the magnitude alone; comparing whole (magnitude, vector)
    # pairs would fall through to comparing the numpy vectors when two
    # magnitudes are equal, and that comparison raises ValueError.
    eigensystem = [(abs(w), v) for w, v in zip(eigenvalues, eigenvectors)]
    eigensystem.sort(key=lambda pair: pair[0], reverse=True)
    self.sorted_eigensystem = eigensystem
    # build the tree recursively using the sorted eigensystem
    indices = set(range(len(self.ordered_names)))
    try:
        # try to reconstruct the tree
        root = self._build_tree(indices, 0)
        root.set_branch_length(None)
        output_tree = Newick.NewickTree(root)
        # convert the tree to the FelTree format
        newick_string = NewickIO.get_newick_string(output_tree)
        self.reconstructed_tree = NewickIO.parse(
                newick_string, FelTree.NewickTree)
    except NegligibleError:
        self.is_negligible = True
    except IncompleteError:
        self.is_incomplete = True
    else:
        # compare the splits defined by the reconstructed tree
        # to splits in the original tree
        expected_partitions = TreeComparison.get_nontrivial_partitions(
                self.tree)
        observed_partitions = TreeComparison.get_nontrivial_partitions(
                self.reconstructed_tree)
        # any reconstructed split that the original lacks is a conflict
        invalid_partitions = observed_partitions - expected_partitions
        if invalid_partitions:
            self.is_conflicting = True
def __init__(self, tree, epsilon):
    """
    Attempt to reconstruct the tree from the eigensystem of its Gower matrix.
    The outcome is recorded in the flags is_negligible, is_incomplete,
    and is_conflicting. The reconstructed_tree attribute stays None
    when NegligibleError or IncompleteError ends the reconstruction.
    @param tree: a newick tree in the felsenstein-inspired format
    @param epsilon: determines whether loadings are considered negligible
    """
    # clear some flags that describe events that occur during reconstruction
    self.is_negligible = False
    self.is_incomplete = False
    self.is_conflicting = False
    # define the trees
    self.tree = tree
    self.reconstructed_tree = None
    # set the threshold for loading negligibility
    self.epsilon = epsilon
    # define some arbitrary ordering of tip names
    self.ordered_names = [node.get_name() for node in tree.gen_tips()]
    # get the distance matrix with respect to this ordering
    D = tree.get_distance_matrix(self.ordered_names)
    # get the Gower doubly centered matrix
    G = MatrixUtil.double_centered(np.array(D))
    # get the eigendecomposition of the Gower matrix
    eigenvalues, eigenvector_transposes = np.linalg.eigh(G)
    eigenvectors = eigenvector_transposes.T
    # Order the eigensystem by decreasing eigenvalue magnitude.
    # Sort on the magnitude alone; comparing whole (magnitude, vector)
    # pairs would fall through to comparing the numpy vectors when two
    # magnitudes are equal, and that comparison raises ValueError.
    eigensystem = [(abs(w), v) for w, v in zip(eigenvalues, eigenvectors)]
    eigensystem.sort(key=lambda pair: pair[0], reverse=True)
    self.sorted_eigensystem = eigensystem
    # build the tree recursively using the sorted eigensystem
    indices = set(range(len(self.ordered_names)))
    try:
        # try to reconstruct the tree
        root = self._build_tree(indices, 0)
        root.set_branch_length(None)
        output_tree = Newick.NewickTree(root)
        # convert the tree to the FelTree format
        newick_string = NewickIO.get_newick_string(output_tree)
        self.reconstructed_tree = NewickIO.parse(
                newick_string, FelTree.NewickTree)
    except NegligibleError:
        self.is_negligible = True
    except IncompleteError:
        self.is_incomplete = True
    else:
        # compare the splits defined by the reconstructed tree
        # to splits in the original tree
        expected_partitions = TreeComparison.get_nontrivial_partitions(
                self.tree)
        observed_partitions = TreeComparison.get_nontrivial_partitions(
                self.reconstructed_tree)
        # any reconstructed split that the original lacks is a conflict
        invalid_partitions = observed_partitions - expected_partitions
        if invalid_partitions:
            self.is_conflicting = True
def run(self, distance_matrices, ordered_names):
    """
    This function stores the losses for each reconstruction.
    For every distance matrix a tree is built and compared to the
    original tree, and the error count, the loss value, and their
    maximum possible values are appended to the lists kept on the object.
    The start and stop times of the run are recorded as well.
    @param distance_matrices: a sequence of distance matrices
    @param ordered_names: order of taxa in the distance matrix
    @raise HandlingError: if the object has already been run,
    if no distance matrices are provided, if the names do not match
    the tips of the original tree, or if a tree cannot be built
    """
    if self.start_time is not None:
        msg = 'each simulation object should be run only once'
        raise HandlingError(msg)
    if not distance_matrices:
        # this used to misspell the exception name and so raised NameError
        raise HandlingError('no distance matrices were provided')
    tip_name_set = set(node.name for node in self.original_tree.gen_tips())
    if tip_name_set != set(ordered_names):
        raise HandlingError('leaf name mismatch')
    self.start_time = time.time()
    # Define the reference tree and its maximum cost
    # under different loss functions.
    reference_tree = self.original_tree
    max_error_count = TreeComparison.get_nontrivial_split_count(
            reference_tree)
    max_loss_value = TreeComparison.get_weighted_split_count(
            reference_tree)
    for distance_matrix in distance_matrices:
        # create the tree builder
        tree_builder = NeighborhoodJoining.TreeBuilder(
                distance_matrix, ordered_names, self.splitter)
        # set parameters of the validating tree builder
        tree_builder.set_fallback_name(self.fallback_name)
        # build the tree
        try:
            query_tree = tree_builder.build()
        except NeighborhoodJoining.NeighborhoodJoiningError as e:
            raise HandlingError(e)
        # Note the number and weight of partition errors
        # during the reconstruction.
        error_count = TreeComparison.get_split_distance(
                query_tree, reference_tree)
        loss_value = TreeComparison.get_weighted_split_distance(
                query_tree, reference_tree)
        # make sure that the summary is internally consistent
        assert error_count <= max_error_count, (error_count, max_error_count)
        assert loss_value <= max_loss_value, (loss_value, max_loss_value)
        # save the reconstruction characteristics to use later
        self.error_counts.append(error_count)
        self.loss_values.append(loss_value)
        self.max_error_counts.append(max_error_count)
        self.max_loss_values.append(max_loss_value)
    self.stop_time = time.time()
def test_mito_matrix(self):
    """
    Check the Q matrix and the reconstructed tree for the mito data set.
    The observed Q matrix is compared entrywise to the expected Q matrix,
    and the tree built from the distance matrix is compared to the
    expected tree by its partitions and their branch lengths.
    """
    D = g_mito_matrix
    n = len(D)
    observed_Q = get_Q_matrix(D)
    expected_Q = g_mito_matrix_q
    # the diagonal elements of the observed Q matrix should be exactly zero
    for i, row in enumerate(observed_Q):
        self.assertEqual(row[i], 0)
    # the observed Q matrix should approximately equal the expected one
    abs_tol = .001
    for i in range(n):
        for j in range(n):
            abs_delta = abs(observed_Q[i][j] - expected_Q[i][j])
            # assertTrue replaces the deprecated failUnless alias
            self.assertTrue(abs_delta < abs_tol)
    # use neighbor joining to reconstruct the tree
    observed_tree = make_tree(D, g_mito_states)
    # load the expected tree
    expected_tree = NewickIO.parse(g_mito_tree_string, FelTree.NewickTree)
    # for each tree get the induced partitions and their branch lengths
    observed_pairs = TreeComparison.get_partitions_and_branch_lengths(
            observed_tree)
    expected_pairs = TreeComparison.get_partitions_and_branch_lengths(
            expected_tree)
    # the number of partitions should be the same
    self.assertEqual(len(observed_pairs), len(expected_pairs))
    # the partitions should be the same
    observed_partitions = set(part for part, length in observed_pairs)
    expected_partitions = set(part for part, length in expected_pairs)
    observed_only = observed_partitions - expected_partitions
    expected_only = expected_partitions - observed_partitions
    lines = [
            'observed partitions include: ' + str(observed_only),
            'expected partitions include: ' + str(expected_only)]
    self.assertEqual(
            observed_partitions, expected_partitions, '\n'.join(lines))
    # corresponding partitions should have the same lengths
    observed_part_to_length = dict(observed_pairs)
    expected_part_to_length = dict(expected_pairs)
    abs_tol = .00001
    lines = []
    for part in observed_partitions:
        observed_length = observed_part_to_length[part]
        expected_length = expected_part_to_length[part]
        abs_delta = abs(observed_length - expected_length)
        if abs_delta > abs_tol:
            lines.append('partition:' + str(part))
            lines.append('observed branch length:' + str(observed_length))
            lines.append('expected branch length:' + str(expected_length))
    # An empty message means that no branch length was out of tolerance.
    # assertFalse replaces the deprecated failIf alias.
    error_message = '\n'.join(lines)
    self.assertFalse(error_message, error_message)
def __call__(self, tree):
    """
    Check whether the sign split of the tips is a partition of the tree.
    A failing tree is reported to the counterexample file and to stdout,
    and the counterexample counter is incremented.
    The counterexample file is opened the first time it is needed.
    @param tree: a tree whose tips are named
    @return: always False, so that the search is never stopped
    """
    # get the partitions implied by the tree
    valid_partitions = TreeComparison.get_partitions(tree)
    # Get the partition implied by the Fiedler split
    # of the graph derived from the tree.
    tip_nodes = list(tree.gen_tips())
    D = tree.get_partial_distance_matrix([id(node) for node in tip_nodes])
    y = get_vector(D).tolist()
    # tips with a positive entry go on one side and the rest on the other
    name_selection = frozenset(
            node.get_name() for node, elem in zip(tip_nodes, y) if elem > 0)
    name_complement = frozenset(
            node.get_name() for node, elem in zip(tip_nodes, y) if elem <= 0)
    name_partition = frozenset((name_selection, name_complement))
    if name_partition not in valid_partitions:
        # The partition is a frozenset, so it must be converted to a string
        # before joining; otherwise the join raises TypeError.
        msg = '\n'.join([
            'invalid partition found:',
            'tree:',
            NewickIO.get_newick_string(tree),
            'invalid partition:',
            str(name_partition)])
        if not self.fout:
            self.fout = open(self.counterexample_filename, 'wt')
        self.fout.write(msg + '\n')
        print(msg)
        self.ncounterexamples += 1
    # do not stop looking, even if a counterexample is found
    return False
def __call__(self, tree):
    """
    Check whether the sign split of the tips is a partition of the tree.
    A failing tree is reported to the counterexample file and to stdout,
    and the counterexample counter is incremented.
    The counterexample file is opened the first time it is needed.
    @param tree: a tree whose tips are named
    @return: always False, so that the search is never stopped
    """
    # get the partitions implied by the tree
    valid_partitions = TreeComparison.get_partitions(tree)
    # Get the partition implied by the Fiedler split
    # of the graph derived from the tree.
    tip_nodes = list(tree.gen_tips())
    D = tree.get_partial_distance_matrix([id(node) for node in tip_nodes])
    y = get_vector(D).tolist()
    # tips with a positive entry go on one side and the rest on the other
    name_selection = frozenset(
            node.get_name() for node, elem in zip(tip_nodes, y) if elem > 0)
    name_complement = frozenset(
            node.get_name() for node, elem in zip(tip_nodes, y) if elem <= 0)
    name_partition = frozenset((name_selection, name_complement))
    if name_partition not in valid_partitions:
        # The partition is a frozenset, so it must be converted to a string
        # before joining; otherwise the join raises TypeError.
        msg = '\n'.join([
            'invalid partition found:',
            'tree:',
            NewickIO.get_newick_string(tree),
            'invalid partition:',
            str(name_partition)])
        if not self.fout:
            self.fout = open(self.counterexample_filename, 'wt')
        self.fout.write(msg + '\n')
        print(msg)
        self.ncounterexamples += 1
    # do not stop looking, even if a counterexample is found
    return False
def get_response_content(fs):
    """
    Report trees whose principal coordinate split is not a valid partition.
    Each tree is split twice by the sign of its loadings: once using
    the distance matrix and once using the entrywise squared distances.
    @param fs: an object whose trees attribute has one newick tree per line
    @return: a text report ending with the error and counterexample counts
    @raise HandlingError: if a tree has fewer than four tips,
    an unlabeled tip, or a repeated tip label
    """
    # parse each tree and make sure that it meets the requirements
    trees = []
    for line in iterutils.stripped_lines(StringIO(fs.trees)):
        tree = NewickIO.parse(line, FelTree.NewickTree)
        names = [tip.get_name() for tip in tree.gen_tips()]
        ntips = len(names)
        if ntips < 4:
            raise HandlingError(
                    'expected at least four tips but found ' + str(ntips))
        if any(name is None for name in names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(names)) != ntips:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # begin the response
    out = StringIO()

    def sign_split(names, matrix):
        # partition the names according to the sign of their loadings
        loadings = get_principal_coordinate(matrix)
        nonneg = frozenset(
                tip for tip, v in zip(names, loadings) if v >= 0)
        neg = frozenset(
                tip for tip, v in zip(names, loadings) if v < 0)
        return frozenset([nonneg, neg])

    def report(headline, tree, part):
        # write a paragraph that describes one invalid partition
        out.write(headline + '\n')
        out.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
        out.write('invalid partition: %s\n' % (partition_to_string(part),))
        out.write('\n')

    # look at each tree
    nerrors = 0
    ncounterexamples = 0
    for tree in trees:
        valid_parts = TreeComparison.get_partitions(tree)
        names = [tip.get_name() for tip in tree.gen_tips()]
        D = np.array(tree.get_distance_matrix(names))
        # the partition implied by the correct formula should be valid
        part = sign_split(names, D)
        if part not in valid_parts:
            nerrors += 1
            report(
                    'error: a partition that was supposed to be valid '
                    'was found to be invalid', tree, part)
        # the incorrect formula uses the entrywise squared distances
        part = sign_split(names, D * D)
        if part not in valid_parts:
            ncounterexamples += 1
            report('found a counterexample!', tree, part)
    # finish with the summary
    out.write('errors found: %s\n' % (nerrors,))
    out.write('counterexamples found: %s\n' % (ncounterexamples,))
    return out.getvalue()
def main():
    """
    Search random trees for a split that is invalidated by squaring distances.
    Trees are sampled until the user interrupts the program.
    Each tree is split by the sign of its principal coordinate loadings,
    once using the distance matrix and once using the entrywise squared
    distances. Invalid splits are written to the file counterexamples.out
    and a summary is printed when the search is interrupted.
    """
    filename = 'counterexamples.out'
    # parameters of the random tree sampler
    n_base_leaves = 4
    n_expected_extra_leaves = 1
    expected_branch_length = 1

    def sign_split(names, matrix):
        # partition the names according to the sign of their loadings
        loadings = get_principal_coordinate(matrix)
        nonneg = frozenset(
                tip for tip, v in zip(names, loadings) if v >= 0)
        neg = frozenset(
                tip for tip, v in zip(names, loadings) if v < 0)
        return frozenset([nonneg, neg])

    def report(fout, headline, tree, part):
        # write a paragraph that describes one invalid partition
        fout.write(headline + '\n')
        fout.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
        fout.write('invalid partition: %s\n' % (partition_to_string(part),))
        fout.write('\n')

    # The context manager closes the report file however the search ends;
    # the file handle used to be left open.
    with open(filename, 'wt') as fout:
        print('Does monotonically transforming the pairwise leaf distances affect the compatibility')
        print('of the split found using principal coordinate analysis?')
        print('I am looking through random trees for a tree that is split incompatibly')
        print('when distances are squared.')
        print('Use control-c to stop the program when you get bored.')
        count = 0
        ncounterexamples = 0
        nerrors = 0
        try:
            while True:
                # the tree is counted as soon as its examination begins
                count += 1
                # get a random tree
                tree = TreeSampler.sample_tree(
                        n_base_leaves, n_expected_extra_leaves,
                        expected_branch_length)
                # get the set of valid partitions implied by the tree
                valid_parts = TreeComparison.get_partitions(tree)
                names = [tip.get_name() for tip in tree.gen_tips()]
                D = np.array(tree.get_distance_matrix(names))
                # the partition implied by the correct formula should be valid
                part = sign_split(names, D)
                if part not in valid_parts:
                    nerrors += 1
                    report(
                            fout,
                            'error: a partition that was supposed to be '
                            'valid was found to be invalid', tree, part)
                # the incorrect formula uses the entrywise squared distances
                part = sign_split(names, D * D)
                if part not in valid_parts:
                    ncounterexamples += 1
                    report(fout, 'found a counterexample!', tree, part)
        except KeyboardInterrupt:
            print('trees examined: %s' % (count,))
            print('errors: %s' % (nerrors,))
            print('counterexamples: %s' % (ncounterexamples,))
def run(self, distance_matrices, ordered_names):
    """
    This function stores the losses for each reconstruction.
    For every distance matrix a tree is built and compared to the
    original tree, and the error count, the loss value, and their
    maximum possible values are appended to the lists kept on the object.
    The start and stop times of the run are recorded as well.
    @param distance_matrices: a sequence of distance matrices
    @param ordered_names: order of taxa in the distance matrix
    @raise HandlingError: if the object has already been run,
    if no distance matrices are provided, if the names do not match
    the tips of the original tree, or if a tree cannot be built
    """
    if self.start_time is not None:
        msg = "each simulation object should be run only once"
        raise HandlingError(msg)
    if not distance_matrices:
        # this used to misspell the exception name and so raised NameError
        raise HandlingError("no distance matrices were provided")
    tip_name_set = set(node.name for node in self.original_tree.gen_tips())
    if tip_name_set != set(ordered_names):
        raise HandlingError("leaf name mismatch")
    self.start_time = time.time()
    # Define the reference tree and its maximum cost
    # under different loss functions.
    reference_tree = self.original_tree
    max_error_count = TreeComparison.get_nontrivial_split_count(
            reference_tree)
    max_loss_value = TreeComparison.get_weighted_split_count(
            reference_tree)
    for distance_matrix in distance_matrices:
        # create the tree builder
        tree_builder = NeighborhoodJoining.TreeBuilder(
                distance_matrix, ordered_names, self.splitter)
        # set parameters of the validating tree builder
        tree_builder.set_fallback_name(self.fallback_name)
        # build the tree
        try:
            query_tree = tree_builder.build()
        except NeighborhoodJoining.NeighborhoodJoiningError as e:
            raise HandlingError(e)
        # Note the number and weight of partition errors
        # during the reconstruction.
        error_count = TreeComparison.get_split_distance(
                query_tree, reference_tree)
        loss_value = TreeComparison.get_weighted_split_distance(
                query_tree, reference_tree)
        # make sure that the summary is internally consistent
        assert error_count <= max_error_count, (error_count, max_error_count)
        assert loss_value <= max_loss_value, (loss_value, max_loss_value)
        # save the reconstruction characteristics to use later
        self.error_counts.append(error_count)
        self.loss_values.append(loss_value)
        self.max_error_counts.append(max_error_count)
        self.max_loss_values.append(max_loss_value)
    self.stop_time = time.time()
def get_response_content(fs):
    """
    Report trees whose principal coordinate split is not a valid partition.
    Each tree is split twice by the sign of its loadings: once using
    the distance matrix and once using the entrywise squared distances.
    @param fs: an object whose trees attribute has one newick tree per line
    @return: a text report ending with the error and counterexample counts
    @raise HandlingError: if a tree has fewer than four tips,
    an unlabeled tip, or a repeated tip label
    """
    # parse each tree and make sure that it meets the requirements
    trees = []
    for line in iterutils.stripped_lines(StringIO(fs.trees)):
        tree = NewickIO.parse(line, FelTree.NewickTree)
        names = [tip.get_name() for tip in tree.gen_tips()]
        ntips = len(names)
        if ntips < 4:
            raise HandlingError(
                    'expected at least four tips but found ' + str(ntips))
        if any(name is None for name in names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(names)) != ntips:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # begin the response
    out = StringIO()

    def sign_split(names, matrix):
        # partition the names according to the sign of their loadings
        loadings = get_principal_coordinate(matrix)
        nonneg = frozenset(
                tip for tip, v in zip(names, loadings) if v >= 0)
        neg = frozenset(
                tip for tip, v in zip(names, loadings) if v < 0)
        return frozenset([nonneg, neg])

    def report(headline, tree, part):
        # write a paragraph that describes one invalid partition
        out.write(headline + '\n')
        out.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
        out.write('invalid partition: %s\n' % (partition_to_string(part),))
        out.write('\n')

    # look at each tree
    nerrors = 0
    ncounterexamples = 0
    for tree in trees:
        valid_parts = TreeComparison.get_partitions(tree)
        names = [tip.get_name() for tip in tree.gen_tips()]
        D = np.array(tree.get_distance_matrix(names))
        # the partition implied by the correct formula should be valid
        part = sign_split(names, D)
        if part not in valid_parts:
            nerrors += 1
            report(
                    'error: a partition that was supposed to be valid '
                    'was found to be invalid', tree, part)
        # the incorrect formula uses the entrywise squared distances
        part = sign_split(names, D * D)
        if part not in valid_parts:
            ncounterexamples += 1
            report('found a counterexample!', tree, part)
    # finish with the summary
    out.write('errors found: %s\n' % (nerrors,))
    out.write('counterexamples found: %s\n' % (ncounterexamples,))
    return out.getvalue()
def main():
    """
    Search random trees for a split that is invalidated by squaring distances.
    Trees are sampled until the user interrupts the program.
    Each tree is split by the sign of its principal coordinate loadings,
    once using the distance matrix and once using the entrywise squared
    distances. Invalid splits are written to the file counterexamples.out
    and a summary is printed when the search is interrupted.
    """
    filename = 'counterexamples.out'
    # parameters of the random tree sampler
    n_base_leaves = 4
    n_expected_extra_leaves = 1
    expected_branch_length = 1

    def sign_split(names, matrix):
        # partition the names according to the sign of their loadings
        loadings = get_principal_coordinate(matrix)
        nonneg = frozenset(
                tip for tip, v in zip(names, loadings) if v >= 0)
        neg = frozenset(
                tip for tip, v in zip(names, loadings) if v < 0)
        return frozenset([nonneg, neg])

    def report(fout, headline, tree, part):
        # write a paragraph that describes one invalid partition
        fout.write(headline + '\n')
        fout.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
        fout.write('invalid partition: %s\n' % (partition_to_string(part),))
        fout.write('\n')

    # The context manager closes the report file however the search ends;
    # the file handle used to be left open.
    with open(filename, 'wt') as fout:
        print('Does monotonically transforming the pairwise leaf distances affect the compatibility')
        print('of the split found using principal coordinate analysis?')
        print('I am looking through random trees for a tree that is split incompatibly')
        print('when distances are squared.')
        print('Use control-c to stop the program when you get bored.')
        count = 0
        ncounterexamples = 0
        nerrors = 0
        try:
            while True:
                # the tree is counted as soon as its examination begins
                count += 1
                # get a random tree
                tree = TreeSampler.sample_tree(
                        n_base_leaves, n_expected_extra_leaves,
                        expected_branch_length)
                # get the set of valid partitions implied by the tree
                valid_parts = TreeComparison.get_partitions(tree)
                names = [tip.get_name() for tip in tree.gen_tips()]
                D = np.array(tree.get_distance_matrix(names))
                # the partition implied by the correct formula should be valid
                part = sign_split(names, D)
                if part not in valid_parts:
                    nerrors += 1
                    report(
                            fout,
                            'error: a partition that was supposed to be '
                            'valid was found to be invalid', tree, part)
                # the incorrect formula uses the entrywise squared distances
                part = sign_split(names, D * D)
                if part not in valid_parts:
                    ncounterexamples += 1
                    report(fout, 'found a counterexample!', tree, part)
        except KeyboardInterrupt:
            print('trees examined: %s' % (count,))
            print('errors: %s' % (nerrors,))
            print('counterexamples: %s' % (ncounterexamples,))
def examine_mds_splits():
    """
    Look for a tree whose internal vertices are split incompatibly.
    The hyperplane of interest is orthogonal to the MDS axis of
    the Steiner circumscribed hyperellipse through the embedded leaves.
    Earlier results show that this hyperplane should separate the leaves
    in a way that is compatible with the topology of the tree.
    The conjecture investigated here is that the same hyperplane
    also splits the internal vertices compatibly.
    The search stops at the first counterexample, or when interrupted,
    in which case the counts for the two control splits are printed.
    """
    ntrees = 0
    n_wrong_distance = 0
    n_wrong_axis = 0
    # the mean of the exponentially distributed branch lengths
    mean_branch_length = 2.0

    def axis_split(ids, points, axis, universe):
        # split the ids by the sign of one coordinate of their projection
        left = set(i for i, point in zip(ids, points) if point[axis] < 0)
        right = universe - left
        return frozenset((frozenset(left), frozenset(right)))

    print('Does the principal hyperplane of the leaves always intersect the tree at exactly one point?')
    print('Press control-C to stop looking for a counterexample...')
    try:
        while True:
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree and give it exponential branch lengths
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mean_branch_length)
            # use a FelTree so that the internal vertices are available
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the full id splits of the tree, including internal nodes
            all_ids = set(id(node) for node in tree.preorder())
            branch_to_ids = TreeComparison._get_branch_id_to_node_id_set(tree)
            valid_splits = set(
                    frozenset((frozenset(x), frozenset(all_ids - x)))
                    for x in branch_to_ids.values())
            # get ordered ids and the number of leaves
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            # project the points using the full distance matrix
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            points = do_projection(D_full, nleaves)
            # an incompatible principal split is a counterexample
            split = axis_split(ordered_ids, points, 0, all_ids)
            if split not in valid_splits:
                print('counterexample:')
                print(tree_string)
                break
            # a control that looks at the wrong eigenvector
            split = axis_split(ordered_ids, points, 1, all_ids)
            if split not in valid_splits:
                n_wrong_axis += 1
            # a control that should provide the occasional counterexample
            control_points = do_projection(np.sqrt(D_full), nleaves)
            split = axis_split(ordered_ids, control_points, 0, all_ids)
            if split not in valid_splits:
                n_wrong_distance += 1
            # this tree has been checked
            ntrees += 1
    except KeyboardInterrupt:
        print('Checked %s trees and found no counterexample.' % (ntrees,))
        print('Found %s control counterexamples where I use the wrong eigenvector.' % (n_wrong_axis,))
        print('Found %s control counterexamples where I use the wrong distance matrix.' % (n_wrong_distance,))
def get_response_content(fs):
    """
    Compare tip splits from the augmented and plain distance matrices.
    The augmented matrix covers the tips followed by the internal nodes,
    whereas the plain matrix covers only the tips.
    A paragraph is written for each tree for which either split is
    not a valid partition or for which the two splits differ.
    @param fs: an object whose trees attribute has one newick tree per line
    @return: a text report ending with the same and different split counts
    @raise HandlingError: if a tree has fewer than four tips,
    an unlabeled tip, or a repeated tip label
    """
    # parse each tree and make sure that it meets the requirements
    trees = []
    for line in iterutils.stripped_lines(fs.trees.splitlines()):
        tree = NewickIO.parse(line, FelTree.NewickTree)
        names = [tip.get_name() for tip in tree.gen_tips()]
        ntips = len(names)
        if ntips < 4:
            raise HandlingError(
                    'expected at least four tips but found ' + str(ntips))
        if any(name is None for name in names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(names)) != ntips:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)

    def name_split(tips, loadings):
        # partition the tip names according to the sign of their loadings
        selection = frozenset(
                node.get_name()
                for node, elem in zip(tips, loadings) if elem > 0)
        complement = frozenset(
                node.get_name()
                for node, elem in zip(tips, loadings) if elem <= 0)
        return frozenset((selection, complement))

    # create the response
    out = StringIO()
    same_count = 0
    diff_count = 0
    for tree in trees:
        # the paragraph starts with the tree and is shown only after an event
        newick = NewickIO.get_newick_string(tree)
        events = []
        # separate the tip nodes from the internal nodes
        tips = []
        internals = []
        for node in tree.preorder():
            bucket = tips if node.is_tip() else internals
            bucket.append(node)
        # get all tip name partitions implied by the tree topology
        valid_partitions = TreeComparison.get_partitions(tree)
        # the tips come first, so their loadings lead the augmented vector
        D_full = tree.get_partial_distance_matrix(
                [id(node) for node in tips + internals])
        full_loadings = get_vector(D_full).tolist()
        split_a = name_split(tips, full_loadings[:len(tips)])
        if split_a not in valid_partitions:
            events.append(
                    'augmented distance matrix split fail: %s\n' % (
                        split_a,))
        # get results from the not-augmented distance matrix
        D = tree.get_partial_distance_matrix([id(node) for node in tips])
        split_b = name_split(tips, get_vector(D).tolist())
        if split_b not in valid_partitions:
            events.append(
                    'not-augmented distance matrix split fail: %s\n' % (
                        split_b,))
        # compare the name partitions
        if split_a == split_b:
            same_count += 1
        else:
            diff_count += 1
            events.append('this tree was split differently \n')
            events.append('by the different methods:\n')
            events.append(
                    'augmented distance matrix split: %s\n' % (split_a,))
            events.append(
                    'not-augmented distance matrix split: %s\n' % (
                        split_b,))
        # paragraphs are separated by a blank line
        if events:
            out.write('%s\n' % (newick,))
            out.write(''.join(events))
            out.write('\n')
    # write the summary
    out.write(
            'for this many trees the same split was found: %s\n' % (
                same_count,))
    out.write(
            'for this many trees different splits were found: %s\n' % (
                diff_count,))
    return out.getvalue()
def get_response_content(fs):
    """
    Report how two distance matrix based methods split the tips of each tree.
    Each line of fs.trees that survives iterutils.stripped_lines is parsed
    as a newick tree.  For every tree one tip split is read from the signs
    of the vector that get_vector returns for the distance matrix over
    all nodes (tips listed first), and a second tip split is read from
    the vector for the distance matrix over the tips only.
    A paragraph is written for a tree only if one of the splits is not
    among the partitions given by TreeComparison.get_partitions,
    or if the two splits differ.
    The response ends with the counts of agreeing and disagreeing trees.
    A HandlingError is raised if a tree has fewer than four tips,
    an unlabeled tip, or a repeated tip label.
    """

    def emit(stream, *items):
        # write one line consisting of the items separated by single spaces
        stream.write(' '.join(str(item) for item in items) + '\n')

    def split_by_sign(nodes, values):
        # names with a positive entry form one side of the split
        # and names with an entry <= 0 form the other side
        pairs = list(zip(nodes, values))
        positive = frozenset(
                node.get_name() for node, value in pairs if value > 0)
        nonpositive = frozenset(
                node.get_name() for node, value in pairs if value <= 0)
        return frozenset((positive, nonpositive))

    # parse and validate every tree before any of the report is built
    parsed_trees = []
    for line in iterutils.stripped_lines(fs.trees.splitlines()):
        parsed = NewickIO.parse(line, FelTree.NewickTree)
        labels = [tip.get_name() for tip in parsed.gen_tips()]
        label_count = len(labels)
        if label_count < 4:
            raise HandlingError(
                    'expected at least four tips '
                    'but found ' + str(label_count))
        for label in labels:
            if label is None:
                raise HandlingError('each terminal node must be labeled')
        if len(set(labels)) != label_count:
            raise HandlingError('each terminal node label must be unique')
        parsed_trees.append(parsed)

    # build the report one tree at a time
    report = StringIO()
    agreements = 0
    disagreements = 0
    for tree in parsed_trees:
        # this paragraph reaches the report only if something is notable
        paragraph = StringIO()
        emit(paragraph, NewickIO.get_newick_string(tree))
        # separate the tips from the internal nodes, keeping preorder
        tips = []
        internals = []
        for node in tree.preorder():
            (tips if node.is_tip() else internals).append(node)
        # these are the tip name partitions implied by the topology
        valid_partitions = TreeComparison.get_partitions(tree)
        # split using the matrix over all nodes; tips come first so that
        # the leading entries of the vector correspond to the tips
        full_ids = [id(node) for node in tips + internals]
        full_vector = get_vector(
                tree.get_partial_distance_matrix(full_ids)).tolist()
        augmented_split = split_by_sign(tips, full_vector[:len(tips)])
        augmented_ok = augmented_split in valid_partitions
        if not augmented_ok:
            emit(paragraph,
                    'augmented distance matrix split fail:', augmented_split)
        # split using the matrix over the tips only
        tip_ids = [id(node) for node in tips]
        tip_vector = get_vector(
                tree.get_partial_distance_matrix(tip_ids)).tolist()
        plain_split = split_by_sign(tips, tip_vector)
        plain_ok = plain_split in valid_partitions
        if not plain_ok:
            emit(paragraph,
                    'not-augmented distance matrix split fail:', plain_split)
        # compare the results of the two methods
        methods_agree = (augmented_split == plain_split)
        if methods_agree:
            agreements += 1
        else:
            disagreements += 1
            emit(paragraph, 'this tree was split differently ')
            emit(paragraph, 'by the different methods:')
            emit(paragraph,
                    'augmented distance matrix split:', augmented_split)
            emit(paragraph,
                    'not-augmented distance matrix split:', plain_split)
        # the extra newline written here separates consecutive paragraphs
        if not (augmented_ok and plain_ok and methods_agree):
            emit(report, paragraph.getvalue())

    # finish with the summary counts
    emit(report, 'for this many trees the same split was found:', agreements)
    emit(report,
            'for this many trees different splits were found:', disagreements)
    return report.getvalue()
def examine_mds_splits():
    """
    Examine properties of the hyperplane orthogonal to the MDS axis of a hyperellipse.
    The hyperellipse is the Steiner circumscribed hyperellipse
    that intersects points of the embedded leaves of a tree.
    Earlier results show that the hyperplane orthogonal to the principal
    axis of this hyperellipse should separate the leaves in a way
    that is compatible with the topology of the tree.
    Here we investigate the conjecture that this same hyperplane
    also splits internal vertices in a way that is compatible
    with the topology of the tree.
    Random trees are sampled until a counterexample is printed
    or until the search is interrupted with control-C,
    in which case a summary including two control tallies is printed.
    """

    def say(*parts):
        # print one line consisting of the parts separated by single spaces
        print(' '.join(str(part) for part in parts))

    def sign_split(ids, points, axis, universe):
        # ids whose projected coordinate on the axis is negative
        # go on one side and all remaining ids go on the other side
        negative = set(
                i for i, point in zip(ids, points) if point[axis] < 0)
        return frozenset((frozenset(negative), frozenset(universe - negative)))

    ntrees_checked = 0
    nwrong_matrix = 0
    nwrong_axis = 0
    # branch lengths are exponentially distributed with this mean
    mu = 2.0
    rate = 1 / mu
    say('Does the principal hyperplane of the leaves always intersect the tree at exactly one point?')
    say('Press control-C to stop looking for a counterexample...')
    try:
        while True:
            # sample an xtree with a random number of leaves
            ntaxa = random.randrange(3, 12)
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in xtree.get_branches():
                branch.length = random.expovariate(rate)
            # convert to a FelTree so that internal vertices are available
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # collect the id splits of the tree, internal nodes included
            id_set = set(id(node) for node in tree.preorder())
            branch_to_ids = TreeComparison._get_branch_id_to_node_id_set(tree)
            full_id_splits = set()
            for one_side in branch_to_ids.values():
                other_side = id_set - one_side
                full_id_splits.add(frozenset(
                    (frozenset(one_side), frozenset(other_side))))
            # project all vertices given the ordered ids and the leaf count
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            projected_points = do_projection(D_full, nleaves)
            # a split by the principal hyperplane of the leaves
            # that the tree does not have is a counterexample
            split = sign_split(ordered_ids, projected_points, 0, id_set)
            if split not in full_id_splits:
                say('counterexample:')
                say(tree_string)
                break
            # control: look at the wrong eigenvector
            split = sign_split(ordered_ids, projected_points, 1, id_set)
            if split not in full_id_splits:
                nwrong_axis += 1
            # control: this should provide the occasional counterexample
            D_control = np.sqrt(D_full)
            projected_points = do_projection(D_control, nleaves)
            split = sign_split(ordered_ids, projected_points, 0, id_set)
            if split not in full_id_splits:
                nwrong_matrix += 1
            ntrees_checked += 1
    except KeyboardInterrupt:
        say('Checked', ntrees_checked, 'trees and found no counterexample.')
        say('Found', nwrong_axis, 'control counterexamples where I use the wrong eigenvector.')
        say('Found', nwrong_matrix, 'control counterexamples where I use the wrong distance matrix.')