Ejemplo n.º 1
0
def get_sample_results(sequence_length, ntaxa, nj_like, branch_length_sampler):
    """
    Sample one tree and one distance matrix, rejecting unusable samples early.
    If sampling the distance matrix raises one of the three handled exceptions
    then the result of incr_attribute for the matching rejection key is returned.
    NOTE(review): this excerpt stops after the rejection handlers, so the use
    of nj_like, true_splits and D is not visible here.
    @param sequence_length: the length of each sequence in the sampled alignment
    @param ntaxa: the number of sequences in the sampled tree
    @param nj_like: True to create subsequent distance matrices using a generalized neighbor-joining-like approach
    @param branch_length_sampler: the length of each branch is independently sampled by this function
    @return: a numpy array conformant to the global header list
    """
    # initialize the array that will be returned;
    # the builtin int is used because the np.int alias has been removed from NumPy
    attribute_array = np.zeros((len(g_headers),), dtype=int)
    # first sample a tree and get its set of informative splits
    tree = TreeSampler.sample_agglomerated_tree(ntaxa)
    true_splits = tree.get_nontrivial_splits()
    # sample the branch lengths
    for branch in tree.get_branches():
        branch.length = branch_length_sampler()
    # sample a distance matrix, giving up on the sample if this fails
    try:
        D = sample_distance_matrix(tree, sequence_length)
    except InfiniteDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.inf')
    except ZeroDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.zero')
    except BuildTreeTopology.InvalidSpectralSplitException:
        return incr_attribute(attribute_array, 'nsamples.rejected.fail')
Ejemplo n.º 2
0
def process(ntaxa, nseconds, branch_length_sampler):
    """
    Repeatedly sample trees and record two bounds and the newick string of each.
    The sampling functor returns a branch length and has a string cast.
    Sampling stops when the time budget is spent or when ctrl-c is pressed.
    NOTE(review): this excerpt stops before the summary is built from data_rows.
    @param ntaxa: the number of taxa in the sampled trees
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a sampling functor
    @return: a multi-line string that summarizes the results
    """
    data_rows = []
    start_time = time.time()
    try:
        while True:
            # a falsy nseconds disables the time limit
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            # get the atteson bound, which is half of the shortest branch length
            atteson_bound = 0.5 * min(b.length for b in tree.get_branches())
            # get the spectral bound
            D = np.array(tree.get_distance_matrix())
            k = len(D)
            # float(k) keeps this a true division under Python 2 as well
            spectral_bound = get_stability(D) / float(k)
            # store the row
            row = [atteson_bound, spectral_bound, tree.get_newick_string()]
            data_rows.append(row)
    except KeyboardInterrupt:
        # ctrl-c only ends the sampling; the rows gathered so far are kept
        pass
Ejemplo n.º 3
0
def get_sample_results(sequence_length, ntaxa, nj_like, branch_length_sampler):
    """
    Sample one tree and one distance matrix, rejecting unusable samples early.
    When the distance matrix cannot be sampled, the value produced by
    incr_attribute for the corresponding rejection key is returned immediately.
    NOTE(review): the excerpt ends right after the rejection handlers, so what
    happens with nj_like, true_splits and D afterwards cannot be seen here.
    @param sequence_length: the length of each sequence in the sampled alignment
    @param ntaxa: the number of sequences in the sampled tree
    @param nj_like: True to create subsequent distance matrices using a generalized neighbor-joining-like approach
    @param branch_length_sampler: the length of each branch is independently sampled by this function
    @return: a numpy array conformant to the global header list
    """
    # initialize the array that will be returned;
    # np.int was an alias of the builtin int and no longer exists in NumPy
    attribute_array = np.zeros((len(g_headers),), dtype=int)
    # first sample a tree and get its set of informative splits
    tree = TreeSampler.sample_agglomerated_tree(ntaxa)
    true_splits = tree.get_nontrivial_splits()
    # sample the branch lengths
    for branch in tree.get_branches():
        branch.length = branch_length_sampler()
    # sample a distance matrix; each failure mode has its own rejection key
    try:
        D = sample_distance_matrix(tree, sequence_length)
    except InfiniteDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.inf')
    except ZeroDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.zero')
    except BuildTreeTopology.InvalidSpectralSplitException:
        return incr_attribute(attribute_array, 'nsamples.rejected.fail')
Ejemplo n.º 4
0
def process(ntaxa, nseconds, nsamples, branch_length_sampler, use_pbar):
    """
    Count how often perturbing a tree distance matrix preserves its split.
    For each sample a perturbation matrix E is drawn using half of the
    stability of D, and the split of D is compared to the splits
    of D + E (the 'a' counters) and of D + 200*E (the 'b' counters).
    NOTE(review): this excerpt stops before the TimeoutError raised below
    is handled and before the R table is built.
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds
    @param nsamples: stop after this many samples
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    a_successes = 0
    a_failures = 0
    b_successes = 0
    b_failures = 0
    # Repeatedly analyze samples.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = Progress.Bar(nsamples) if use_pbar else None
    try:
        for sample_index in range(nsamples):
            # check the time; a falsy nseconds disables the time limit
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            # sample a tree
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            D = np.array(tree.get_distance_matrix())
            # get the split defined by the tree
            original_split = get_split(D)
            # get the stability of the split
            stability = get_stability(D)
            # sample a perturbation matrix that should not change the split;
            # dividing by 2.0 keeps this a true division under Python 2 as well
            E = sample_perturbation_matrix(ntaxa, stability / 2.0)
            # evaluate the split induced by the underperturbed distance matrix
            perturbed_split = get_split(D + E)
            if original_split == perturbed_split:
                a_successes += 1
            else:
                a_failures += 1
            # evaluate the split induced by the overperturbed distance matrix
            perturbed_split = get_split(D + E*200)
            if original_split == perturbed_split:
                b_successes += 1
            else:
                b_failures += 1
            # update the progress bar;
            # test against None so that a falsy bar object is still updated
            if pbar is not None:
                pbar.update(sample_index + 1)
        else:
            # the loop has no break, so this runs whenever no exception was raised
            termination_reason = 'the requested number of samples was attained'
    except KeyboardInterrupt:
        termination_reason = 'keyboard interrupt'
Ejemplo n.º 5
0
def process(ntaxa, length, nseconds, builders, branch_length_sampler):
    """
    Tally the joint outcomes of two tree builders over many sampled trees.
    Each builder's evaluate result is used as an index (0 or 1) into a 2x2
    table, and a separate table is kept for distance matrices that are Atteson.
    Samples whose distance matrix cannot be sampled or evaluated are counted
    by failure type instead.
    NOTE(review): this excerpt stops before the summary string is built.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run; a falsy value disables the limit
    @param builders: exactly two tree builder objects
    @param branch_length_sampler: a function that returns a sampled branch length
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # define the number of attempts that fall into each of the four categories
    non_atteson_results = [[0, 0], [0, 0]]
    atteson_results = [[0, 0], [0, 0]]
    #pachter_results = [[0, 0], [0, 0]]
    # evaluate the quality of reconstructions from a bunch of different samples
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # the unpacking requires that there are exactly two builders
                a, b = [
                    builder.evaluate(true_splits, D) for builder in builders
                ]
                if BuildTreeTopology.is_atteson(tree, D):
                    atteson_results[a][b] += 1
                #elif BuildTreeTopology.is_quartet_additive(tree, D) and BuildTreeTopology.is_quartet_consistent(tree, D):
                    #pachter_results[a][b] += 1
                else:
                    non_atteson_results[a][b] += 1
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # ctrl-c only ends the sampling; the tallies gathered so far are kept
        pass
Ejemplo n.º 6
0
def process(ntaxa, nseconds, branch_length_sampler):
    """
    Look for a sampled tree whose eigensplit is not one of its true splits.
    The loop stops at the first such counterexample or when the time is up.
    NOTE(review): this excerpt ends inside the outer try block, so its handler
    and the construction of the returned summary are not visible here.
    @param ntaxa: the number of taxa in the sampled trees
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # initialize some state that will be tracked over the entire run
    degenerate_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    spectral_error_count = 0
    atteson_error_count = 0
    # these stay None unless an invalid split is found
    counterexample_D = None
    counterexample_tree = None
    # do a bunch of reconstructions from sampled distance matrices
    try:
        while True:
            # a falsy nseconds disables the time limit
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            # sample the atteson distance matrix
            D = sample_atteson_distance_matrix(tree)
            # a sampled matrix that fails the atteson check is counted and skipped
            if not BuildTreeTopology.is_atteson(tree, D):
                atteson_error_count += 1
            else:
                try:
                    # see if the eigensplit is in the set of true splits
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    if eigensplit in true_splits:
                        valid_split_count += 1
                    else:
                        # remember the counterexample and stop sampling
                        invalid_split_count += 1
                        counterexample_D = D
                        counterexample_tree = tree
                        break
                # NOTE(review): the comma form of except below is Python 2 only
                except BuildTreeTopology.DegenerateSplitException, e:
                    degenerate_count += 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    spectral_error_count += 1
Ejemplo n.º 7
0
def process(ntaxa, nseconds, branch_length_sampler):
    """
    Sample trees until an eigensplit outside of the true splits is found.
    Sampling also ends once more than nseconds seconds have elapsed.
    NOTE(review): the excerpt is cut off inside the outer try block, so the
    handler of that block and the returned summary cannot be seen here.
    @param ntaxa: the number of taxa in the sampled trees
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # initialize some state that will be tracked over the entire run
    degenerate_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    spectral_error_count = 0
    atteson_error_count = 0
    # the counterexample slots are filled only when an invalid split occurs
    counterexample_D = None
    counterexample_tree = None
    # do a bunch of reconstructions from sampled distance matrices
    try:
        while True:
            # the time limit is ignored when nseconds is falsy
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            # sample the atteson distance matrix
            D = sample_atteson_distance_matrix(tree)
            # count and skip any sampled matrix that fails the atteson check
            if not BuildTreeTopology.is_atteson(tree, D):
                atteson_error_count += 1
            else:
                try:
                    # see if the eigensplit is in the set of true splits
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    if eigensplit in true_splits:
                        valid_split_count += 1
                    else:
                        # keep the offending matrix and tree, then stop sampling
                        invalid_split_count += 1
                        counterexample_D = D
                        counterexample_tree = tree
                        break
                # NOTE(review): the comma form of except below is Python 2 only
                except BuildTreeTopology.DegenerateSplitException, e:
                    degenerate_count += 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    spectral_error_count += 1
Ejemplo n.º 8
0
def process(ntaxa, length, nseconds, builders, branch_length_sampler):
    """
    Compare two tree builders on distance matrices sampled from random trees.
    The evaluate result of each builder is used as an index (0 or 1) into a
    2x2 table; Atteson distance matrices are tallied in their own table.
    Samples that raise one of the handled errors are counted by error type.
    NOTE(review): this excerpt stops before the summary string is built.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run; a falsy value disables the limit
    @param builders: exactly two tree builder objects
    @param branch_length_sampler: a function that returns a sampled branch length
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # define the number of attempts that fall into each of the four categories
    non_atteson_results = [[0, 0], [0, 0]]
    atteson_results = [[0, 0], [0, 0]]
    #pachter_results = [[0, 0], [0, 0]]
    # evaluate the quality of reconstructions from a bunch of different samples
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # unpacking into a and b requires exactly two builders
                a, b = [builder.evaluate(true_splits, D) for builder in builders]
                if BuildTreeTopology.is_atteson(tree, D):
                    atteson_results[a][b] += 1
                #elif BuildTreeTopology.is_quartet_additive(tree, D) and BuildTreeTopology.is_quartet_consistent(tree, D):
                    #pachter_results[a][b] += 1
                else:
                    non_atteson_results[a][b] += 1
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # ctrl-c only stops the sampling; the counts so far remain available
        pass
Ejemplo n.º 9
0
def process(ntaxa):
    """
    Sample a weighted tree and report D, a random mass vector, S and pinv(S).
    As a side effect the numpy print line width is set to 200.
    @param ntaxa: use this many taxa per tree
    @return: a multi-line string that summarizes the results
    """
    np.set_printoptions(linewidth=200)
    # sample an xtree topology
    xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
    # convert the xtree to a FelTree, although I guess this might not be necessary
    tree_string = xtree.get_newick_string()
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # get the ordered ids
    ordered_ids = get_ordered_ids(tree)
    # sample random branch lengths
    sample_branch_lengths(tree)
    # get the weighted tree string
    weighted_tree_string = NewickIO.get_newick_string(tree)
    # get the distance matrix for the ordered ids
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    # create a mass vector that sums to one
    m = np.array([random.randrange(1, 10) for _ in range(len(D))], dtype=float)
    m /= sum(m)
    # get the S matrix
    S = edm_to_S(D, m)
    # get the pseudoinverse of S
    S_pinv = np.linalg.pinv(S)
    # make the response
    out = StringIO()

    def emit(*fields):
        # works like the Python 2 statement "print >> out, ..." on any Python
        out.write(' '.join(str(field) for field in fields) + '\n')

    emit('newick tree:', weighted_tree_string)
    emit()
    # each remaining section is a title line, the value and a blank line
    sections = (
        ('m:', m),
        ('D:', D),
        ('S:', S),
        ('pseudoinverse of S:', S_pinv),
    )
    for title, value in sections:
        emit(title)
        emit(value)
        emit()
    return out.getvalue().strip()
Ejemplo n.º 10
0
def process(ntaxa, length, nseconds, branch_length_sampler, use_nj,
            use_modified_nj, use_all_spectral, use_one_spectral):
    """
    Feed distance matrices sampled from random trees to a Builder object.
    Samples that raise one of the handled errors are counted by error type.
    NOTE(review): this excerpt stops before the summary string is built.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @param use_nj: forwarded unchanged to the evaluate method of the builder
    @param use_modified_nj: forwarded unchanged to the evaluate method of the builder
    @param use_all_spectral: forwarded unchanged to the evaluate method of the builder
    @param use_one_spectral: forwarded unchanged to the evaluate method of the builder
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # initialize the builder object
    builder = Builder()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # do a bunch of reconstructions of sampled distance matrices
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # determine whether or not the distance matrix is Atteson with respect to the tree
                atteson = BuildTreeTopology.is_atteson(tree, D)
                # record information about the splits
                builder.evaluate(true_splits, D, atteson, use_nj,
                                 use_modified_nj, use_all_spectral,
                                 use_one_spectral)
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # ctrl-c only stops the sampling; the state gathered so far is kept
        pass
Ejemplo n.º 11
0
def process(ntaxa, length, nseconds, branch_length_sampler,
            use_nj, use_modified_nj, use_all_spectral, use_one_spectral):
    """
    Evaluate a Builder object on distance matrices sampled from random trees.
    A sample that raises one of the handled errors is counted by error type.
    NOTE(review): this excerpt stops before the summary string is built.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @param use_nj: passed through to builder.evaluate
    @param use_modified_nj: passed through to builder.evaluate
    @param use_all_spectral: passed through to builder.evaluate
    @param use_one_spectral: passed through to builder.evaluate
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # initialize the builder object
    builder = Builder()
    # track the number of samples that failed for various reasons
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # do a bunch of reconstructions of sampled distance matrices
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # determine whether or not the distance matrix is Atteson with respect to the tree
                atteson = BuildTreeTopology.is_atteson(tree, D)
                # record information about the splits
                builder.evaluate(
                    true_splits, D, atteson,
                    use_nj, use_modified_nj, use_all_spectral, use_one_spectral)
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # ctrl-c only stops the sampling; the state gathered so far is kept
        pass
Ejemplo n.º 12
0
def process(ntaxa, nseconds):
    """
    Check that each canonical sign pattern is seen with only one topology.
    The unit branch length distance matrix is used as a hashable surrogate
    for the tree topology.
    A CounterexampleError is raised when two samples share a sign pattern
    but have different topology surrogates.
    NOTE(review): this excerpt stops before CounterexampleError is handled
    and before the summary string is built.
    @param ntaxa: the number of taxa in the sampled trees
    @param nseconds: allow this many seconds to run or None to run forever
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    nsamples_rejected = 0
    nsamples_accepted = 0
    pattern_to_topo_surrogate = {}
    pattern_to_tree_string = {}
    counterexample_message = 'no counterexample was found'
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # convert the xtree to a FelTree, although I guess this might not be necessary
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # force every branch length to be the unit length
            reset_branch_lengths(tree)
            # get the unweighted distance matrix among tips in convenient hashable form
            D_unit = np.array(tree.get_partial_distance_matrix(ordered_ids))
            topo_surrogate = tuple(tuple(row.tolist()) for row in D_unit)
            # sample random branch lengths
            sample_branch_lengths(tree)
            # get the weighted tree string
            weighted_tree_string = NewickIO.get_newick_string(tree)
            # get the distance matrix relating the leaves
            D = np.array(tree.get_partial_distance_matrix(ordered_ids))
            # get the projections onto the MDS axes of the leaves
            X = Euclid.edm_to_points(D)
            # if any coordinate is near zero then reject the sample
            if np.min(np.abs(X)) < g_epsilon:
                nsamples_rejected += 1
                continue
            # do an orthogonal transformation that puts the first point in the positive orthant
            canonizing_vector = np.array(point_to_orthant(X[0]))
            X *= canonizing_vector
            # get the canonical sign pattern
            sign_pattern = tuple(point_to_orthant(row) for row in X)
            # compare the topo surrogate of this sign pattern to the one in memory;
            # test against None so that a stored but falsy surrogate is still compared
            expected_topo_surrogate = pattern_to_topo_surrogate.get(sign_pattern)
            if expected_topo_surrogate is not None:
                if topo_surrogate != expected_topo_surrogate:
                    remembered_tree_string = pattern_to_tree_string[sign_pattern]
                    msg = 'these trees have the same sign pattern but different topologies: {%s, %s}' % (weighted_tree_string, remembered_tree_string)
                    raise CounterexampleError(msg)
            else:
                # this is the first time that this sign pattern has been seen
                pattern_to_topo_surrogate[sign_pattern] = topo_surrogate
                pattern_to_tree_string[sign_pattern] = weighted_tree_string
            # increment the count of accepted samples
            nsamples_accepted += 1
    except KeyboardInterrupt:
        # ctrl-c only stops the sampling; the counts so far remain available
        pass
Ejemplo n.º 13
0
def process(ntaxa):
    """
    Show several projections of the embedded vertices of one sampled tree.
    The tree gets exponentially distributed branch lengths with mean 2.0.
    As a side effect the numpy print line width is set to 200.
    @param ntaxa: use this many taxa per tree
    @return: a multi-line string that shows the matrices of projected points
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()

    def emit(*fields):
        # works like the Python 2 statement "print >> out, ..." on any Python
        out.write(' '.join(str(field) for field in fields) + '\n')

    # the number of leading columns that is kept by each projection
    ncols = nleaves_minus_one = None
    # sample an xtree topology
    xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
    # sample an xtree with exponentially distributed branch lengths
    mu = 2.0
    for branch in xtree.get_branches():
        branch.length = random.expovariate(1/mu)
    # convert the xtree to a FelTree so we can use the internal vertices
    tree_string = xtree.get_newick_string()
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # get ordered ids and the number of leaves
    ordered_ids = get_ordered_ids(tree)
    nleaves = len(list(tree.gen_tips()))
    ncols = nleaves - 1
    # get the distance matrix relating all of the points
    D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
    # Now do the projection so that
    # the resulting points are in the subspace whose basis vectors are the axes of the leaf ellipsoid.
    # First get the points such that the n rows in X are points in n-1 dimensional space.
    X = Euclid.edm_to_points(D_full)
    emit('points with centroid at origin:')
    emit(X)
    emit()
    # Translate all of the points so that the origin is at the centroid of the leaves.
    X -= np.mean(X[:nleaves], 0)
    emit('points with centroid of leaves at origin:')
    emit(X)
    emit()
    # Extract the subset of points that define the leaves.
    L = X[:nleaves]
    # Find the orthogonal transformation of the leaves onto their MDS axes.
    # According to the python svd documentation, singular values are sorted most important to least important.
    U, s, Vt = np.linalg.svd(L)
    # Transform all of the points (including the internal vertices) according to this orthogonal transformation.
    # The axes are now the axes of the Steiner circumscribed ellipsoid of the leaf vertices.
    Z = np.dot(X, Vt.T)
    emit('orthogonally transformed points (call this Z):')
    emit(Z)
    emit()
    # keep the first columns of Z
    Y = Z[:, :ncols]
    emit('projection of the points onto the axes of the leaf ellipsoid,')
    emit('(these are the first columns of Z; call this projected matrix Y):')
    emit(Y)
    emit()
    # Show the inner products.
    inner_products_of_columns = np.dot(Y.T, Y)
    emit("pairwise inner products of the columns of Y (that is, Y'Y)")
    emit(inner_products_of_columns)
    emit()
    # Show other inner products.
    # NOTE(review): Y[:5] selects the first five rows of Y, not columns,
    # so these are inner products of the columns restricted to those rows.
    inner_products_of_columns = np.dot(Y[:5].T, Y[:5])
    emit("pairwise inner products of the first few columns of Y")
    emit(inner_products_of_columns)
    emit()
    # Extract the subset of points that define the points of articulation.
    # Note that the origin is the centroid of the leaves.
    R = X[nleaves:]
    Y_leaves = Y[:nleaves]
    W = np.dot(np.linalg.pinv(L), Y_leaves)
    emit('leaf projection using pseudoinverse (first few rows of Y):')
    emit(np.dot(L, W))
    emit()
    emit('projection of points of articulation using pseudoinverse (remaining rows of Y):')
    emit(np.dot(R, W))
    emit()
    # Get all of the points in high dimensional space.
    X = Euclid.edm_to_points(D_full)
    # Get the MDS onto the lower dimensional space.
    X = X[:, :ncols]
    assert np.allclose(np.sum(X, axis=0), 0)
    emit('all points projected onto the first principal axes of the full ellipsoid:')
    emit(X)
    emit()
    # Look at only the leaves in this space.
    # The subtraction makes a centered copy; centering the slice in place
    # would also shift the leaf rows of X, which is used again below.
    L = X[:nleaves] - np.mean(X[:nleaves], 0)
    emit('leaves projected onto the first principal axes of the full ellipsoid and then centered:')
    emit(L)
    emit()
    # Re-project the leaves onto the axes of leaf ellipsoid.
    D_leaves = Euclid.dccov_to_edm(np.dot(L, L.T))
    Y = Euclid.edm_to_points(D_leaves)
    emit('leaves further projected onto principal axes of their own ellipsoid:')
    emit(Y)
    emit()
    # Try something else
    D_all = Euclid.dccov_to_edm(np.dot(X, X.T))
    Y = Euclid.edm_to_points(D_all)[:, :ncols]
    emit('all points further projected onto their own principal axes of inertia:')
    emit(Y)
    emit()
    # Try the same thing some more
    D_again = Euclid.dccov_to_edm(np.dot(Y, Y.T))
    Z = Euclid.edm_to_points(D_again)[:, :ncols]
    emit('all points further projected onto their own principal axes of inertia (second iteration):')
    emit(Z)
    emit()
    return out.getvalue().strip()
Ejemplo n.º 14
0
def examine_mds_splits():
    """
    Examine properties of the hyperplane orthogonal to the MDS axis of a hyperellipse.
    The hyperellipse is the Steiner circumscribed hyperellipse that intersects
    points of the embedded leaves of a tree.
    Earlier results show that the hyperplane orthogonal to the principal
    axis of this hyperellipse should separate the leaves in a way that is compatible
    with the topology of the tree.
    Here we investigate the conjecture that this same hyperplane
    also splits internal vertices in a way that is compatible with the topology of the tree.
    The search runs until a counterexample is found or control-C is pressed.
    Results are printed to stdout and nothing is returned.
    """
    count = 0
    ncontrol_noneuclidean_counterexamples = 0
    ncontrol_secondary_counterexamples = 0
    # the mean of the exponential distribution of the branch lengths
    mu = 2.0
    print('Does the principal hyperplane of the leaves always intersect the tree at exactly one point?')
    print('Press control-C to stop looking for a counterexample...')
    try:
        while True:
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree with exponentially distributed branch lengths
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1/mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the full id splits of the tree, including internal nodes
            id_set = set(id(node) for node in tree.preorder())
            d = TreeComparison._get_branch_id_to_node_id_set(tree)
            full_id_splits = set(frozenset((frozenset(x), frozenset(id_set-x))) for x in d.values())
            # get ordered ids and the number of leaves
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            # get the projection
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            projected_points = do_projection(D_full, nleaves)
            # get the split implied by the principal hyperplane of the leaves
            split = _get_axis_split(ordered_ids, id_set, projected_points, 0)
            # if the split is not compatible with the tree then we have found a counterexample
            if split not in full_id_splits:
                print('counterexample:')
                print(tree_string)
                break
            # now do a control where I look at the wrong eigenvector
            split = _get_axis_split(ordered_ids, id_set, projected_points, 1)
            if split not in full_id_splits:
                ncontrol_secondary_counterexamples += 1
            # now do a control that should provide the occasional counterexample
            D_control = np.sqrt(D_full)
            projected_points = do_projection(D_control, nleaves)
            split = _get_axis_split(ordered_ids, id_set, projected_points, 0)
            if split not in full_id_splits:
                ncontrol_noneuclidean_counterexamples += 1
            # increment the count
            count += 1
    except KeyboardInterrupt:
        # the summary is printed only when the search is interrupted
        print('Checked %s trees and found no counterexample.' % count)
        print('Found %s control counterexamples where I use the wrong eigenvector.' % ncontrol_secondary_counterexamples)
        print('Found %s control counterexamples where I use the wrong distance matrix.' % ncontrol_noneuclidean_counterexamples)


def _get_axis_split(ordered_ids, id_set, points, axis):
    """
    Split the ids according to the sign of one coordinate of their points.
    @param ordered_ids: ids conformant to the rows of points
    @param id_set: the set of all ids
    @param points: a sequence of points, one per id
    @param axis: the index of the coordinate whose sign defines the split
    @return: a frozenset of two frozensets of ids
    """
    # ids whose coordinate is strictly negative go to the left side
    left_ids = set(i for i, point in zip(ordered_ids, points) if point[axis] < 0)
    right_ids = id_set - left_ids
    return frozenset((frozenset(left_ids), frozenset(right_ids)))
Ejemplo n.º 15
0
def _tally_axis_cuts(projection, axes, neighbor_index_pairs,
                     bad_axis_dict, axis_to_ncuts_dict):
    """
    Update the per-axis tallies for some axes of a single projection.
    An axis is tallied as bad when at least one point has a coordinate
    on that axis whose absolute value is below g_loading_epsilon.
    Otherwise the number of branches whose two endpoints have coordinates
    of strictly opposite sign on that axis is added to the axis histogram.
    @param projection: a 2D array indexed as [point index, axis index]
    @param axes: an iterable of the axis indices to analyze
    @param neighbor_index_pairs: pairs of point indices, one pair per branch
    @param bad_axis_dict: maps an axis to a count of bad observations; updated in place
    @param axis_to_ncuts_dict: maps an axis to a dict that maps a number of cuts to a count; updated in place
    """
    npoints = projection.shape[0]
    for axis in axes:
        # a near-zero loading makes the sign of the coordinate unreliable
        if any(abs(projection[i, axis]) < g_loading_epsilon
               for i in range(npoints)):
            bad_axis_dict[axis] = bad_axis_dict.get(axis, 0) + 1
            continue
        # a branch is cut when its endpoints are on opposite sides of zero
        ncuts = sum(
            1 for indexa, indexb in neighbor_index_pairs
            if projection[indexa, axis] * projection[indexb, axis] < 0)
        ncuts_dict = axis_to_ncuts_dict.setdefault(axis, {})
        ncuts_dict[ncuts] = ncuts_dict.get(ncuts, 0) + 1


def process(nseconds=None):
    """
    Sample random trees and tally how many branches each projection axis cuts.
    Two projections are analyzed for each sampled tree:
    the one from do_internal_projection and the one from do_external_projection.
    Sampling stops when the time limit is exceeded or on a keyboard interrupt.
    NOTE(review): the code visible here ends after the sampling loop
    and does not build or return the summary that is documented below;
    the reporting tail appears to be missing from this excerpt.
    @param nseconds: allow this many seconds to run or None to run forever
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    nsampled_trees = 0
    # track the number of observations of each number of cuts on each axis for each hyperellipse
    internal_important_axis_to_ncuts_dict = {}
    internal_unimportant_axis_to_ncuts_dict = {}
    external_axis_to_ncuts_dict = {}
    # track the number of bad axes of each principality for each hyperellipse
    internal_important_bad_axis_dict = {}
    internal_unimportant_bad_axis_dict = {}
    external_bad_axis_dict = {}
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa in [3, 11] to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths with mean mu
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                for parent in tree.preorder()
                for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            # analyze the axes of the ellipsoid that includes internal points
            internal_projection = do_internal_projection(D_full)
            _npoints, naxes = internal_projection.shape
            # the low axes are the first nleaves - 1 axes
            _tally_axis_cuts(
                internal_projection, range(0, nleaves - 1),
                neighbor_index_pairs,
                internal_important_bad_axis_dict,
                internal_important_axis_to_ncuts_dict)
            # the high axes are all of the remaining axes
            _tally_axis_cuts(
                internal_projection, range(nleaves - 1, naxes),
                neighbor_index_pairs,
                internal_unimportant_bad_axis_dict,
                internal_unimportant_axis_to_ncuts_dict)
            # analyze the axes of the ellipsoid that includes only leaf points
            external_projection = do_external_projection(D_full, nleaves)
            _npoints, naxes = external_projection.shape
            _tally_axis_cuts(
                external_projection, range(naxes),
                neighbor_index_pairs,
                external_bad_axis_dict,
                external_axis_to_ncuts_dict)
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # an interrupt ends the sampling early but keeps the tallies so far
        pass
Ejemplo n.º 16
0
def _get_sign_split(ordered_ids, projected_points, id_set, axis):
    """
    Split the vertex ids by the sign of one projected coordinate.
    @param ordered_ids: vertex ids in the same order as the projected points
    @param projected_points: a sequence of points, one per ordered id
    @param id_set: the set of all vertex ids
    @param axis: the index of the coordinate whose sign defines the split
    @return: a frozenset holding the two sides of the split as frozensets
    """
    left_ids = set(
        myid for myid, point in zip(ordered_ids, projected_points)
        if point[axis] < 0)
    right_ids = id_set - left_ids
    return frozenset((frozenset(left_ids), frozenset(right_ids)))


def examine_mds_splits():
    """
    Examine properties of the hyperplane orthogonal to the MDS axis of a hyperellipse.
    The hyperellipse is the Steiner circumscribed hyperellipse that intersects
    points of the embedded leaves of a tree.
    Earlier results show that the hyperplane orthogonal to the principal
    axis of this hyperellipse should separate the leaves in a way that is compatible
    with the topology of the tree.
    Here we investigate the conjecture that this same hyperplane
    also splits internal vertices in a way that is compatible with the topology of the tree.
    The search runs until a counterexample is found and printed,
    or until control-C is pressed, in which case the tallies are printed.
    """
    count = 0
    ncontrol_noneuclidean_counterexamples = 0
    ncontrol_secondary_counterexamples = 0
    # every branch length is exponentially distributed with this mean
    mu = 2.0
    print('Does the principal hyperplane of the leaves always intersect the tree at exactly one point?')
    print('Press control-C to stop looking for a counterexample...')
    try:
        while True:
            # pick a random number of taxa in [3, 11] to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree with exponentially distributed branch lengths
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the full id splits of the tree, including internal nodes
            id_set = set(id(node) for node in tree.preorder())
            # presumably each value is the set of node ids on one side of a branch
            d = TreeComparison._get_branch_id_to_node_id_set(tree)
            full_id_splits = set(
                frozenset((frozenset(x), frozenset(id_set - x)))
                for x in d.values())
            # get ordered ids and the number of leaves
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            # get the projection
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            projected_points = do_projection(D_full, nleaves)
            # get the split implied by the principal hyperplane of the leaves
            split = _get_sign_split(ordered_ids, projected_points, id_set, 0)
            # if the split is not compatible with the tree then we have found a counterexample
            if split not in full_id_splits:
                print('counterexample:')
                print(tree_string)
                break
            # now do a control where I look at the wrong eigenvector
            split = _get_sign_split(ordered_ids, projected_points, id_set, 1)
            if split not in full_id_splits:
                ncontrol_secondary_counterexamples += 1
            # now do a control that should provide the occasional counterexample
            # by projecting the elementwise square roots of the distances
            D_control = np.sqrt(D_full)
            projected_points = do_projection(D_control, nleaves)
            split = _get_sign_split(ordered_ids, projected_points, id_set, 0)
            if split not in full_id_splits:
                ncontrol_noneuclidean_counterexamples += 1
            # increment the count
            count += 1
    except KeyboardInterrupt:
        # a single formatted argument prints the same text under python 2 and 3
        print('Checked %d trees and found no counterexample.' % count)
        print('Found %d control counterexamples where I use the wrong eigenvector.' % ncontrol_secondary_counterexamples)
        print('Found %d control counterexamples where I use the wrong distance matrix.' % ncontrol_noneuclidean_counterexamples)
Ejemplo n.º 17
0
def process(nseconds=None):
    """
    Sample random trees and check the connectivity of vertices in principal orthants.
    For each sampled tree the vertices are recursively split by the sign
    of their coordinates on successive axes of the projection,
    and each resulting group of more than one vertex is tested with is_connected.
    The same analysis is repeated as a control using the elementwise
    square root of the distance matrix.
    A disconnected group in the non-control analysis raises CounterexampleError,
    which is not caught in the code visible here.
    NOTE(review): the code visible here ends after the sampling loop
    and does not build or return the summary that is documented below;
    the reporting tail appears to be missing from this excerpt.
    @param nseconds: allow this many seconds to run or None to run forever
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    nsampled_trees = 0
    # not used in the visible code; presumably used by the missing reporting tail
    counterexample_message = 'no counterexample was found'
    northants_passed = 0
    northants_failed = 0
    ncontrol_orthants_passed = 0
    ncontrol_orthants_failed = 0
    branch_cut_hist = {}
    control_branch_cut_hist = {}
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa in [3, 11] to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths with mean mu
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                for parent in tree.preorder()
                for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            D_bad = np.sqrt(D_full)
            for D in (D_full, D_bad):
                # the square rooted matrix is analyzed only as a control
                is_control = D is not D_full
                # get the projections onto the MDS axes of the leaves
                projection = do_projection(D, nleaves)
                _npoints, naxes = projection.shape
                # recursively split the points by hyperplanes of principal axes
                next_id_set_list = [set(ordered_ids)]
                for axis in range(naxes):
                    id_set_list = next_id_set_list
                    # create the list of sets of points in principal orthants
                    next_id_set_list = []
                    for id_set in id_set_list:
                        neg_id_set = set(
                            myid for myid in id_set
                            if projection[id_to_index[myid], axis] < 0)
                        nonneg_id_set = set(
                            myid for myid in id_set
                            if projection[id_to_index[myid], axis] >= 0)
                        # sets with fewer than two points are dropped
                        for next_set in (neg_id_set, nonneg_id_set):
                            if len(next_set) > 1:
                                next_id_set_list.append(next_set)
                    # each set of points should be connected
                    for id_set in next_id_set_list:
                        bconnected = is_connected(tree, id_set)
                        if is_control:
                            if bconnected:
                                ncontrol_orthants_passed += 1
                            else:
                                ncontrol_orthants_failed += 1
                        elif bconnected:
                            northants_passed += 1
                        else:
                            northants_failed += 1
                            msg = 'found a counterexample in principal orthant %d of the tree %s' % (
                                axis + 1, tree_string)
                            raise CounterexampleError(msg)
                # define the applicable histogram
                hist = control_branch_cut_hist if is_control else branch_cut_hist
                # count the axes on which the endpoints of each branch have opposite signs
                for i, j in neighbor_index_pairs:
                    ncuts = sum(
                        1 for axis in range(naxes)
                        if projection[i, axis] * projection[j, axis] < 0)
                    hist[ncuts] = hist.get(ncuts, 0) + 1
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # an interrupt ends the sampling early but keeps the tallies so far
        pass
Ejemplo n.º 18
0
def process(ntaxa, nseconds, seqlen, nsamples, branch_length_sampler,
            use_pbar):
    """
    Repeatedly sample trees and check the eigenvector split of a sampled distance matrix.
    For each accepted sample the size of the smaller side of the estimated split
    and whether the split is one of the nontrivial splits of the sampled tree
    are computed.
    Samples whose distance matrix or split cannot be computed are counted
    by rejection reason and retried.
    NOTE(review): this excerpt is cut off inside the outer try block,
    so the accumulation, error handling, and reporting code are not visible.
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds
    @param seqlen: use this sequence length
    @param nsamples: stop after this many samples per sequence length
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    # initialize the global rejection counts
    nrejected_zero = 0
    nrejected_inf = 0
    nrejected_fail = 0
    naccepted = 0
    # Initialize the accumulation matrix.
    # The rows specify the size of the smaller side of the initial split.
    # The columns specify the compatibility status of the split.
    # NOTE(review): this relies on integer division of an int ntaxa as in python 2;
    # under true division the row count would be a float.
    nsmall_sizes = (ntaxa / 2) + 1
    # NOTE(review): the np.int alias is deprecated in newer numpy releases -- confirm the targeted version
    accum = np.zeros((nsmall_sizes, 2), dtype=np.int)
    # Repeatedly analyze samples.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = Progress.Bar(nsamples) if use_pbar else None
    try:
        for sample_index in range(nsamples):
            # keep trying to get an accepted sample
            while True:
                # check the time; a false nseconds such as None or 0 disables the limit
                if nseconds and time.time() - start_time > nseconds:
                    # presumably TimeoutError is defined elsewhere in this module -- TODO confirm
                    raise TimeoutError()
                # first sample a tree and get its set of informative splits
                tree = TreeSampler.sample_agglomerated_tree(ntaxa)
                true_splits = tree.get_nontrivial_splits()
                # sample the branch lengths
                for branch in tree.get_branches():
                    branch.length = branch_length_sampler()
                # Attempt to sample a distance matrix.
                # If the sample was rejected then note the reason and go back to the drawing board.
                try:
                    D = sample_distance_matrix(tree, seqlen)
                except InfiniteDistanceError as e:
                    nrejected_inf += 1
                    continue
                except ZeroDistanceError as e:
                    nrejected_zero += 1
                    continue
                # Attempt to estimate the primary split of the tree from the distance matrix.
                # If there was a technical failure then note it and go back to the drawing board.
                # Otherwise note the compatibility and balance of the split.
                try:
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    small_size = min(len(side) for side in eigensplit)
                    if eigensplit in true_splits:
                        compatibility = 1
                    else:
                        compatibility = 0
                except BuildTreeTopology.DegenerateSplitException, e:
                    # a degenerate split is recorded as compatible with a smaller side of size zero
                    small_size = 0
                    compatibility = 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    nrejected_fail += 1
                    continue
                # NOTE(review): the excerpt ends here; the code that uses small_size and compatibility
                # and the handlers of the outer try block are not visible.
Ejemplo n.º 19
0
def process(nseconds=None):
    """
    Sample random trees and look for orthant collisions and disconnected orthants.
    For each sampled tree and each number of leading projection axes,
    three conditions are checked:
    no two edges block a common orthant,
    no vertex lies in an orthant blocked by an edge,
    and the vertices that share an orthant are connected in the tree.
    The same analysis is repeated as a control using the elementwise
    square root of the distance matrix, where violations are only counted.
    A violation in the non-control analysis raises CounterexampleError,
    which is not caught in the code visible here.
    NOTE(review): the code visible here ends after the sampling loop
    and does not build or return the summary that is documented below;
    the reporting tail appears to be missing from this excerpt.
    @param nseconds: allow this many seconds to run or None to run forever
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    nsampled_trees = 0
    # not used in the visible code; presumably used by the missing reporting tail
    counterexample_message = 'no counterexample was found'
    nvertex_connectivity_failures = 0
    nfragment_fragment_collisions = 0
    nfragment_vertex_collisions = 0
    ncontrol_vertex_connectivity_failures = 0
    ncontrol_fragment_fragment_collisions = 0
    ncontrol_fragment_vertex_collisions = 0
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa in [3, 11] to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths with mean mu
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                for parent in tree.preorder()
                for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            D_bad = np.sqrt(D_full)
            for D in (D_full, D_bad):
                # the square rooted matrix is analyzed only as a control
                is_control = D is not D_full
                # get the projections onto the MDS axes of the leaves
                projection = do_projection(D, nleaves)
                npoints, naxes_total = projection.shape
                # look for a counterexample for each possible number of principal hyperplanes
                for naxes in range(1, naxes_total + 1):
                    # some orthants are occupied by a fragment of an edge
                    forbidden_orthants = set()
                    for indexa, indexb in neighbor_index_pairs:
                        # get the endpoints of the edge in the Euclidean subspace
                        pta = projection[indexa][:naxes]
                        ptb = projection[indexb][:naxes]
                        # look at the orthants blocked by the fragments of this edge
                        orthants = get_blocked_orthants(pta, ptb)
                        if orthants & forbidden_orthants:
                            if is_control:
                                ncontrol_fragment_fragment_collisions += 1
                            else:
                                nfragment_fragment_collisions += 1
                                msg = 'two edge fragments occupy the same orthant in %d dimensions in the tree %s' % (
                                    naxes, tree_string)
                                raise CounterexampleError(msg)
                        forbidden_orthants.update(orthants)
                    # compute the orthant of each vertex once for both of the checks below
                    vertex_orthants = [
                        point_to_orthant(projection[i][:naxes])
                        for i in range(npoints)]
                    # no vertex should share an orthant with an edge fragment
                    for orthant in vertex_orthants:
                        if orthant in forbidden_orthants:
                            if is_control:
                                ncontrol_fragment_vertex_collisions += 1
                            else:
                                nfragment_vertex_collisions += 1
                                msg = 'a vertex occupies the same orthant as an edge fragment in %d dimensions in the tree %s' % (
                                    naxes, tree_string)
                                raise CounterexampleError(msg)
                    # now partition the vertices by orthant and check their connectivity
                    orthant_to_id_set = {}
                    for i, orthant in enumerate(vertex_orthants):
                        orthant_to_id_set.setdefault(orthant, set()).add(
                            ordered_ids[i])
                    for id_set in orthant_to_id_set.values():
                        if not is_connected(tree, id_set):
                            if is_control:
                                ncontrol_vertex_connectivity_failures += 1
                            else:
                                nvertex_connectivity_failures += 1
                                msg = 'found disconnected vertices in an orthant in %d dimensions in the tree %s' % (
                                    naxes, tree_string)
                                raise CounterexampleError(msg)
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # an interrupt ends the sampling early but keeps the tallies so far
        pass
Ejemplo n.º 20
0
def process(ntaxa, nseconds):
    """
    Look for two trees with the same sign pattern but different topologies.
    Each sampled tree is summarized by the matrix of its unit branch length
    distances, which is used as a hashable stand-in for its topology,
    and by the canonical sign pattern of the points from Euclid.edm_to_points.
    The first topology seen for each sign pattern is remembered,
    and a later tree with the same sign pattern but a different topology
    raises CounterexampleError, which is not caught in the code visible here.
    NOTE(review): the code visible here ends after the sampling loop
    and does not build or return the summary that is documented below;
    the reporting tail appears to be missing from this excerpt.
    @param ntaxa: the number of taxa in each sampled tree
    @param nseconds: allow this many seconds to run or None to run forever
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    nsamples_rejected = 0
    nsamples_accepted = 0
    pattern_to_topo_surrogate = {}
    pattern_to_tree_string = {}
    # not used in the visible code; presumably used by the missing reporting tail
    counterexample_message = 'no counterexample was found'
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # convert the xtree to a FelTree, although I guess this might not be necessary
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the ordered ids used to index the distance matrices
            ordered_ids = get_ordered_ids(tree)
            # force every branch length to be the unit length
            reset_branch_lengths(tree)
            # get the unweighted distance matrix among tips in convenient hashable form
            D_unit = np.array(tree.get_partial_distance_matrix(ordered_ids))
            topo_surrogate = tuple(tuple(row.tolist()) for row in D_unit)
            # sample random branch lengths
            sample_branch_lengths(tree)
            # get the weighted tree string
            weighted_tree_string = NewickIO.get_newick_string(tree)
            # get the distance matrix relating the leaves
            D = np.array(tree.get_partial_distance_matrix(ordered_ids))
            # get the projections onto the MDS axes of the leaves
            X = Euclid.edm_to_points(D)
            # if any coordinate is near zero then reject the sample
            if np.min(np.abs(X)) < g_epsilon:
                nsamples_rejected += 1
                continue
            # do an orthogonal transformation that puts the first point in the positive orthant
            canonizing_vector = np.array(point_to_orthant(X[0]))
            X *= canonizing_vector
            # get the canonical sign pattern
            sign_pattern = tuple(point_to_orthant(row) for row in X)
            # compare the topo surrogate of this sign pattern to the one in memory
            expected_topo_surrogate = pattern_to_topo_surrogate.get(
                sign_pattern)
            if expected_topo_surrogate is None:
                # this is the first time the sign pattern has been seen
                pattern_to_topo_surrogate[sign_pattern] = topo_surrogate
                pattern_to_tree_string[sign_pattern] = weighted_tree_string
            elif topo_surrogate != expected_topo_surrogate:
                remembered_tree_string = pattern_to_tree_string[sign_pattern]
                msg = 'these trees have the same sign pattern but different topologies: {%s, %s}' % (
                    weighted_tree_string, remembered_tree_string)
                raise CounterexampleError(msg)
            # increment the count of accepted samples
            nsamples_accepted += 1
    except KeyboardInterrupt:
        # an interrupt ends the sampling early but keeps the tallies so far
        pass
Ejemplo n.º 21
0
def process(nseconds=None):
    """
    Sample random trees and look for orthant collisions and disconnected orthants.
    For each sampled tree and each number of leading projection axes,
    three conditions are checked:
    no two edges block a common orthant,
    no vertex lies in an orthant blocked by an edge,
    and the vertices that share an orthant are connected in the tree.
    The same analysis is repeated as a control using the elementwise
    square root of the distance matrix, where violations are only counted.
    A violation in the non-control analysis raises CounterexampleError,
    which is not caught in the code visible here.
    NOTE(review): the code visible here ends after the sampling loop
    and does not build or return the summary that is documented below;
    the reporting tail appears to be missing from this excerpt.
    @param nseconds: allow this many seconds to run or None to run forever
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    nsampled_trees = 0
    # not used in the visible code; presumably used by the missing reporting tail
    counterexample_message = 'no counterexample was found'
    nvertex_connectivity_failures = 0
    nfragment_fragment_collisions = 0
    nfragment_vertex_collisions = 0
    ncontrol_vertex_connectivity_failures = 0
    ncontrol_fragment_fragment_collisions = 0
    ncontrol_fragment_vertex_collisions = 0
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa in [3, 11] to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths with mean mu
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                for parent in tree.preorder()
                for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            D_bad = np.sqrt(D_full)
            for D in (D_full, D_bad):
                # the square rooted matrix is analyzed only as a control
                is_control = D is not D_full
                # get the projections onto the MDS axes of the leaves
                projection = do_projection(D, nleaves)
                npoints, naxes_total = projection.shape
                # look for a counterexample for each possible number of principal hyperplanes
                for naxes in range(1, naxes_total + 1):
                    # some orthants are occupied by a fragment of an edge
                    forbidden_orthants = set()
                    for indexa, indexb in neighbor_index_pairs:
                        # get the endpoints of the edge in the Euclidean subspace
                        pta = projection[indexa][:naxes]
                        ptb = projection[indexb][:naxes]
                        # look at the orthants blocked by the fragments of this edge
                        orthants = get_blocked_orthants(pta, ptb)
                        if orthants & forbidden_orthants:
                            if is_control:
                                ncontrol_fragment_fragment_collisions += 1
                            else:
                                nfragment_fragment_collisions += 1
                                msg = 'two edge fragments occupy the same orthant in %d dimensions in the tree %s' % (
                                    naxes, tree_string)
                                raise CounterexampleError(msg)
                        forbidden_orthants.update(orthants)
                    # compute the orthant of each vertex once for both of the checks below
                    vertex_orthants = [
                        point_to_orthant(projection[i][:naxes])
                        for i in range(npoints)]
                    # no vertex should share an orthant with an edge fragment
                    for orthant in vertex_orthants:
                        if orthant in forbidden_orthants:
                            if is_control:
                                ncontrol_fragment_vertex_collisions += 1
                            else:
                                nfragment_vertex_collisions += 1
                                msg = 'a vertex occupies the same orthant as an edge fragment in %d dimensions in the tree %s' % (
                                    naxes, tree_string)
                                raise CounterexampleError(msg)
                    # now partition the vertices by orthant and check their connectivity
                    orthant_to_id_set = {}
                    for i, orthant in enumerate(vertex_orthants):
                        orthant_to_id_set.setdefault(orthant, set()).add(
                            ordered_ids[i])
                    for id_set in orthant_to_id_set.values():
                        if not is_connected(tree, id_set):
                            if is_control:
                                ncontrol_vertex_connectivity_failures += 1
                            else:
                                nvertex_connectivity_failures += 1
                                msg = 'found disconnected vertices in an orthant in %d dimensions in the tree %s' % (
                                    naxes, tree_string)
                                raise CounterexampleError(msg)
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # an interrupt ends the sampling early but keeps the tallies so far
        pass
Ejemplo n.º 22
0
def _tally_axis_cuts(projection, axes, neighbor_index_pairs, bad_axis_dict, axis_to_ncuts_dict):
    """
    Update the per-axis tallies of one sampled tree for the requested axes.
    An axis is tallied as bad when at least one point has an absolute loading
    smaller than g_loading_epsilon on that axis.
    Otherwise the number of branches whose two endpoints have loadings
    of opposite sign on that axis is tallied.
    @param projection: a 2D array indexed as projection[point_index, axis]
    @param axes: an iterable of the axis indices to analyze
    @param neighbor_index_pairs: a collection of two-element collections of point indices, one per branch
    @param bad_axis_dict: maps an axis to the number of times it was bad; updated in place
    @param axis_to_ncuts_dict: maps an axis to a dict of {ncuts: number of observations}; updated in place
    """
    npoints = projection.shape[0]
    for axis in axes:
        # a loading too close to zero makes the sign of that point unreliable
        if any(abs(projection[i, axis]) < g_loading_epsilon for i in range(npoints)):
            bad_axis_dict[axis] = bad_axis_dict.get(axis, 0) + 1
            continue
        # a branch is cut by the axis when its endpoints have loadings of opposite sign
        ncuts = sum(
            1 for indexa, indexb in neighbor_index_pairs
            if projection[indexa, axis] * projection[indexb, axis] < 0)
        ncuts_dict = axis_to_ncuts_dict.setdefault(axis, {})
        ncuts_dict[ncuts] = ncuts_dict.get(ncuts, 0) + 1


def process(nseconds=None):
    """
    Repeatedly sample random trees and tally, per projection axis,
    how many branches are cut by a sign change of the axis loadings.
    Sampling stops when the time limit is exceeded or on a keyboard interrupt.
    @param nseconds: allow this many seconds to run or None to run forever
    @return: documented as a multi-line string that summarizes the results;
             NOTE(review): the code visible here ends after the sampling loop
             and never builds or returns that string -- confirm against the full source
    """
    start_time = time.time()
    nsampled_trees = 0
    # track the number of observations of each number of cuts on each axis for each hyperellipse
    internal_important_axis_to_ncuts_dict = {}
    internal_unimportant_axis_to_ncuts_dict = {}
    external_axis_to_ncuts_dict = {}
    # track the number of bad axes of each principality for each hyperellipse
    internal_important_bad_axis_dict = {}
    internal_unimportant_bad_axis_dict = {}
    external_bad_axis_dict = {}
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample an xtree with exponentially distributed branch lengths
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set()
            for parent in tree.preorder():
                parent_index = id_to_index[id(parent)]
                for child in parent.gen_children():
                    child_index = id_to_index[id(child)]
                    neighbor_index_pairs.add(frozenset((parent_index, child_index)))
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            # analyze the intersections of the axes of the ellipsoid that includes internal points
            internal_projection = do_internal_projection(D_full)
            npoints, naxes = internal_projection.shape
            # analyze low axes
            _tally_axis_cuts(
                internal_projection, range(0, nleaves - 1), neighbor_index_pairs,
                internal_important_bad_axis_dict, internal_important_axis_to_ncuts_dict)
            # analyze high axes
            _tally_axis_cuts(
                internal_projection, range(nleaves - 1, naxes), neighbor_index_pairs,
                internal_unimportant_bad_axis_dict, internal_unimportant_axis_to_ncuts_dict)
            # analyze the intersections of the axes of the ellipsoid that includes only leaf points
            external_projection = do_external_projection(D_full, nleaves)
            npoints, naxes = external_projection.shape
            _tally_axis_cuts(
                external_projection, range(naxes), neighbor_index_pairs,
                external_bad_axis_dict, external_axis_to_ncuts_dict)
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # ctrl-c is a deliberate way to stop sampling early; keep the tallies gathered so far
        pass
Ejemplo n.º 23
0
def process(ntaxa, nseconds, seqlen, nsamples, branch_length_sampler, use_pbar):
    """
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds
    @param seqlen: use this sequence length
    @param nsamples: stop after this many samples per sequence length
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    # initialize the global rejection counts
    nrejected_zero = 0
    nrejected_inf = 0
    nrejected_fail = 0
    naccepted = 0
    # Initialize the accumulation matrix.
    # The rows specify the size of the smaller side of the initial split.
    # The columns specify the compatibility status of the split.
    nsmall_sizes = (ntaxa / 2) + 1
    accum = np.zeros((nsmall_sizes, 2), dtype=np.int)
    # Repeatedly analyze samples.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = Progress.Bar(nsamples) if use_pbar else None
    try:
        for sample_index in range(nsamples):
            # keep trying to get an accepted sample
            while True:
                # check the time
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                # first sample a tree and get its set of informative splits
                tree = TreeSampler.sample_agglomerated_tree(ntaxa)
                true_splits = tree.get_nontrivial_splits()
                # sample the branch lengths
                for branch in tree.get_branches():
                    branch.length = branch_length_sampler()
                # Attempt to sample a distance matrix.
                # If the sample was rejected then note the reason and go back to the drawing board.
                try:
                    D = sample_distance_matrix(tree, seqlen)
                except InfiniteDistanceError as e:
                    nrejected_inf += 1
                    continue
                except ZeroDistanceError as e:
                    nrejected_zero += 1
                    continue
                # Attempt to estimate the primary split of the tree from the distance matrix.
                # If there was a technical failure then note it and go back to the drawing board.
                # Otherwise note the compatibility and balance of the split.
                try:
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    small_size = min(len(side) for side in eigensplit)
                    if eigensplit in true_splits:
                        compatibility = 1
                    else:
                        compatibility = 0
                except BuildTreeTopology.DegenerateSplitException, e:
                    small_size = 0
                    compatibility = 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    nrejected_fail += 1
                    continue
Ejemplo n.º 24
0
def _write_section(out, value, *title_lines):
    """
    Append one report section to the output stream.
    The section is each title on its own line, then str(value), then one blank line.
    @param out: a writable text stream
    @param value: the object to show; it is converted with str at the time of the call
    @param title_lines: the heading lines of the section
    """
    for title in title_lines:
        out.write(title + '\n')
    out.write(str(value) + '\n')
    out.write('\n')


def process(ntaxa):
    """
    Sample one random tree and report several projections of its vertices.
    @param ntaxa: the number of leaves in the sampled tree
    @return: a multi-line string showing the intermediate matrices
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # sample an xtree topology
    xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
    # sample an xtree with exponentially distributed branch lengths
    mu = 2.0
    for branch in xtree.get_branches():
        branch.length = random.expovariate(1 / mu)
    # convert the xtree to a FelTree so we can use the internal vertices
    tree_string = xtree.get_newick_string()
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # get ordered ids and the number of leaves
    ordered_ids = get_ordered_ids(tree)
    nleaves = len(list(tree.gen_tips()))
    # the number of leading axes that are kept by every projection below
    nkept = nleaves - 1
    # get the distance matrix relating all of the points
    D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
    # Now do the projection so that
    # the resulting points are in the subspace whose basis vectors are the axes of the leaf ellipsoid.
    # First get the points such that the n rows in X are points in n-1 dimensional space.
    X = Euclid.edm_to_points(D_full)
    _write_section(out, X, 'points with centroid at origin:')
    # Translate all of the points so that the origin is at the centroid of the leaves.
    X -= np.mean(X[:nleaves], 0)
    _write_section(out, X, 'points with centroid of leaves at origin:')
    # Extract the subset of points that define the leaves.
    L = X[:nleaves]
    # Find the orthogonal transformation of the leaves onto their MDS axes.
    # According to the python svd documentation, singular values are sorted most important to least important.
    U, s, Vt = np.linalg.svd(L)
    # Transform all of the points (including the internal vertices) according to this orthogonal transformation.
    # The axes are now the axes of the Steiner circumscribed ellipsoid of the leaf vertices.
    Z = np.dot(X, Vt.T)
    _write_section(out, Z, 'orthogonally transformed points (call this Z):')
    # keep only the leading columns of Z
    Y = Z[:, :nkept]
    _write_section(
        out, Y,
        'projection of the points onto the axes of the leaf ellipsoid,',
        '(these are the first columns of Z; call this projected matrix Y):')
    # Show the inner products.
    inner_products_of_columns = np.dot(Y.T, Y)
    _write_section(
        out, inner_products_of_columns,
        "pairwise inner products of the columns of Y (that is, Y'Y)")
    # Show other inner products.
    # NOTE(review): Y[:5] restricts the product to the first five rows (points) of Y,
    # whereas the label below speaks of columns -- confirm which was intended.
    inner_products_of_columns = np.dot(Y[:5].T, Y[:5])
    _write_section(
        out, inner_products_of_columns,
        "pairwise inner products of the first few columns of Y")
    # Extract the subset of points that define the points of articulation.
    # Note that the origin is the centroid of the leaves.
    R = X[nleaves:]
    Y_leaves = Y[:nleaves]
    W = np.dot(np.linalg.pinv(L), Y_leaves)
    _write_section(
        out, np.dot(L, W),
        'leaf projection using pseudoinverse (first few rows of Y):')
    _write_section(
        out, np.dot(R, W),
        'projection of points of articulation using pseudoinverse (remaining rows of Y):')
    # Get all of the points in high dimensional space.
    X = Euclid.edm_to_points(D_full)
    # Get the MDS onto the lower dimensional space.
    X = X[:, :nkept]
    assert np.allclose(sum(X, 0), 0)
    _write_section(
        out, X,
        'all points projected onto the first principal axes of the full ellipsoid:')
    # Look at only the leaves in this space.
    # NOTE(review): L is a slice of X, so if X is a numpy array the in-place centering
    # below also shifts the leaf rows of X that are reused for D_all -- confirm this is intended.
    L = X[:nleaves]
    L -= np.mean(L, 0)
    _write_section(
        out, L,
        'leaves projected onto the first principal axes of the full ellipsoid and then centered:')
    # Re-project the leaves onto the axes of leaf ellipsoid.
    D_leaves = Euclid.dccov_to_edm(np.dot(L, L.T))
    Y = Euclid.edm_to_points(D_leaves)
    _write_section(
        out, Y,
        'leaves further projected onto principal axes of their own ellipsoid:')
    # Try something else
    D_all = Euclid.dccov_to_edm(np.dot(X, X.T))
    Y = Euclid.edm_to_points(D_all)[:, :nkept]
    _write_section(
        out, Y,
        'all points further projected onto their own principal axes of inertia:')
    # Try the same thing some more
    D_again = Euclid.dccov_to_edm(np.dot(Y, Y.T))
    Z = Euclid.edm_to_points(D_again)[:, :nkept]
    _write_section(
        out, Z,
        'all points further projected onto their own principal axes of inertia (second iteration):')
    return out.getvalue().strip()