Ejemplo n.º 1
0
def fitch_map_mutations(tree, genotypes, alleles):
    """
    Compute a Fitch parsimony reconstruction of ``genotypes`` on ``tree``.

    :param tree: The tree to reconstruct over; its nodes, roots, children and
        sample status are queried, along with ``tree.tree_sequence.samples()``.
    :param genotypes: Integer allele indexes, paired in order with
        ``tree.tree_sequence.samples()``. A value of -1 marks missing data,
        which is treated as compatible with every allele.
    :param alleles: Sequence indexed by allele index, used to translate the
        reconstructed states into the returned values.
    :return: A tuple ``(ancestral_state, mutations)``. ``mutations`` is a list
        of ``tskit.Mutation`` objects whose ``node`` is the node over which the
        transition occurs, whose ``parent`` is the index (within the returned
        list) of the mutation above it on the tree, or ``tskit.NULL`` if there
        is none, and whose ``derived_state`` is the new state.
    :raises ValueError: If every genotype is missing.
    """
    genotypes = np.array(genotypes)
    observed = genotypes != -1
    if not np.any(observed):
        raise ValueError("Must have at least one non-missing genotype")
    num_alleles = np.max(genotypes[observed]) + 1

    # Sets of states are encoded as rows of 0/1 flags: state_sets[u, j] == 1
    # means that allele j belongs to the Fitch set of node u.
    state_sets = np.zeros((tree.num_nodes, num_alleles), dtype=np.int8)
    for sample, allele in zip(tree.tree_sequence.samples(), genotypes):
        if allele == -1:
            # Missing data: every allele is acceptable at this sample.
            state_sets[sample] = 1
        else:
            state_sets[sample, allele] = 1

    # Bottom-up pass: a non-sample node takes the intersection of its
    # children's sets, falling back to their union if that is empty.
    for u in tree.nodes(order="postorder"):
        if tree.is_sample(u):
            continue
        state_sets[u] = 1
        for child in tree.children(u):
            state_sets[u] = np.logical_and(state_sets[u], state_sets[child])
        if not np.any(state_sets[u]):
            for child in tree.children(u):
                state_sets[u] = np.logical_or(state_sets[u], state_sets[child])

    # The ancestral state is the lowest-indexed allele present at any root.
    root_union = np.zeros_like(state_sets[0])
    for root in tree.roots:
        root_union = np.logical_or(root_union, state_sets[root])
    ancestral_state = np.flatnonzero(root_union)[0]

    # Top-down pass: a node keeps the state of its parent whenever that state
    # is in its own set; otherwise a mutation to the node's lowest-indexed
    # state is recorded. Each stack entry carries the node, the state assigned
    # to it and the index of the nearest mutation above it.
    mutations = []
    for root in tree.roots:
        root_state = ancestral_state
        root_mutation = tskit.NULL
        if state_sets[root, ancestral_state] != 1:
            root_state = np.flatnonzero(state_sets[root])[0]
            root_mutation = len(mutations)
            mutations.append(tskit.Mutation(
                node=root, parent=tskit.NULL, derived_state=alleles[root_state]))
        pending = [(root, root_state, root_mutation)]
        while pending:
            u, u_state, mutation_above = pending.pop()
            for child in tree.children(u):
                if state_sets[child, u_state] == 1:
                    pending.append((child, u_state, mutation_above))
                    continue
                child_state = np.flatnonzero(state_sets[child])[0]
                pending.append((child, child_state, len(mutations)))
                mutations.append(tskit.Mutation(
                    node=child,
                    parent=mutation_above,
                    derived_state=alleles[child_state]))
    return alleles[ancestral_state], mutations
Ejemplo n.º 2
0
 def test_mutation_over_0_5(self):
     """Genotypes [1, 0, 0, 0, 0, 1] map to mutations over nodes 0 and 5."""
     # Regression test for the bug reported in
     # https://github.com/tskit-dev/tskit/issues/987
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [1, 0, 0, 0, 0, 1])
     assert ancestral_state == "0"
     expected = [
         tskit.Mutation(node=0, derived_state="1"),
         tskit.Mutation(node=5, derived_state="1"),
     ]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 3
0
 def test_multi_mutation_missing_data(self):
     """Three alleles plus one missing genotype yield two nested mutations."""
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [1, 2, -1, 0, 0])
     self.assertEqual(ancestral_state, "0")
     # The second mutation (over node 1) sits below the first (over node 5).
     expected = [
         tskit.Mutation(node=5, parent=-1, derived_state="1"),
         tskit.Mutation(node=1, parent=0, derived_state="2"),
     ]
     self.assertEqual(len(transitions), len(expected))
     for index, wanted in enumerate(expected):
         self.assertEqual(transitions[index], wanted)
Ejemplo n.º 4
0
 def test_mutation_over_7_0(self):
     """Genotypes [2, 1, 0, 0, 1] give a mutation over 7 and a child over 0."""
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [2, 1, 0, 0, 1])
     self.assertEqual(ancestral_state, "0")
     # The mutation over node 0 has the mutation over node 7 as its parent.
     expected = [
         tskit.Mutation(node=7, parent=-1, derived_state="1"),
         tskit.Mutation(node=0, parent=0, derived_state="2"),
     ]
     self.assertEqual(len(transitions), len(expected))
     for index, wanted in enumerate(expected):
         self.assertEqual(transitions[index], wanted)
Ejemplo n.º 5
0
 def test_three_states_freq_n_minus_2(self, n):
     """On an n-leaf star, two singleton alleles each get their own mutation."""
     tree = tskit.Tree.generate_star(n)
     # All genotypes are allele 0 except the first two entries.
     genotypes = np.zeros(n, dtype=np.int8)
     genotypes[0] = 1
     genotypes[1] = 2
     ancestral_state, transitions = self.do_map_mutations(tree, genotypes)
     assert ancestral_state == "0"
     expected = [
         tskit.Mutation(node=1, derived_state="2"),
         tskit.Mutation(node=0, derived_state="1"),
     ]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 6
0
 def test_three_clades(self):
     """Three runs of nine equal genotypes need exactly two mutations."""
     # Positions 0-8 carry allele 0, 9-17 allele 1 and 18-26 allele 2.
     genotypes = np.zeros(27, dtype=int)
     genotypes[9:18] = 1
     genotypes[18:27] = 2
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, genotypes)
     assert ancestral_state == "0"
     expected = [
         tskit.Mutation(node=38, derived_state="2"),
         tskit.Mutation(node=34, derived_state="1"),
     ]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 7
0
 def test_nonzero_ancestral_state(self):
     """When allele 1 dominates, it is ancestral and the 0s are derived."""
     # Every genotype is allele 1 except the first and the last.
     genotypes = np.ones(27, dtype=int)
     genotypes[[0, 26]] = 0
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, genotypes)
     assert ancestral_state == "1"
     expected = [
         tskit.Mutation(node=26, derived_state="0"),
         tskit.Mutation(node=0, derived_state="0"),
     ]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 8
0
 def test_mutation_over_7_back_mutation_4(self):
     """Expects a mutation over node 7 followed by a back mutation over 4."""
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [1, 0, 1, 0, 0, 1])
     assert ancestral_state == "0"
     # The back mutation to "0" is a child of the first mutation (index 0).
     expected = [
         tskit.Mutation(node=7, derived_state="1"),
         tskit.Mutation(node=4, derived_state="0", parent=0),
     ]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 9
0
 def test_mutation_over_7_0_alleles(self):
     """Custom allele labels are used for ancestral and derived states."""
     labels = ["ANC", "ONE", "TWO"]
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [2, 1, 0, 0, 1], labels)
     self.assertEqual(ancestral_state, "ANC")
     # The mutation over node 0 has the mutation over node 7 as its parent.
     expected = [
         tskit.Mutation(node=7, parent=-1, derived_state="ONE"),
         tskit.Mutation(node=0, parent=0, derived_state="TWO"),
     ]
     self.assertEqual(len(transitions), len(expected))
     for index, wanted in enumerate(expected):
         self.assertEqual(transitions[index], wanted)
Ejemplo n.º 10
0
 def test_multi_mutation_missing_data(self):
     """Three alleles plus one missing genotype yield two nested mutations."""
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [1, 2, -1, 0, 0])
     assert ancestral_state == "0"
     # The second mutation (over node 1) sits below the first (over node 5).
     expected = [
         tskit.Mutation(node=5, parent=-1, derived_state="1"),
         tskit.Mutation(node=1, parent=0, derived_state="2"),
     ]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 11
0
 def test_mutation_over_7_0_alleles(self):
     """Custom allele labels are used for ancestral and derived states."""
     labels = ["ANC", "ONE", "TWO"]
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [2, 1, 0, 0, 1], labels)
     assert ancestral_state == "ANC"
     # The mutation over node 0 has the mutation over node 7 as its parent.
     expected = [
         tskit.Mutation(node=7, parent=-1, derived_state="ONE"),
         tskit.Mutation(node=0, parent=0, derived_state="TWO"),
     ]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 12
0
 def test_mutation_over_27_29(self):
     """Allele 1 at positions 0-2 and 6-8 is explained with a back mutation."""
     genotypes = np.zeros(27, dtype=int)
     for start in (0, 6):
         genotypes[start:start + 3] = 1
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, genotypes)
     assert ancestral_state == "0"
     # The algorithm chooses one mutation over node 30 plus a back mutation
     # over node 28 beneath it.
     expected = [
         tskit.Mutation(node=30, derived_state="1"),
         tskit.Mutation(node=28, derived_state="0", parent=0),
     ]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 13
0
 def test_mutation_over_0_missing_data_4(self):
     """A missing final genotype adds no mutation beyond the one over 0."""
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [1, 0, 0, 0, -1])
     self.assertEqual(ancestral_state, "0")
     expected = [tskit.Mutation(node=0, parent=-1, derived_state="1")]
     self.assertEqual(len(transitions), len(expected))
     for index, wanted in enumerate(expected):
         self.assertEqual(transitions[index], wanted)
Ejemplo n.º 14
0
 def test_mutation_over_6(self):
     """Genotypes [1, 0, 1, 0, 1, 0] need a single mutation over node 6."""
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [1, 0, 1, 0, 1, 0])
     assert ancestral_state == "0"
     expected = [tskit.Mutation(node=6, derived_state="1")]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 15
0
 def test_mutation_over_0_missing_data_4(self):
     """A missing final genotype adds no mutation beyond the one over 0."""
     ancestral_state, transitions = self.do_map_mutations(
         self.tree, [1, 0, 0, 0, -1])
     assert ancestral_state == "0"
     expected = [tskit.Mutation(node=0, parent=-1, derived_state="1")]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 16
0
 def test_missing_data(self, n):
     """On an n-leaf star, a missing genotype does not produce a mutation."""
     tree = tskit.Tree.generate_star(n)
     # First genotype is missing, second is allele 1, the rest are allele 0.
     genotypes = np.zeros(n, dtype=np.int8)
     genotypes[0] = tskit.MISSING_DATA
     genotypes[1] = 1
     ancestral_state, transitions = self.do_map_mutations(tree, genotypes)
     assert ancestral_state == "0"
     expected = [tskit.Mutation(node=1, derived_state="1")]
     assert len(transitions) == len(expected)
     for index, wanted in enumerate(expected):
         assert transitions[index] == wanted
Ejemplo n.º 17
0
 def make_mutation(id_):
     """Build a ``tskit.Mutation`` from the row ``tree_sequence.get_mutation(id_)``."""
     row = tree_sequence.get_mutation(id_)
     # The row is expected to hold exactly these five fields, in this order.
     site, node, derived_state, parent, metadata = row
     mutation = tskit.Mutation(
         id_=id_,
         site=site,
         node=node,
         derived_state=derived_state,
         parent=parent,
         metadata=metadata,
     )
     return mutation
Ejemplo n.º 18
0
 def make_mutation(id_):
     """Build a ``tskit.Mutation`` from the row ``ll_ts.get_mutation(id_)``."""
     row = ll_ts.get_mutation(id_)
     # The row is expected to hold exactly these five fields, in this order.
     site, node, derived_state, parent, metadata = row
     mutation = tskit.Mutation(
         id_=id_,
         site=site,
         node=node,
         derived_state=derived_state,
         parent=parent,
         metadata=metadata,
     )
     return mutation
Ejemplo n.º 19
0
    def test_mutation_over_leaf_sibling_missing(self):
        """A derived genotype next to a missing one maps to one mutation over 6."""
        # The mutation is placed over node 6, so allele 1 is imputed for the
        # missing genotype as well. This is surprising: it might seem more
        # parsimonious to give the missing data the ancestral state. However,
        # the number of *state changes* is the same, and that is what the
        # algorithm is minimising.
        expected = tskit.Mutation(node=6, parent=-1, derived_state="1")
        # The second case swaps the derived and missing genotypes; the
        # result is the same.
        for genotypes in ([0, 0, 1, -1, 0], [0, 0, -1, 1, 0]):
            ancestral_state, transitions = self.do_map_mutations(
                self.tree, genotypes)
            self.assertEqual(ancestral_state, "0")
            self.assertEqual(len(transitions), 1)
            self.assertEqual(transitions[0], expected)
Ejemplo n.º 20
0
 def make_mutation(id_):
     """Build a ``tskit.Mutation`` from the row ``ll_ts.get_mutation(id_)``.

     The metadata is passed on in encoded form, together with a decoder
     obtained from the mutation metadata schema reported by ``ll_ts``.
     """
     row = ll_ts.get_mutation(id_)
     # The row is expected to hold exactly these five fields, in this order.
     site, node, derived_state, parent, metadata = row
     schema = tskit.metadata.parse_metadata_schema(
         ll_ts.get_table_metadata_schemas().mutation)
     return tskit.Mutation(
         id_=id_,
         site=site,
         node=node,
         derived_state=derived_state,
         parent=parent,
         encoded_metadata=metadata,
         metadata_decoder=schema.decode_row,
     )
Ejemplo n.º 21
0
def hartigan_map_mutations(tree, genotypes, alleles):
    """
    Compute a Hartigan parsimony reconstruction of ``genotypes`` on ``tree``.

    :param tree: The tree to reconstruct over; its nodes, roots, children and
        sample status are queried, along with ``tree.tree_sequence``.
    :param genotypes: Integer allele indexes, paired in order with
        ``tree.tree_sequence.samples()``. A value of -1 marks missing data,
        which is treated as compatible with every allele.
    :param alleles: Sequence indexed by allele index, used to translate the
        reconstructed states into the returned values.
    :return: A tuple ``(ancestral_state, mutations)``. ``mutations`` is a list
        of ``tskit.Mutation`` objects whose ``node`` is the node over which the
        transition occurs, whose ``parent`` is the index (within the returned
        list) of the mutation above it on the tree, or -1 if there is none,
        and whose ``derived_state`` is the new state.
    :raises ValueError: If every genotype is missing.
    """
    genotypes = np.array(genotypes)
    observed = genotypes != -1
    if not np.any(observed):
        raise ValueError("Must have at least one non-missing genotype")
    num_alleles = np.max(genotypes[observed]) + 1
    num_nodes = tree.tree_sequence.num_nodes

    # Sets of states are stored as rows of 0/1 flags: optimal_set[u, j] == 1
    # means that allele j is in the optimal set of node u.
    optimal_set = np.zeros((num_nodes + 1, num_alleles), dtype=np.int8)
    for sample, allele in zip(tree.tree_sequence.samples(), genotypes):
        if allele == -1:
            # Missing data: every allele is acceptable at this sample.
            optimal_set[sample] = 1
        else:
            optimal_set[sample, allele] = 1

    # Bottom-up pass: a non-sample node keeps the alleles that occur in the
    # largest number of its children's optimal sets.
    for root in tree.roots:
        for u in tree.nodes(root, order="postorder"):
            if tree.is_sample(u):
                continue
            child_votes = np.zeros(num_alleles, dtype=int)
            for child in tree.children(u):
                child_votes += optimal_set[child]
            optimal_set[u, child_votes == np.max(child_votes)] = 1

    # The ancestral state is the lowest-indexed allele among those occurring
    # in the largest number of root optimal sets.
    root_votes = np.zeros(num_alleles, dtype=int)
    for root in tree.roots:
        root_votes += optimal_set[root]
    ancestral_state = np.argmax(root_votes == np.max(root_votes))

    # Top-down pass: a node inherits the state from above when that state is
    # in its optimal set; otherwise a mutation to its lowest-indexed optimal
    # state is recorded. Each stack entry carries the node, the state
    # inherited from above and the index of the nearest mutation above it.
    mutations = []
    for root in tree.roots:
        pending = [(root, ancestral_state, -1)]
        while pending:
            node, state, mutation_parent = pending.pop()
            if optimal_set[node, state] == 0:
                state = np.argmax(optimal_set[node])
                mutations.append(
                    tskit.Mutation(
                        node=node,
                        derived_state=alleles[state],
                        parent=mutation_parent,
                    )
                )
                mutation_parent = len(mutations) - 1
            for child in tree.children(node):
                pending.append((child, state, mutation_parent))
    return alleles[ancestral_state], mutations