def fitch_map_mutations(tree, genotypes, alleles):
    """
    Returns the Fitch parsimony reconstruction for the specified set of
    genotypes. The reconstruction is specified by returning the ancestral
    state and a list of mutations on the tree.

    :param tree: The tree to map the genotypes onto. It must provide
        ``tree_sequence``, ``nodes``, ``children``, ``is_sample`` and
        ``roots``.
    :param genotypes: One allele index per sample, paired in order with
        ``tree.tree_sequence.samples()``. The value -1 marks missing data.
    :param alleles: Sequence mapping an allele index to the state that is
        reported for it.
    :return: A tuple ``(ancestral_state, mutations)``. ``ancestral_state``
        is ``alleles[j]`` for the chosen root allele index ``j``.
        ``mutations`` is a list of ``tskit.Mutation`` objects; each has the
        node over which the transition occurs, the index in this list of
        the mutation above it on the tree (or -1 if there is none) as its
        parent, and the new state as its derived state.
    :raises ValueError: If every genotype is missing.
    """
    genotypes = np.array(genotypes)
    not_missing = genotypes != -1
    if not np.any(not_missing):
        raise ValueError("Must have at least one non-missing genotype")
    num_alleles = np.max(genotypes[not_missing]) + 1
    # Encode the set operations using a numpy array: A[u, j] == 1 means
    # allele j is in the state set of node u. Rows are indexed by node ID,
    # so size the table by the tree sequence's node count
    # (``Tree.num_nodes`` is deprecated in tskit).
    A = np.zeros((tree.tree_sequence.num_nodes, num_alleles), dtype=np.int8)
    for allele, u in zip(genotypes, tree.tree_sequence.samples()):
        if allele != -1:
            A[u, allele] = 1
        else:
            # Missing data is compatible with every allele.
            A[u] = 1

    # Bottom-up pass: a non-sample node takes the intersection of its
    # children's sets, falling back to their union when that is empty.
    for u in tree.nodes(order="postorder"):
        if tree.is_sample(u):
            continue
        A[u] = 1
        for v in tree.children(u):
            A[u] = np.logical_and(A[u], A[v])
        if not np.any(A[u]):
            for v in tree.children(u):
                A[u] = np.logical_or(A[u], A[v])

    # The ancestral state is the lowest allele index present in the union
    # of the sets of all roots.
    root_states = np.zeros_like(A[0])
    for root in tree.roots:
        root_states = np.logical_or(root_states, A[root])
    ancestral_state = np.where(root_states == 1)[0][0]

    # Top-down pass: a node keeps the state of its parent when that state
    # is in its own set; otherwise it switches to the lowest allele index
    # in its set and a mutation is recorded over it.
    mutations = []
    state = {}
    for root in tree.roots:
        state[root] = ancestral_state
        parent = tskit.NULL
        if A[root, ancestral_state] != 1:
            state[root] = np.where(A[root] == 1)[0][0]
            mutations.append(
                tskit.Mutation(
                    node=root,
                    parent=tskit.NULL,
                    derived_state=alleles[state[root]],
                )
            )
            parent = len(mutations) - 1
        # Each stack entry pairs a node with the index of the closest
        # mutation at or above it.
        stack = [(root, parent)]
        while stack:
            u, parent_mutation = stack.pop()
            for v in tree.children(u):
                state[v] = state[u]
                if A[v, state[u]] != 1:
                    state[v] = np.where(A[v] == 1)[0][0]
                    mutations.append(
                        tskit.Mutation(
                            node=v,
                            parent=parent_mutation,
                            derived_state=alleles[state[v]],
                        )
                    )
                    stack.append((v, len(mutations) - 1))
                else:
                    stack.append((v, parent_mutation))
    return alleles[ancestral_state], mutations
def test_mutation_over_0_5(self):
    # Regression test for the bug reported in
    # https://github.com/tskit-dev/tskit/issues/987
    # Expects one mutation over node 0 and one over node 5.
    observed = [1, 0, 0, 0, 0, 1]
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    expected = [
        tskit.Mutation(node=0, derived_state="1"),
        tskit.Mutation(node=5, derived_state="1"),
    ]
    assert root_allele == "0"
    assert len(muts) == 2
    for found, wanted in zip(muts, expected):
        assert found == wanted
def test_multi_mutation_missing_data(self):
    # The third genotype is -1 (missing). Expects a mutation over node 5
    # and, below it (parent=0), a second mutation over node 1.
    observed = [1, 2, -1, 0, 0]
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    expected = [
        tskit.Mutation(node=5, parent=-1, derived_state="1"),
        tskit.Mutation(node=1, parent=0, derived_state="2"),
    ]
    self.assertEqual(root_allele, "0")
    self.assertEqual(len(muts), 2)
    for found, wanted in zip(muts, expected):
        self.assertEqual(found, wanted)
def test_mutation_over_7_0(self):
    # Three alleles: expects a mutation over node 7 and, below it
    # (parent=0), a second mutation over node 0.
    observed = [2, 1, 0, 0, 1]
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    expected = [
        tskit.Mutation(node=7, parent=-1, derived_state="1"),
        tskit.Mutation(node=0, parent=0, derived_state="2"),
    ]
    self.assertEqual(root_allele, "0")
    self.assertEqual(len(muts), 2)
    for found, wanted in zip(muts, expected):
        self.assertEqual(found, wanted)
def test_three_states_freq_n_minus_2(self, n):
    # Star tree with n leaves: all samples carry allele 0 except sample 0
    # (allele 1) and sample 1 (allele 2).
    star = tskit.Tree.generate_star(n)
    observed = np.zeros(n, dtype=np.int8)
    observed[0] = 1
    observed[1] = 2
    root_allele, muts = self.do_map_mutations(star, observed)
    expected = [
        tskit.Mutation(node=1, derived_state="2"),
        tskit.Mutation(node=0, derived_state="1"),
    ]
    assert root_allele == "0"
    assert len(muts) == 2
    for found, wanted in zip(muts, expected):
        assert found == wanted
def test_three_clades(self):
    # Samples 0-8 carry allele 0, samples 9-17 allele 1 and samples
    # 18-26 allele 2.
    observed = np.zeros(27, dtype=int)
    observed[9:18] = 1
    observed[18:27] = 2
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    expected = [
        tskit.Mutation(node=38, derived_state="2"),
        tskit.Mutation(node=34, derived_state="1"),
    ]
    assert root_allele == "0"
    assert len(muts) == 2
    for found, wanted in zip(muts, expected):
        assert found == wanted
def test_nonzero_ancestral_state(self):
    # Every sample carries allele 1 except samples 0 and 26, so the
    # ancestral state is expected to be "1" rather than "0".
    observed = np.ones(27, dtype=int)
    observed[[0, 26]] = 0
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    expected = [
        tskit.Mutation(node=26, derived_state="0"),
        tskit.Mutation(node=0, derived_state="0"),
    ]
    assert root_allele == "1"
    assert len(muts) == 2
    for found, wanted in zip(muts, expected):
        assert found == wanted
def test_mutation_over_7_back_mutation_4(self):
    # Expects a mutation to "1" over node 7 and, below it (parent=0),
    # a mutation back to "0" over node 4.
    observed = [1, 0, 1, 0, 0, 1]
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    expected = [
        tskit.Mutation(node=7, derived_state="1"),
        tskit.Mutation(node=4, derived_state="0", parent=0),
    ]
    assert root_allele == "0"
    assert len(muts) == 2
    for found, wanted in zip(muts, expected):
        assert found == wanted
def test_mutation_over_7_0_alleles(self):
    # Passes explicit allele labels and expects them to appear in the
    # ancestral state and in the derived states of the mutations.
    observed = [2, 1, 0, 0, 1]
    labels = ["ANC", "ONE", "TWO"]
    root_allele, muts = self.do_map_mutations(self.tree, observed, labels)
    expected = [
        tskit.Mutation(node=7, parent=-1, derived_state="ONE"),
        tskit.Mutation(node=0, parent=0, derived_state="TWO"),
    ]
    self.assertEqual(root_allele, "ANC")
    self.assertEqual(len(muts), 2)
    for found, wanted in zip(muts, expected):
        self.assertEqual(found, wanted)
def test_multi_mutation_missing_data(self):
    # The third genotype is -1 (missing). Expects a mutation over node 5
    # and, below it (parent=0), a second mutation over node 1.
    observed = [1, 2, -1, 0, 0]
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    expected = [
        tskit.Mutation(node=5, parent=-1, derived_state="1"),
        tskit.Mutation(node=1, parent=0, derived_state="2"),
    ]
    assert root_allele == "0"
    assert len(muts) == 2
    for found, wanted in zip(muts, expected):
        assert found == wanted
def test_mutation_over_7_0_alleles(self):
    # Passes explicit allele labels and expects them to appear in the
    # ancestral state and in the derived states of the mutations.
    observed = [2, 1, 0, 0, 1]
    labels = ["ANC", "ONE", "TWO"]
    root_allele, muts = self.do_map_mutations(self.tree, observed, labels)
    expected = [
        tskit.Mutation(node=7, parent=-1, derived_state="ONE"),
        tskit.Mutation(node=0, parent=0, derived_state="TWO"),
    ]
    assert root_allele == "ANC"
    assert len(muts) == 2
    for found, wanted in zip(muts, expected):
        assert found == wanted
def test_mutation_over_27_29(self):
    # Samples 0-2 and 6-8 carry allele 1; everything else carries 0.
    observed = np.zeros(27, dtype=int)
    observed[0:3] = 1
    observed[6:9] = 1
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    # the algorithm chooses a back mutation instead
    expected = [
        tskit.Mutation(node=30, derived_state="1"),
        tskit.Mutation(node=28, derived_state="0", parent=0),
    ]
    assert root_allele == "0"
    assert len(muts) == 2
    for found, wanted in zip(muts, expected):
        assert found == wanted
def test_mutation_over_0_missing_data_4(self):
    # The last genotype is -1 (missing); only the mutation over node 0
    # is expected.
    observed = [1, 0, 0, 0, -1]
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    only_expected = tskit.Mutation(node=0, parent=-1, derived_state="1")
    self.assertEqual(root_allele, "0")
    self.assertEqual(len(muts), 1)
    self.assertEqual(muts[0], only_expected)
def test_mutation_over_6(self):
    # Alternating genotypes: a single mutation over node 6 is expected.
    observed = [1, 0, 1, 0, 1, 0]
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    only_expected = tskit.Mutation(node=6, derived_state="1")
    assert root_allele == "0"
    assert len(muts) == 1
    assert muts[0] == only_expected
def test_mutation_over_0_missing_data_4(self):
    # The last genotype is -1 (missing); only the mutation over node 0
    # is expected.
    observed = [1, 0, 0, 0, -1]
    root_allele, muts = self.do_map_mutations(self.tree, observed)
    only_expected = tskit.Mutation(node=0, parent=-1, derived_state="1")
    assert root_allele == "0"
    assert len(muts) == 1
    assert muts[0] == only_expected
def test_missing_data(self, n):
    # Star tree with n leaves: sample 0 is missing, sample 1 carries
    # allele 1 and all other samples carry allele 0.
    star = tskit.Tree.generate_star(n)
    observed = np.zeros(n, dtype=np.int8)
    observed[0] = tskit.MISSING_DATA
    observed[1] = 1
    root_allele, muts = self.do_map_mutations(star, observed)
    only_expected = tskit.Mutation(node=1, derived_state="1")
    assert root_allele == "0"
    assert len(muts) == 1
    assert muts[0] == only_expected
def make_mutation(id_):
    # Look up the mutation with this ID on the enclosing ``tree_sequence``
    # and wrap its five fields in a ``tskit.Mutation``.
    record = tree_sequence.get_mutation(id_)
    site_id, node_id, state, parent_id, raw_metadata = record
    return tskit.Mutation(
        id_=id_,
        site=site_id,
        node=node_id,
        derived_state=state,
        parent=parent_id,
        metadata=raw_metadata,
    )
def make_mutation(id_):
    # Look up the mutation with this ID on the enclosing ``ll_ts`` and
    # wrap its five fields in a ``tskit.Mutation``.
    record = ll_ts.get_mutation(id_)
    site_id, node_id, state, parent_id, raw_metadata = record
    return tskit.Mutation(
        id_=id_,
        site=site_id,
        node=node_id,
        derived_state=state,
        parent=parent_id,
        metadata=raw_metadata,
    )
def test_mutation_over_leaf_sibling_missing(self):
    # We assume that the mutation is over the parent of 2 and the missing
    # data, so we impute that 3 also has allele 1. This is surprising
    # behaviour to me: I would have thought it was more parsimonious to
    # assume that the missing data had the ancestral state. However, the
    # number of *state changes* is the same, which is what the algorithm
    # is minimising.
    #
    # The second case swaps the missing and the derived sample; the
    # expected result is the same.
    cases = (
        [0, 0, 1, -1, 0],
        [0, 0, -1, 1, 0],
    )
    for observed in cases:
        root_allele, muts = self.do_map_mutations(self.tree, observed)
        self.assertEqual(root_allele, "0")
        self.assertEqual(len(muts), 1)
        self.assertEqual(
            muts[0], tskit.Mutation(node=6, parent=-1, derived_state="1")
        )
def make_mutation(id_):
    # Look up the mutation with this ID on the enclosing ``ll_ts`` and
    # wrap it in a ``tskit.Mutation``. The metadata is passed on in its
    # encoded form together with a decoder built from the mutation table's
    # metadata schema.
    record = ll_ts.get_mutation(id_)
    site_id, node_id, state, parent_id, raw_metadata = record
    schema = tskit.metadata.parse_metadata_schema(
        ll_ts.get_table_metadata_schemas().mutation
    )
    return tskit.Mutation(
        id_=id_,
        site=site_id,
        node=node_id,
        derived_state=state,
        parent=parent_id,
        encoded_metadata=raw_metadata,
        metadata_decoder=schema.decode_row,
    )
def hartigan_map_mutations(tree, genotypes, alleles):
    """
    Returns a Hartigan parsimony reconstruction for the specified set of
    genotypes.

    ``genotypes`` holds one allele index per sample, paired in order with
    ``tree.tree_sequence.samples()``; the value -1 marks missing data.
    The result is a tuple ``(ancestral_state, mutations)``, where
    ``ancestral_state`` is the entry of ``alleles`` chosen for the roots and
    ``mutations`` is a list of ``tskit.Mutation`` objects. Each mutation
    has the node over which the transition occurs, the index in the list
    of the mutation above it on the tree (or -1 if there is none) as its
    parent, and the new state as its derived state.

    Raises a ValueError if every genotype is missing.
    """
    genotypes = np.array(genotypes)
    observed = genotypes != -1
    if np.sum(observed) == 0:
        raise ValueError("Must have at least one non-missing genotype")
    num_alleles = np.max(genotypes[observed]) + 1
    num_nodes = tree.tree_sequence.num_nodes

    # The set of optimal states of each node is stored as a row of 0/1
    # flags: optimal_set[u, j] == 1 means allele j is in the set of node u.
    # NOTE(review): the extra row presumably leaves room for a virtual
    # root - confirm.
    optimal_set = np.zeros((num_nodes + 1, num_alleles), dtype=np.int8)
    for u, allele in zip(tree.tree_sequence.samples(), genotypes):
        if allele == -1:
            # Missing data is compatible with every allele.
            optimal_set[u] = 1
        else:
            optimal_set[u, allele] = 1

    # Bottom-up pass: a non-sample node takes every allele that occurs in
    # the largest number of its children's sets.
    allele_count = np.zeros(num_alleles, dtype=int)
    for root in tree.roots:
        for u in tree.nodes(root, order="postorder"):
            allele_count[:] = 0
            for child in tree.children(u):
                allele_count += optimal_set[child]
            if tree.is_sample(u):
                continue
            optimal_set[u, allele_count == np.max(allele_count)] = 1

    # The ancestral state is the lowest allele index among those that
    # occur in the largest number of root sets.
    allele_count[:] = 0
    for root in tree.roots:
        allele_count += optimal_set[root]
    ancestral_state = np.argmax(allele_count == np.max(allele_count))

    # Top-down pass. Each stack entry is (node, state inherited from above,
    # index of the closest mutation above). A node whose set does not
    # contain the inherited state switches to the lowest allele index in
    # its set, and a mutation is recorded over it.
    mutations = []
    for root in tree.roots:
        pending = [(root, ancestral_state, -1)]
        while pending:
            node, state, mutation_parent = pending.pop()
            if optimal_set[node, state] == 0:
                state = np.argmax(optimal_set[node])
                mutations.append(
                    tskit.Mutation(
                        node=node,
                        derived_state=alleles[state],
                        parent=mutation_parent,
                    )
                )
                mutation_parent = len(mutations) - 1
            for child in tree.children(node):
                pending.append((child, state, mutation_parent))
    return alleles[ancestral_state], mutations