def test_with_mutations(self):
    """
    Simplifies the tables of a ``wf_sim`` run carrying Jukes-Cantor mutations
    and checks that nodes, edges, sites and mutations all survive, and that
    every haplotype has one entry per site.
    """
    N = 10
    ngens = 100
    tables = wf_sim(N=N, ngens=ngens, deep_history=False, seed=self.random_seed)
    tables.sort()
    ts = msprime.load_tables(**tables.asdict())
    ts = tsutil.jukes_cantor(ts, 10, 0.1, seed=self.random_seed)
    tables = ts.tables
    self.assertGreater(tables.sites.num_rows, 0)
    self.assertGreater(tables.mutations.num_rows, 0)
    samples = np.where(
        tables.nodes.flags == msprime.NODE_IS_SAMPLE)[0].astype(np.int32)
    tables.sort()
    tables.simplify(samples)
    # Each table is checked exactly once; the original repeated the
    # nodes/edges assertions verbatim.
    for table in (
            tables.nodes, tables.edges, tables.sites, tables.mutations):
        self.assertGreater(table.num_rows, 0)
    ts = msprime.load_tables(**tables.asdict())
    self.assertEqual(ts.sample_size, N)
    for hap in ts.haplotypes():
        self.assertEqual(len(hap), ts.num_sites)
def test_with_recurrent_mutations(self):
    """
    Places Jukes-Cantor mutations on a single site and checks the tables
    and haplotypes both before and after simplification.
    """
    # actually with only ONE site, at 0.0
    N = 10
    ngens = 100
    sim_tables = wf_sim(
        N=N, ngens=ngens, deep_history=False, seed=self.random_seed)
    msprime.sort_tables(**sim_tables.asdict())
    unmutated = msprime.load_tables(**sim_tables.asdict())
    mutated = tsutil.jukes_cantor(unmutated, 1, 10, seed=self.random_seed)
    tables = mutated.tables
    self.assertEqual(tables.sites.num_rows, 1)
    self.assertGreater(tables.mutations.num_rows, 0)
    is_sample = tables.nodes.flags == msprime.NODE_IS_SAMPLE
    samples = np.where(is_sample)[0].astype(np.int32)

    # before simplify
    for haplotype in mutated.haplotypes():
        self.assertEqual(len(haplotype), 1)

    # after simplify
    msprime.simplify_tables(
        samples=samples,
        nodes=tables.nodes,
        edges=tables.edges,
        sites=tables.sites,
        mutations=tables.mutations)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)
    self.assertEqual(tables.sites.num_rows, 1)
    self.assertGreater(tables.mutations.num_rows, 0)
    simplified = msprime.load_tables(**tables.asdict())
    self.assertEqual(simplified.sample_size, N)
    for haplotype in simplified.haplotypes():
        self.assertEqual(len(haplotype), simplified.num_sites)
def run_vcf(args):
    """
    Loads the tree sequence in ``args.file``, truncates its site and mutation
    tables to the first ``args.num_sites`` rows, writes the result as VCF to
    an in-memory buffer and prints the size written together with a linear
    extrapolation of the size for all sites in the original file.

    :param args: Object providing ``file`` (path passed to ``msprime.load``)
        and ``num_sites`` (number of leading rows to keep).
    """
    ts = msprime.load(args.file)
    total_sites = ts.num_sites

    # Subset the tree sequence down to num_sites.
    # NOTE(review): the mutation columns are truncated with the same row
    # count as the sites, and the state columns are sliced by row count
    # rather than by offset. This presumably relies on one single-character
    # mutation per site, stored in site order -- confirm against the inputs.
    num_sites = args.num_sites
    t = ts.dump_tables()
    t.sites.set_columns(
        position=t.sites.position[:num_sites],
        ancestral_state=t.sites.ancestral_state[:num_sites],
        ancestral_state_length=t.sites.ancestral_state_length[:num_sites])
    t.mutations.set_columns(
        site=t.mutations.site[:num_sites],
        node=t.mutations.node[:num_sites],
        derived_state=t.mutations.derived_state[:num_sites],
        derived_state_length=t.mutations.derived_state_length[:num_sites])
    ts = msprime.load_tables(**t.asdict())
    print("subset down to ", ts.num_sites, "sites")

    megabyte = 1024 * 1024
    terabyte = megabyte * 1024 * 1024
    with io.StringIO() as output:
        ts.write_vcf(output)
        size = output.tell()
    print("Wrote {:.2f} MiB".format(size / megabyte))

    # Extrapolate from the number of sites actually written. The requested
    # count may exceed what the file holds (which would skew the estimate)
    # or be zero (which would divide by zero).
    written_sites = ts.num_sites
    if written_sites > 0:
        projected = (size / written_sites) * total_sites
        print("Estimate {:.2f} TiB".format(projected / terabyte))
    else:
        print("No sites written; cannot estimate the full size")
def insert_branch_mutations(ts, mutations_per_branch=1):
    """
    Returns a copy of the specified tree sequence with a mutation on every
    branch in every tree.

    One site is created per tree, positioned at the left end of the tree's
    interval, with ancestral state '0'. Each branch receives
    ``mutations_per_branch`` mutations that alternate the state between
    '0' and '1'; each mutation's parent is the previous mutation on the path
    to the root (or -1 if there is none).
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site = len(sites)
        sites.add_row(position=tree.interval[0], ancestral_state='0')
        for root in tree.roots:
            # state[u] is the allelic state at node u; mutation[u] is the ID
            # of the most recent mutation at or above u (-1 if none).
            state = {root: 0}
            mutation = {root: -1}
            stack = [root]
            while stack:
                u = stack.pop()
                stack.extend(tree.children(u))
                v = tree.parent(u)
                if v != msprime.NULL_NODE:
                    state[u] = state[v]
                    parent = mutation[v]
                    # Inherit the parent's mutation by default so that
                    # descendants can still look it up when
                    # mutations_per_branch is 0 (previously a KeyError).
                    mutation[u] = parent
                    for _ in range(mutations_per_branch):
                        state[u] = (state[u] + 1) % 2
                        mutation[u] = len(mutations)
                        mutations.add_row(
                            site=site, node=u, derived_state=str(state[u]),
                            parent=parent)
                        parent = mutation[u]
    tables = ts.tables
    add_provenance(tables.provenances, "insert_branch_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites,
        mutations=mutations, provenances=tables.provenances)
def get_common_mutations_ts(args, tree_sequence, log):
    """
    Builds a tree sequence with the same nodes and edges as
    ``tree_sequence`` but keeping only the sites whose first mutation has a
    frequency strictly between ``args.maf`` and ``1 - args.maf``.
    """
    kept_sites = msprime.SiteTable()
    kept_mutations = msprime.MutationTable()

    # Get the mutations > MAF.
    sample_size = tree_sequence.get_sample_size()
    log.log('Determining sites > MAF cutoff {m}'.format(m=args.maf))

    for tree in tree_sequence.trees():
        for site in tree.sites():
            first_mutation = site.mutations[0]
            frequency = tree.get_num_leaves(first_mutation.node) / sample_size
            if not (args.maf < frequency < 1 - args.maf):
                continue
            new_site_id = kept_sites.add_row(
                position=site.position,
                ancestral_state=site.ancestral_state)
            kept_mutations.add_row(
                site=new_site_id,
                node=site.mutations[0].node,
                derived_state=site.mutations[0].derived_state)

    source_tables = tree_sequence.dump_tables()
    return msprime.load_tables(
        nodes=source_tables.nodes,
        edges=source_tables.edges,
        sites=kept_sites,
        mutations=kept_mutations)
def resolve_polytomies(ts, polytomy_func):
    """
    Groups the edgesets of ``ts`` by parent and hands each group to
    ``polytomy_func`` for polytomy resolution, then loads a new tree
    sequence from the resulting tables.

    polytomy_func should take a set of edge records, and an edgesets and a
    nodes object to be added to. It is called as
    ``polytomy_func(edge_records, new_edgesets, nodes)``, where
    ``edge_records`` is a list of lists: all records share one parent, and
    each inner list is a block of contiguous records.
    """
    new_edgesets = msprime.EdgesetTable()
    nodes, mutations = get_nodes_and_mutations(ts)
    # Store the edge records per parent, split into contiguous blocks.
    edge_records = [[]]
    for e in ts.edgesets():  # assume records are in order
        if not edge_records[0]:
            # Very first record: nothing to compare against yet.
            edge_records[0].append(e)
        elif e.parent == edge_records[0][0].parent:
            # NOTE(review): this treats a record as contiguous when its
            # right end meets the previous record's left end. If records
            # are ordered by increasing left coordinate the comparison
            # should probably be e.left == previous.right -- confirm.
            if e.right == edge_records[-1][-1].left:
                # contiguous with the last record
                edge_records[-1].append(e)
            else:
                # this is the same parent, but not contiguous
                edge_records.append([e])
        else:
            # Submit records for polytomy resolution - may require new nodes
            # to be created.
            polytomy_func(edge_records, new_edgesets, nodes)
            edge_records = [[e]]
    if edge_records[0]:
        # Flush the records collected for the last parent.
        polytomy_func(edge_records, new_edgesets, nodes)
    return msprime.load_tables(
        nodes=nodes, edgesets=new_edgesets, mutations=mutations)
def test_pickle(self):
    """
    Stores pickled Python objects as node metadata and checks that they
    round-trip through the node table, a loaded tree sequence and a file.
    """
    sim = msprime.simulate(10, random_seed=1)
    tables = sim.dump_tables()
    nodes = tables.nodes
    # For each node, we create some Python metadata that can be pickled
    metadata = []
    for j in range(len(nodes)):
        metadata.append({"one": j, "two": 2 * j, "three": list(range(j))})
    pickled = [pickle.dumps(entry) for entry in metadata]
    encoded, offset = msprime.pack_bytes(pickled)
    nodes.set_columns(
        flags=nodes.flags,
        time=nodes.time,
        population=nodes.population,
        metadata_offset=offset,
        metadata=encoded)
    self.assertTrue(np.array_equal(nodes.metadata_offset, offset))
    self.assertTrue(np.array_equal(nodes.metadata, encoded))

    ts1 = msprime.load_tables(nodes=nodes, edges=tables.edges)
    for expected, node in zip(metadata, ts1.nodes()):
        self.assertEqual(pickle.loads(node.metadata), expected)

    ts1.dump(self.temp_file)
    ts2 = msprime.load(self.temp_file)
    self.assertEqual(ts1.tables.nodes, ts2.tables.nodes)
def strip_singletons(ts, maf):
    """
    Returns a tree sequence with the nodes and edges of ``ts`` that keeps
    only the sites whose single mutation sits above more than one sample
    and has a frequency greater than ``maf``.

    :param ts: The input tree sequence. Every site must carry exactly one
        mutation.
    :param maf: Frequency threshold; sites at or below it are dropped.
    :return: The filtered tree sequence.
    """
    sample_size = ts.get_sample_size()
    kept_sites = msprime.SiteTable()
    kept_mutations = msprime.MutationTable()
    for tree in ts.trees():
        for site in tree.sites():
            # Only supports infinite sites muts.
            assert len(site.mutations) == 1
            mutation = site.mutations[0]
            frequency = tree.get_num_leaves(mutation.node) / sample_size
            if tree.num_samples(mutation.node) <= 1:
                continue
            if not (frequency > maf):
                continue
            new_site_id = kept_sites.add_row(
                position=site.position,
                ancestral_state=site.ancestral_state)
            kept_mutations.add_row(
                site=new_site_id,
                node=mutation.node,
                derived_state=mutation.derived_state)
    source_tables = ts.dump_tables()
    return msprime.load_tables(
        nodes=source_tables.nodes,
        edges=source_tables.edges,
        sites=kept_sites,
        mutations=kept_mutations)
def add_random_metadata(ts, seed=1, max_length=10):
    """
    Returns a copy of the specified tree sequence with random metadata
    assigned to the nodes, sites, mutations, individuals and populations.
    """
    tables = ts.dump_tables()
    np.random.seed(seed)

    def random_metadata(num_rows):
        # Draws a random length in [0, max_length) for each row and then
        # that many random bytes in total; returns (offset, metadata).
        lengths = np.random.randint(0, max_length, num_rows)
        offsets = np.cumsum(np.hstack(([0], lengths)), dtype=np.uint32)
        # Older versions of numpy didn't have a dtype argument for randint,
        # so must use astype instead.
        values = np.random.randint(-127, 127, offsets[-1]).astype(np.int8)
        return offsets, values

    # The tables are processed in a fixed order so that the stream of random
    # numbers is consumed identically for a given seed.
    offset, metadata = random_metadata(ts.num_nodes)
    nodes = tables.nodes
    nodes.set_columns(
        flags=nodes.flags,
        population=nodes.population,
        time=nodes.time,
        metadata_offset=offset,
        metadata=metadata,
        individual=nodes.individual)

    offset, metadata = random_metadata(ts.num_sites)
    sites = tables.sites
    sites.set_columns(
        position=sites.position,
        ancestral_state=sites.ancestral_state,
        ancestral_state_offset=sites.ancestral_state_offset,
        metadata_offset=offset,
        metadata=metadata)

    offset, metadata = random_metadata(ts.num_mutations)
    mutations = tables.mutations
    mutations.set_columns(
        site=mutations.site,
        node=mutations.node,
        parent=mutations.parent,
        derived_state=mutations.derived_state,
        derived_state_offset=mutations.derived_state_offset,
        metadata_offset=offset,
        metadata=metadata)

    offset, metadata = random_metadata(ts.num_individuals)
    individuals = tables.individuals
    individuals.set_columns(
        flags=individuals.flags,
        location=individuals.location,
        location_offset=individuals.location_offset,
        metadata_offset=offset,
        metadata=metadata)

    offset, metadata = random_metadata(ts.num_populations)
    populations = tables.populations
    populations.set_columns(metadata_offset=offset, metadata=metadata)

    add_provenance(tables.provenances, "add_random_metadata")
    return msprime.load_tables(**tables.asdict())
def simplify(self):
    """
    Runs the simplification: processes the input edges one parent at a
    time, records the mutations left over the roots, finalises the sites
    and returns ``(ts, node_map)``. ``node_map`` is an int32 array indexed
    by input node ID that holds the output node ID, or -1 for input nodes
    missing from ``self.node_id_map``.
    """
    if self.ts.num_edges > 0:
        # Consecutive edges sharing a parent are handed over as one batch.
        batch = []
        for edge in list(self.ts.edges()):
            if batch and edge.parent != batch[0].parent:
                self.process_parent_edges(batch)
                batch = []
            batch.append(edge)
        self.process_parent_edges(batch)

    # Record any final mutations over the roots.
    for input_id in list(self.A.keys()):
        segment = self.A[input_id]
        while segment is not None:
            mutation_ids = self.get_mutations(
                input_id, segment.left, segment.right)
            for mutation_id in mutation_ids:
                self.record_mutation(segment.node, mutation_id)
            segment = segment.next
    self.finalise_sites()

    node_map = np.full(self.ts.num_nodes, -1, dtype=np.int32)
    for input_id, output_id in self.node_id_map.items():
        node_map[input_id] = output_id
    simplified = msprime.load_tables(
        nodes=self.node_table,
        edges=self.edge_table,
        sites=self.site_table,
        mutations=self.mutation_table,
        sequence_length=self.sequence_length)
    return simplified, node_map
def strip_singletons(ts):
    """
    Returns a copy of the specified tree sequence with singletons removed.

    A site is kept only if the sum of its genotypes is greater than one.
    The first mutation of a kept site is copied without a parent; each
    later mutation is re-parented onto that first mutation if it had any
    parent in the input, and is left parentless otherwise.
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for variant in ts.variants():
        if np.sum(variant.genotypes) > 1:
            site_id = sites.add_row(
                position=variant.site.position,
                ancestral_state=variant.site.ancestral_state)
            assert len(variant.site.mutations) >= 1
            mutation = variant.site.mutations[0]
            parent_id = mutations.add_row(
                site=site_id, node=mutation.node,
                derived_state=mutation.derived_state)
            for error in variant.site.mutations[1:]:
                # Mutation IDs change in the new table, so any existing
                # parent link is redirected to the site's first mutation.
                parent = -1
                if error.parent != -1:
                    parent = parent_id
                mutations.add_row(
                    site=site_id, node=error.node,
                    derived_state=error.derived_state, parent=parent)
    tables = ts.dump_tables()
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites,
        mutations=mutations)
def single_childify(ts):
    """
    Builds a new equivalent tree sequence which contains an extra node in the
    middle of all existing branches.

    Every edge (parent, child) is replaced by two edges over the same
    interval, (parent, u) and (u, child), where u is a new node whose time is
    halfway between the times of the child and the parent.
    """
    tables = ts.dump_tables()
    edges = tables.edges
    nodes = tables.nodes
    sites = tables.sites
    mutations = tables.mutations

    time = nodes.time[:]
    edges.reset()
    for edge in ts.edges():
        # Insert a new node in between the parent and child.
        u = len(nodes)
        t = time[edge.child] + (time[edge.parent] - time[edge.child]) / 2
        nodes.add_row(time=t)
        edges.add_row(
            left=edge.left, right=edge.right, parent=u, child=edge.child)
        edges.add_row(
            left=edge.left, right=edge.right, parent=edge.parent, child=u)
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    # Record this function's own name; the provenance entry previously used
    # the name of a different function ("insert_redundant_breakpoints").
    add_provenance(tables.provenances, "single_childify")
    new_ts = msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
    return new_ts
def get_ancestral_haplotypes(ts):
    """
    Returns a numpy array of the haplotypes of the ancestors in the
    specified tree sequence.

    The result has one row per node and one column per site. Entries are
    ``inference.UNKNOWN_ALLELE`` except where the node is the parent of an
    edge covering the site, and for the first ``ts.num_samples`` rows, which
    are copied in full from the genotype matrix.
    """
    nodes = ts.tables.nodes
    flags = nodes.flags[:]
    # Set every node's flags to 1 so that the genotype matrix below has a
    # row for each node (presumably 1 marks a node as a sample -- confirm).
    flags[:] = 1
    nodes.set_columns(time=nodes.time, flags=flags)

    sites = [site.position for site in ts.sites()]
    tsp = msprime.load_tables(
        nodes=nodes, edges=ts.tables.edges, sites=ts.tables.sites,
        mutations=ts.tables.mutations)
    B = tsp.genotype_matrix().T

    A = np.zeros((ts.num_nodes, ts.num_sites), dtype=np.uint8)
    A[:] = inference.UNKNOWN_ALLELE
    for edge in ts.edges():
        start = bisect.bisect_left(sites, edge.left)
        end = bisect.bisect_right(sites, edge.right)
        # A site lying exactly on the right end of the edge is excluded.
        # The end > 0 guard avoids indexing an empty site list.
        if end > 0 and sites[end - 1] == edge.right:
            end -= 1
        A[edge.parent, start:end] = B[edge.parent, start:end]
    A[:ts.num_samples] = B[:ts.num_samples]
    return A
def insert_multichar_mutations(ts, seed=1, max_len=10):
    """
    Returns a copy of the specified tree sequence with multiple character
    mutations on a randomly chosen branch in every tree.
    """
    rng = random.Random(seed)
    alphabet = ["A", "C", "T", "G"]

    def random_state():
        # A single letter repeated a random number of times (possibly zero).
        return rng.choice(alphabet) * rng.randint(0, max_len)

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site_id = len(sites)
        ancestral_state = random_state()
        sites.add_row(
            position=tree.interval[0], ancestral_state=ancestral_state)
        # Any node except the root may carry the mutation.
        candidates = list(tree.nodes())
        candidates.remove(tree.root)
        mutated_node = rng.choice(candidates)
        # Redraw until the derived state differs from the ancestral state.
        derived_state = ancestral_state
        while derived_state == ancestral_state:
            derived_state = random_state()
        mutations.add_row(
            site=site_id, node=mutated_node, derived_state=derived_state)
    tables = ts.tables
    add_provenance(tables.provenances, "insert_multichar_mutations")
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=sites,
        mutations=mutations,
        provenances=tables.provenances)
def jukes_cantor(ts, num_sites, mu, multiple_per_node=True, seed=None):
    """
    Returns a copy of the specified tree sequence with Jukes-Cantor mutations
    applied at the specified rate at the specified number of sites. Site
    positions are chosen uniformly.
    """
    random.seed(seed)
    positions = sorted(
        ts.sequence_length * random.random() for _ in range(num_sites))
    sites = msprime.SiteTable(num_sites)
    mutations = msprime.MutationTable(num_sites)

    # Positions are visited in increasing order, so a single forward pass
    # over the trees finds the tree covering each position.
    tree_iter = ts.trees()
    tree = next(tree_iter)
    for position in positions:
        while tree.interval[1] <= position:
            tree = next(tree_iter)
        generate_site_mutations(
            tree, position, mu, sites, mutations,
            multiple_per_node=multiple_per_node)

    tables = ts.dump_tables()
    add_provenance(tables.provenances, "jukes_cantor")
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=sites,
        mutations=mutations,
        provenances=tables.provenances)
def finalise(self, simplify=True, stabilise_node_ordering=False):
    """
    Builds the tree sequence with all sites and, unless ``simplify`` is
    False, simplifies it with respect to ``self.sample_ids`` while keeping
    sites that have no mutations. With ``stabilise_node_ordering`` the node
    times are perturbed to be distinct before simplifying.
    """
    logger.info("Finalising tree sequence")
    ts = self.get_tree_sequence(all_sites=True)
    if not simplify:
        return ts

    logger.info("Running simplify on {} nodes and {} edges".format(
        ts.num_nodes, ts.num_edges))
    if stabilise_node_ordering:
        # Ensure all the node times are distinct so that they will have
        # stable IDs after simplifying. This could possibly also be done
        # by reversing the IDs within a time slice. This is used for
        # comparing tree sequences produced by perfect inference.
        tables = ts.tables
        node_time = tables.nodes.time
        for generation in range(1, int(node_time[0])):
            tied = np.where(node_time == generation)[0]
            num_tied = tied.shape[0]
            # Spread the tied nodes over [generation, generation + 1).
            node_time[tied] += np.arange(num_tied)[::-1] / num_tied
        tables.nodes.set_columns(flags=tables.nodes.flags, time=node_time)
        msprime.sort_tables(**tables.asdict())
        ts = msprime.load_tables(**tables.asdict())

    ts = ts.simplify(
        samples=self.sample_ids, filter_zero_mutation_sites=False)
    logger.info(
        "Finished simplify; now have {} nodes and {} edges".format(
            ts.num_nodes, ts.num_edges))
    return ts
def test_one_generation_no_deep_history(self):
    """
    Runs ``wf_sim`` for one generation without deep history, simplifies the
    result and checks that the roots of every tree partition the samples.
    """
    N = 20
    tables = wf_sim(
        N=N, ngens=1, deep_history=False, seed=self.random_seed)
    self.assertEqual(tables.nodes.num_rows, 2 * N)
    self.assertGreater(tables.edges.num_rows, 0)
    # No sites, mutations or migrations are expected from the simulation.
    for empty_table in (tables.sites, tables.mutations, tables.migrations):
        self.assertEqual(empty_table.num_rows, 0)

    nodes = tables.nodes
    edges = tables.edges
    is_sample = nodes.flags == msprime.NODE_IS_SAMPLE
    samples = np.where(is_sample)[0].astype(np.int32)
    msprime.sort_tables(nodes=nodes, edges=edges)
    msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)

    ts = msprime.load_tables(nodes=nodes, edges=edges)
    for tree in ts.trees():
        seen = set()
        for root in tree.roots:
            below_root = set(tree.samples(root))
            # No sample may sit below two different roots.
            self.assertEqual(len(below_root & seen), 0)
            seen |= below_root
        self.assertEqual(seen, set(ts.samples()))
def wright_fisher(N, delta, L, T):
    """
    Direct implementation of Algorithm W.

    Starting from N individuals at time T, time is stepped down to zero. In
    each step every slot is replaced with probability delta by a new node
    that inherits [0, x) from one random current individual and [x, L) from
    another, with x drawn uniformly from [0, L]. Nodes alive at the end get
    flags=1.
    """
    edges = msprime.EdgeTable()
    # tau[u] is the birth time of node u; P holds the current population.
    tau = [T] * N
    P = list(range(N))
    next_node = N
    t = T
    while t > 0:
        t -= 1
        offspring = list(P)
        for slot in range(N):
            if random.random() >= delta:
                continue
            offspring[slot] = next_node
            tau.append(t)
            left_parent = random.randint(0, N - 1)
            right_parent = random.randint(0, N - 1)
            breakpoint = random.uniform(0, L)
            edges.add_row(0, breakpoint, P[left_parent], next_node)
            edges.add_row(breakpoint, L, P[right_parent], next_node)
            next_node += 1
        P = offspring

    nodes = msprime.NodeTable()
    alive = set(P)
    for u in range(next_node):
        nodes.add_row(time=tau[u], flags=int(u in alive))
    msprime.sort_tables(nodes=nodes, edges=edges)
    return msprime.load_tables(nodes=nodes, edges=edges)
def _load_legacy_hdf5_v3(root, remove_duplicate_positions):
    """
    Converts the contents of a version 3 legacy HDF5 file, opened as
    ``root``, into a tree sequence. Each record (one parent with several
    children) is expanded into one edge per child; mutations and provenance
    are converted when present, and an upgrade provenance row is appended.
    """
    # get the trees group for the records and samples
    trees_group = root["trees"]
    nodes_group = trees_group["nodes"]
    records_group = trees_group["records"]

    time = np.array(nodes_group["time"])
    breakpoints = np.array(trees_group["breakpoints"])
    left_indexes = np.array(records_group["left"])
    right_indexes = np.array(records_group["right"])
    record_node = np.array(records_group["node"], dtype=np.int32)

    # Nodes with IDs below the smallest record parent are flagged as samples.
    sample_size = np.min(record_node)
    flags = np.zeros(time.shape[0], dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE

    children_length = np.array(
        records_group["num_children"], dtype=np.uint32)
    num_edges = np.sum(children_length)
    left = np.zeros(num_edges, dtype=np.float64)
    right = np.zeros(num_edges, dtype=np.float64)
    parent = np.zeros(num_edges, dtype=np.int32)
    record_left = breakpoints[left_indexes]
    record_right = breakpoints[right_indexes]
    # Repeat each record's interval and parent once per child.
    row = 0
    for record in range(left_indexes.shape[0]):
        for _ in range(children_length[record]):
            left[row] = record_left[record]
            right[row] = record_right[record]
            parent[row] = record_node[record]
            row += 1

    nodes = msprime.NodeTable()
    nodes.set_columns(
        flags=flags,
        time=nodes_group["time"],
        population=nodes_group["population"])
    edges = msprime.EdgeTable()
    edges.set_columns(
        left=left, right=right, parent=parent,
        child=records_group["children"])

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        _convert_hdf5_mutations(
            root["mutations"], sites, mutations, remove_duplicate_positions)

    # Legacy provenance records are all stamped with the minimum datetime.
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    if "provenance" in root:
        for record in root["provenance"]:
            provenances.add_row(timestamp=old_timestamp, record=record)
    provenances.add_row(_get_upgrade_provenance(root))

    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=provenances)
def make_tree_add_mutations(nodes, edges, mutrate):
    """
    Generates mutations at rate ``mutrate`` on the given tables with a
    random generator seeded with 42, and returns the loaded tree sequence
    together with the site table.
    """
    generator = msprime.RandomGenerator(42)
    mutation_table = msprime.MutationTable()
    site_table = msprime.SiteTable()
    msprime.MutationGenerator(generator, mutrate).generate(
        nodes, edges, site_table, mutation_table)
    tree_sequence = msprime.load_tables(
        nodes=nodes,
        edgesets=edges,
        sites=site_table,
        mutations=mutation_table)
    return (tree_sequence, site_table)
def provenance_timestamp_only_example():
    """
    Returns a simulated tree sequence whose only provenance row has a
    timestamp and an empty record.
    """
    simulated = msprime.simulate(10, random_seed=1).dump_tables()
    provenance_table = msprime.ProvenanceTable()
    provenance_table.add_row(timestamp="12345", record="")
    return msprime.load_tables(
        nodes=simulated.nodes,
        edges=simulated.edges,
        provenances=provenance_table)
def get_multiroot_example(self):
    """
    Returns a simulated tree sequence from which the second half of the
    edge table has been dropped.
    """
    simulated = msprime.simulate(
        sample_size=50, recombination_rate=5, random_seed=self.random_seed)
    tables = simulated.dump_tables()
    edges = tables.edges
    # Keep only the first half of the edge rows.
    first_half = slice(None, len(edges) // 2)
    edges.set_columns(
        left=edges.left[first_half],
        right=edges.right[first_half],
        parent=edges.parent[first_half],
        child=edges.child[first_half])
    return msprime.load_tables(nodes=tables.nodes, edges=edges)
def mutation_metadata_example():
    """
    Returns a simulated tree sequence with one site at position 0 and a
    mutation carrying metadata on each of the nodes 0 to 9.
    """
    tables = msprime.simulate(10, length=10, random_seed=2).dump_tables()
    tables.sites.add_row(0, ancestral_state="a")
    for node_id in range(10):
        tables.mutations.add_row(
            site=0, node=node_id, derived_state="t", metadata=b"1234")
    return msprime.load_tables(**tables.asdict())
def general_mutation_example():
    """
    Returns a simulated tree sequence with two sites that carry metadata,
    each with a single mutation over node 0.
    """
    simulated = msprime.simulate(
        10, recombination_rate=1, length=10, random_seed=2)
    tables = simulated.dump_tables()
    site_table = tables.sites
    mutation_table = tables.mutations
    site_table.add_row(position=0, ancestral_state="A", metadata=b"{}")
    site_table.add_row(
        position=1, ancestral_state="C", metadata=b"{'id':1}")
    mutation_table.add_row(site=0, node=0, derived_state="T")
    mutation_table.add_row(site=1, node=0, derived_state="G")
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=site_table,
        mutations=mutation_table)
def store_output(self):
    """
    Returns the tree sequence built so far (an empty one of length 1 when
    there are no ancestors), first dumping it to ``self.output_path`` if
    that is set.
    """
    if self.num_ancestors <= 0:
        # Allocate an empty tree sequence.
        result = msprime.load_tables(
            nodes=msprime.NodeTable(),
            edges=msprime.EdgeTable(),
            sequence_length=1)
    else:
        result = self.get_tree_sequence(rescale_positions=False)
    if self.output_path is not None:
        result.dump(self.output_path)
    return result
def node_metadata_example():
    """
    Returns a simulated tree sequence in which node ``u`` carries the
    metadata string ``n_u``.
    """
    simulated = msprime.simulate(
        sample_size=100, recombination_rate=0.1, length=10, random_seed=1)
    source_nodes = msprime.NodeTable()
    source_edges = msprime.EdgeTable()
    simulated.dump_tables(nodes=source_nodes, edges=source_edges)

    labels = list(map("n_{}".format, range(simulated.num_nodes)))
    packed, offset = msprime.pack_strings(labels)
    labelled_nodes = msprime.NodeTable()
    labelled_nodes.set_columns(
        metadata=packed,
        metadata_offset=offset,
        flags=source_nodes.flags,
        time=source_nodes.time)
    return msprime.load_tables(nodes=labelled_nodes, edges=source_edges)
def decapitate(ts, num_edges):
    """
    Returns a copy of the specified tree sequence in which only the first
    ``num_edges`` rows of the edge table have been retained.
    """
    tables = ts.dump_tables()
    edge_table = tables.edges
    retained = slice(None, num_edges)
    edge_table.set_columns(
        left=edge_table.left[retained],
        right=edge_table.right[retained],
        parent=edge_table.parent[retained],
        child=edge_table.child[retained])
    add_provenance(tables.provenances, "decapitate")
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=tables.sites,
        mutations=tables.mutations,
        provenances=tables.provenances,
        sequence_length=ts.sequence_length)
def test_nodes(self):
    """
    Checks that a pickled object stored as node metadata is returned
    unchanged and unpickles to an equivalent object.
    """
    original = ExampleMetadata(one="node1", two="node2")
    pickled = pickle.dumps(original)
    node_table = msprime.NodeTable()
    edge_table = msprime.EdgeTable()
    node_table.add_row(time=0.125, metadata=pickled)
    ts = msprime.load_tables(
        nodes=node_table, edges=edge_table, sequence_length=1)

    first_node = ts.node(0)
    self.assertEqual(first_node.time, 0.125)
    self.assertEqual(first_node.metadata, pickled)
    restored = pickle.loads(first_node.metadata)
    self.assertEqual(restored.one, original.one)
    self.assertEqual(restored.two, original.two)
def insert_redundant_breakpoints(ts):
    """
    Builds a new tree sequence containing redundant breakpoints: every edge
    is split at the midpoint of its interval into two adjacent edges.
    """
    tables = ts.dump_tables()
    edge_table = tables.edges
    edge_table.reset()
    for edge in ts.edges():
        midpoint = edge.left + (edge.right - edge.left) / 2
        for piece_left, piece_right in (
                (edge.left, midpoint), (midpoint, edge.right)):
            edge_table.add_row(
                left=piece_left, right=piece_right,
                child=edge.child, parent=edge.parent)
    add_provenance(tables.provenances, "insert_redundant_breakpoints")
    new_ts = msprime.load_tables(**tables.asdict())
    assert new_ts.num_edges == 2 * ts.num_edges
    return new_ts
def get_multiroot_tree(self):
    """
    Returns the first tree with more than one root from a simulated tree
    sequence whose last quarter of edge rows has been removed.

    :raises AssertionError: If no tree in the truncated tree sequence has
        more than one root.
    """
    ts = msprime.simulate(15, random_seed=1)
    # Take off the top quarter of edges
    tables = ts.dump_tables()
    edges = tables.edges
    n = len(edges) - len(edges) // 4
    edges.set_columns(
        left=edges.left[:n], right=edges.right[:n],
        parent=edges.parent[:n], child=edges.child[:n])
    ts = msprime.load_tables(nodes=tables.nodes, edges=edges)
    for t in ts.trees():
        if t.num_roots > 1:
            return t
    # Raised explicitly rather than via ``assert False`` so that the failure
    # is not stripped under ``python -O`` (which would silently return None).
    raise AssertionError("no tree with more than one root was found")
def test4(self):
    """
    Simplifies a four-node example with a mutation over node 3 at position
    0.4 and checks that the genotype matrix of samples 1 and 2 sums to zero.
    """
    self.n.set_columns(
        time=[1, 0, 0, 2], flags=[msprime.NODE_IS_SAMPLE] * 4)
    self.e.add_row(parent=0, child=1, left=0, right=0.4)
    self.e.add_row(parent=0, child=1, left=0.6, right=1.0)
    self.e.add_row(parent=0, child=2, left=0, right=1)
    # Node 3 is a parent of node 0 only over [0, 0.4).
    self.e.add_row(parent=3, child=0, left=0, right=0.4)
    self.s.add_row(position=0.4, ancestral_state='0')
    self.m.add_row(site=0, node=3, derived_state='1')

    msprime.sort_tables(
        nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
    # The node ID map returned by simplify_tables is not needed here.
    msprime.simplify_tables(
        nodes=self.n, edges=self.e, sites=self.s, mutations=self.m,
        samples=[1, 2])
    ts = msprime.load_tables(
        nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
    m = ts.genotype_matrix()
    self.assertEqual(m.sum(), 0)
def simplify(S, Ni, Ei, L):
    """
    This is an implementation of the simplify algorithm described in Appendix A
    of the paper.

    :param S: Input node IDs that are copied to the output with flags=1.
    :param Ni: Input node table.
    :param Ei: Input edge table. Edges are consumed in table order, one run
        of equal parents at a time.
    :param L: Right end of the interval initially assigned to each node in
        ``S`` (presumably the sequence length -- confirm).
    :return: The tree sequence loaded from the output node and edge tables.
    """
    No = msprime.NodeTable()
    Eo = msprime.EdgeTable()
    # A[u] holds the ancestral segments currently mapped to input node u.
    A = [[] for _ in range(len(Ni))]
    Q = []

    for u in S:
        v = No.add_row(time=Ni.time[u], flags=1)
        assert v == len(No) - 1
        A[u] = [Segment(0, L, v)]

    # Edges are taken in table order, grouped by parent, to make sure that
    # we collect edges for merging in proper time order.
    ei = 0
    while ei < len(Ei):
        u = Ei.parent[ei]
        # Queue every ancestral segment of the children that overlaps one of
        # this parent's edges, clipped to the overlap.
        while ei < len(Ei) and Ei.parent[ei] == u:
            e = Ei[ei]
            for x in A[e.child]:
                if x.right > e.left and e.right > x.left:
                    y = Segment(
                        max(x.left, e.left), min(x.right, e.right), x.node)
                    heapq.heappush(Q, y)
            ei += 1

        # The output node for u is only created on the first overlap.
        v = -1
        while len(Q) > 0:
            # X collects the segments starting at l; r is the nearest point
            # at which the set of overlapping segments changes.
            l = Q[0].left
            r = L
            X = []
            while len(Q) > 0 and Q[0].left == l:
                x = heapq.heappop(Q)
                X.append(x)
                r = min(r, x.right)
            if len(Q) > 0:
                r = min(r, Q[0].left)

            if len(X) == 1:
                # No overlap: the segment passes through u unchanged, cut
                # short if the next queued segment starts inside it.
                x = X[0]
                alpha = x
                if len(Q) > 0 and Q[0].left < x.right:
                    alpha = Segment(x.left, Q[0].left, x.node)
                    x.left = Q[0].left
                    heapq.heappush(Q, x)
            else:
                # Overlap: record edges from the output node to each
                # overlapping segment's node.
                if v == -1:
                    v = No.add_row(time=Ni.time[u])
                alpha = Segment(l, r, v)
                for x in X:
                    Eo.add_row(l, r, v, x.node)
                    if x.right > r:
                        x.left = r
                        heapq.heappush(Q, x)
            A[u].append(alpha)

    # Sort the output edges and compact them as much as possible into
    # the output table. We skip this for the algorithm listing as it's pretty
    # mundane.
    # TODO replace this with a calls to squash_edges() and sort_tables()
    E = list(Eo)
    Eo.clear()
    E.sort(key=lambda e: (e.parent, e.child, e.right, e.left))
    # Guard against an empty edge list, which previously raised IndexError
    # when flushing the final run.
    if E:
        start = 0
        for j in range(1, len(E)):
            condition = (
                E[j - 1].right != E[j].left or
                E[j - 1].parent != E[j].parent or
                E[j - 1].child != E[j].child)
            if condition:
                Eo.add_row(
                    E[start].left, E[j - 1].right, E[j - 1].parent,
                    E[j - 1].child)
                start = j
        Eo.add_row(E[start].left, E[-1].right, E[-1].parent, E[-1].child)

    return msprime.load_tables(nodes=No, edges=Eo)
def simplify(S, Ni, Ei, L):
    """
    This is an implementation of the simplify algorithm described in Appendix A
    of the paper.

    :param S: Input node IDs that are copied to the output with flags=1.
    :param Ni: Input node table. Nodes are processed in ID order.
    :param Ei: Input edge table.
    :param L: Right end of the interval initially assigned to each node in
        ``S`` (presumably the sequence length -- confirm).
    :return: The tree sequence loaded from the output node and edge tables.
    """
    No = msprime.NodeTable()
    Eo = msprime.EdgeTable()
    # A[u] holds the ancestral segments currently mapped to input node u.
    A = [[] for _ in range(len(Ni))]
    Q = []

    for u in S:
        v = No.add_row(time=Ni.time[u], flags=1)
        assert v == len(No) - 1
        A[u] = [Segment(0, L, v)]

    for u in range(len(Ni)):
        # Queue every ancestral segment of the children that overlaps one of
        # u's edges, clipped to the overlap.
        for e in Ei:
            if e.parent != u:
                continue
            for x in A[e.child]:
                if x.right > e.left and e.right > x.left:
                    y = Segment(
                        max(x.left, e.left), min(x.right, e.right), x.node)
                    heapq.heappush(Q, y)

        # The output node for u is only created on the first overlap.
        v = -1
        while len(Q) > 0:
            # X collects the segments starting at l; r is the nearest point
            # at which the set of overlapping segments changes.
            l = Q[0].left
            r = L
            X = []
            while len(Q) > 0 and Q[0].left == l:
                x = heapq.heappop(Q)
                X.append(x)
                r = min(r, x.right)
            if len(Q) > 0:
                r = min(r, Q[0].left)

            if len(X) == 1:
                # No overlap: the segment passes through u unchanged, cut
                # short if the next queued segment starts inside it.
                x = X[0]
                alpha = x
                if len(Q) > 0 and Q[0].left < x.right:
                    alpha = Segment(x.left, Q[0].left, x.node)
                    x.left = Q[0].left
                    heapq.heappush(Q, x)
            else:
                # Overlap: record edges from the output node to each
                # overlapping segment's node.
                if v == -1:
                    v = No.add_row(time=Ni.time[u])
                alpha = Segment(l, r, v)
                for x in X:
                    Eo.add_row(l, r, v, x.node)
                    if x.right > r:
                        x.left = r
                        heapq.heappush(Q, x)
            A[u].append(alpha)

    # Sort the output edges and compact them as much as possible into
    # the output table. We skip this for the algorithm listing as it's pretty
    # mundane.
    # TODO replace this with a calls to squash_edges() and sort_tables()
    E = list(Eo)
    Eo.clear()
    E.sort(key=lambda e: (e.parent, e.child, e.right, e.left))
    # Guard against an empty edge list, which previously raised IndexError
    # when flushing the final run.
    if E:
        start = 0
        for j in range(1, len(E)):
            condition = (
                E[j - 1].right != E[j].left or
                E[j - 1].parent != E[j].parent or
                E[j - 1].child != E[j].child)
            if condition:
                Eo.add_row(
                    E[start].left, E[j - 1].right, E[j - 1].parent,
                    E[j - 1].child)
                start = j
        Eo.add_row(E[start].left, E[-1].right, E[-1].parent, E[-1].child)

    return msprime.load_tables(nodes=No, edges=Eo)