def insert_branch_mutations(ts, mutations_per_branch=1):
    """
    Returns a copy of the specified tree sequence with
    ``mutations_per_branch`` mutations on every branch in every tree.

    One site is added per tree, positioned at the left end of the tree's
    interval, with ancestral state '0'. Every non-root node then receives a
    chain of mutations that alternates the state between '1' and '0',
    starting from the state inherited from its parent node. Each mutation's
    ``parent`` is the most recent mutation above it on the path to the root
    (-1 when there is none). The sites and mutations of ``ts`` itself are
    not carried over.

    :param ts: The tree sequence whose nodes, edges and provenances are reused.
    :param mutations_per_branch: The number of mutations stacked on each
        branch. Zero is allowed and produces sites without mutations.
    :return: The tree sequence loaded from the resulting tables.
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site = len(sites)
        sites.add_row(position=tree.interval[0], ancestral_state='0')
        for root in tree.roots:
            state = {root: 0}
            mutation = {root: -1}
            stack = [root]
            while stack:
                u = stack.pop()
                stack.extend(tree.children(u))
                v = tree.parent(u)
                if v != msprime.NULL_NODE:
                    state[u] = state[v]
                    parent = mutation[v]
                    # Record the inherited mutation up front so that the
                    # children of u can always look up mutation[u], even
                    # when no mutation is placed on this branch.
                    mutation[u] = parent
                    for _ in range(mutations_per_branch):
                        state[u] = (state[u] + 1) % 2
                        mutation[u] = len(mutations)
                        mutations.add_row(
                            site=site, node=u, derived_state=str(state[u]),
                            parent=parent)
                        parent = mutation[u]
    tables = ts.tables
    add_provenance(tables.provenances, "insert_branch_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites,
        mutations=mutations, provenances=tables.provenances)
def jukes_cantor(ts, num_sites, mu, multiple_per_node=True, seed=None):
    """
    Returns a copy of the specified tree sequence with Jukes-Cantor mutations
    applied at the specified rate at the specified number of sites. Site
    positions are chosen uniformly along the sequence.

    Note that the module-level ``random`` generator is reseeded with ``seed``.
    """
    random.seed(seed)
    positions = sorted(
        ts.sequence_length * random.random() for _ in range(num_sites))
    sites = msprime.SiteTable(num_sites)
    mutations = msprime.MutationTable(num_sites)
    tree_iter = ts.trees()
    current_tree = next(tree_iter)
    for position in positions:
        # Positions are sorted, so we only ever need to move forwards to
        # reach the tree whose interval covers this position.
        while current_tree.interval[1] <= position:
            current_tree = next(tree_iter)
        generate_site_mutations(
            current_tree, position, mu, sites, mutations,
            multiple_per_node=multiple_per_node)
    tables = ts.dump_tables()
    add_provenance(tables.provenances, "jukes_cantor")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites,
        mutations=mutations, provenances=tables.provenances)
def insert_multichar_mutations(ts, seed=1, max_len=10):
    """
    Returns a copy of the specified tree sequence with multiple character
    mutations on a randomly chosen branch in every tree.

    For each tree a site is added at the left end of the tree's interval.
    The ancestral and derived states are each a single letter from
    "A", "C", "T", "G" repeated between 0 and ``max_len`` times (so either
    may be the empty string), and are guaranteed to differ. The mutation is
    placed on a randomly chosen non-root node of the tree.

    :param ts: The tree sequence whose nodes, edges and provenances are reused.
    :param seed: Seed for the private random generator used here.
    :param max_len: Maximum length of a state string; must be at least 1.
    :return: The tree sequence loaded from the resulting tables.
    :raises ValueError: If ``max_len`` is less than 1.
    """
    if max_len < 1:
        # With max_len == 0 both states can only ever be the empty string,
        # so the search for a distinct derived state would never terminate.
        raise ValueError("max_len must be at least 1")
    rng = random.Random(seed)
    letters = ["A", "C", "T", "G"]
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site = len(sites)
        ancestral_state = rng.choice(letters) * rng.randint(0, max_len)
        sites.add_row(
            position=tree.interval[0], ancestral_state=ancestral_state)
        nodes = list(tree.nodes())
        nodes.remove(tree.root)
        u = rng.choice(nodes)
        derived_state = ancestral_state
        # Redraw until the derived state differs from the ancestral state.
        while ancestral_state == derived_state:
            derived_state = rng.choice(letters) * rng.randint(0, max_len)
        mutations.add_row(site=site, node=u, derived_state=derived_state)
    tables = ts.tables
    add_provenance(tables.provenances, "insert_multichar_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites,
        mutations=mutations, provenances=tables.provenances)
def _load_legacy_hdf5_v3(root, remove_duplicate_positions):
    """
    Builds a tree sequence from the groups of a legacy version 3 file.

    Each record in ``root["trees"]["records"]`` has a left and right
    breakpoint index, a parent node and ``num_children`` children; it is
    expanded into one edge per child. Nodes with an ID below the smallest
    record node are flagged as samples.

    :param root: Mapping-like object holding the "trees" group and,
        optionally, "mutations" and "provenance" entries.
    :param remove_duplicate_positions: Forwarded to
        ``_convert_hdf5_mutations`` when a "mutations" group is present.
    :return: The tree sequence loaded from the converted, sorted tables.
    """
    # get the trees group for the records and samples
    trees_group = root["trees"]
    nodes_group = trees_group["nodes"]
    time = np.array(nodes_group["time"])

    breakpoints = np.array(trees_group["breakpoints"])
    records_group = trees_group["records"]
    left_indexes = np.array(records_group["left"])
    right_indexes = np.array(records_group["right"])
    record_node = np.array(records_group["node"], dtype=np.int32)
    num_nodes = time.shape[0]
    sample_size = np.min(record_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE

    children_length = np.array(records_group["num_children"], dtype=np.uint32)
    # Repeat each record's coordinates and parent once per child. The counts
    # are converted to the platform index type so np.repeat accepts them.
    repeats = children_length.astype(np.intp)
    left = np.repeat(breakpoints[left_indexes], repeats).astype(np.float64)
    right = np.repeat(breakpoints[right_indexes], repeats).astype(np.float64)
    parent = np.repeat(record_node, repeats)

    nodes = msprime.NodeTable()
    nodes.set_columns(
        flags=flags,
        time=nodes_group["time"],
        population=nodes_group["population"])
    edges = msprime.EdgeTable()
    edges.set_columns(
        left=left, right=right, parent=parent,
        child=records_group["children"])
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        _convert_hdf5_mutations(
            root["mutations"], sites, mutations, remove_duplicate_positions)
    # Existing provenance records get a placeholder (minimum) timestamp.
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    if "provenance" in root:
        for record in root["provenance"]:
            provenances.add_row(timestamp=old_timestamp, record=record)
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=provenances)
def make_tree_add_mutations(nodes, edges, mutrate):
    """
    Generates mutations at rate ``mutrate`` on the given node and edge tables
    using a random generator with the fixed seed 42.

    Returns a tuple of the tree sequence loaded from the tables and the
    site table that was filled in.
    """
    site_table = msprime.SiteTable()
    mutation_table = msprime.MutationTable()
    generator = msprime.MutationGenerator(
        msprime.RandomGenerator(42), mutrate)
    generator.generate(nodes, edges, site_table, mutation_table)
    tree_sequence = msprime.load_tables(
        nodes=nodes, edgesets=edges, sites=site_table,
        mutations=mutation_table)
    return (tree_sequence, site_table)
def general_mutation_example():
    """
    Returns a simulated tree sequence (10 samples, seed 2) whose sites and
    mutations are replaced by two hand-written sites with letter states and
    metadata, each carrying a single mutation over node 0.
    """
    ts = msprime.simulate(10, recombination_rate=1, length=10, random_seed=2)
    nodes = msprime.NodeTable()
    edges = msprime.EdgeTable()
    ts.dump_tables(nodes=nodes, edges=edges)

    site_rows = [
        (0, "A", b"{}"),
        (1, "C", b"{'id':1}"),
    ]
    mutation_rows = [
        (0, 0, "T"),
        (1, 0, "G"),
    ]
    sites = msprime.SiteTable()
    for position, ancestral_state, metadata in site_rows:
        sites.add_row(
            position=position, ancestral_state=ancestral_state,
            metadata=metadata)
    mutations = msprime.MutationTable()
    for site, node, derived_state in mutation_rows:
        mutations.add_row(site=site, node=node, derived_state=derived_state)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
def test_sites(self):
    """
    Checks that pickled metadata attached to a site comes back unchanged
    from a tree sequence loaded from the tables.
    """
    original = ExampleMetadata(one="node1", two="node2")
    encoded = pickle.dumps(original)
    site_table = msprime.SiteTable()
    site_table.add_row(position=0.1, ancestral_state="A", metadata=encoded)
    ts = msprime.load_tables(
        nodes=msprime.NodeTable(), edges=msprime.EdgeTable(),
        sites=site_table, mutations=msprime.MutationTable(),
        sequence_length=1)

    loaded = ts.site(0)
    self.assertEqual(loaded.position, 0.1)
    self.assertEqual(loaded.ancestral_state, "A")
    self.assertEqual(loaded.metadata, encoded)
    # The bytes were produced by this test, so unpickling them is safe.
    decoded = pickle.loads(loaded.metadata)
    self.assertEqual(decoded.one, original.one)
    self.assertEqual(decoded.two, original.two)
def permute_nodes(ts, node_map):
    """
    Returns a copy of the specified tree sequence in which node ``j`` of the
    original becomes node ``node_map[j]``.

    Edges and mutations are rewritten to refer to the new node IDs. Only the
    position and ancestral state of each site, and the site, derived state
    and node of each mutation, are copied across.
    """
    # Invert the map: inverse[new_id] is the ID of that node in the original.
    inverse = [0] * len(node_map)
    for old_id in range(ts.num_nodes):
        inverse[node_map[old_id]] = old_id

    original_nodes = list(ts.nodes())
    node_table = msprime.NodeTable()
    for new_id in range(ts.num_nodes):
        source = original_nodes[inverse[new_id]]
        node_table.add_row(
            flags=source.flags, metadata=source.metadata,
            population=source.population, time=source.time)

    edge_table = msprime.EdgeTable()
    for edge in ts.edges():
        edge_table.add_row(
            left=edge.left, right=edge.right,
            parent=node_map[edge.parent], child=node_map[edge.child])

    site_table = msprime.SiteTable()
    mutation_table = msprime.MutationTable()
    for site in ts.sites():
        site_table.add_row(
            position=site.position, ancestral_state=site.ancestral_state)
        for mutation in site.mutations:
            mutation_table.add_row(
                site=site.id, derived_state=mutation.derived_state,
                node=node_map[mutation.node])

    msprime.sort_tables(
        nodes=node_table, edges=edge_table, sites=site_table,
        mutations=mutation_table)
    provenances = ts.dump_tables().provenances
    add_provenance(provenances, "permute_nodes")
    return msprime.load_tables(
        nodes=node_table, edges=edge_table, sites=site_table,
        mutations=mutation_table, provenances=provenances)
def insert_branch_sites(ts):
    """
    Returns a copy of the specified tree sequence with a site on every branch
    of every tree.

    Within each tree the sites are spaced evenly from the left end of the
    interval, using a step of the interval length divided by the number of
    nodes in the tree. Each site has ancestral state '0' and one mutation to
    '1' over the node below the branch.
    """
    site_table = msprime.SiteTable()
    mutation_table = msprime.MutationTable()
    for tree in ts.trees():
        start, end = tree.interval
        step = (end - start) / len(list(tree.nodes()))
        position = start
        for node in tree.nodes():
            if tree.parent(node) == msprime.NULL_NODE:
                # Roots have no branch above them.
                continue
            site_id = site_table.add_row(
                position=position, ancestral_state='0')
            mutation_table.add_row(
                site=site_id, node=node, derived_state='1')
            position += step
    tables = ts.tables
    add_provenance(tables.provenances, "insert_branch_sites")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=site_table,
        mutations=mutation_table, provenances=tables.provenances)
def test_mutations(self):
    """
    Checks that pickled metadata attached to a mutation comes back unchanged
    from a tree sequence loaded from the tables.
    """
    original = ExampleMetadata(one="node1", two="node2")
    encoded = pickle.dumps(original)

    node_table = msprime.NodeTable()
    node_table.add_row(time=0)
    site_table = msprime.SiteTable()
    site_table.add_row(position=0.1, ancestral_state="A")
    mutation_table = msprime.MutationTable()
    mutation_table.add_row(
        site=0, node=0, derived_state="T", metadata=encoded)
    ts = msprime.load_tables(
        nodes=node_table, edges=msprime.EdgeTable(), sites=site_table,
        mutations=mutation_table, sequence_length=1)

    loaded = ts.site(0).mutations[0]
    self.assertEqual(loaded.site, 0)
    self.assertEqual(loaded.node, 0)
    self.assertEqual(loaded.derived_state, "T")
    self.assertEqual(loaded.metadata, encoded)
    # The bytes were produced by this test, so unpickling them is safe.
    decoded = pickle.loads(loaded.metadata)
    self.assertEqual(decoded.one, original.one)
    self.assertEqual(decoded.two, original.two)
def ts_private_mutations_only(ts):
    """
    Returns a new tree sequence built from the tables of ``ts`` with its
    sites and mutations replaced by one singleton per sample.

    One site is created for each sample, at a position drawn with
    ``np.random.random()`` (positions are sorted before use), with ancestral
    state '0' and a single mutation to '1' over that sample node.
    """
    table_dict = ts.dump_tables().asdict()
    mutation_table = msprime.MutationTable()
    site_table = msprime.SiteTable()
    positions = sorted(np.random.random() for _ in range(ts.num_samples))
    for site_id, (position, sample) in enumerate(
            zip(positions, ts.samples())):
        site_table.add_row(position, '0')
        mutation_table.add_row(site_id, sample, '1')
    table_dict['sites'] = site_table.asdict()
    table_dict['mutations'] = mutation_table.asdict()
    collection = msprime.tskit.tables.TableCollection.fromdict(table_dict)
    return collection.tree_sequence()
def write_vcf(chrom):
    """
    Simulates mutations on the tree sequence stored for ``chrom`` and writes
    the result as a VCF file.

    The input tree file, output VCF path and mutation rate are looked up by
    ``chrom`` in the module-level ``args``; the random seed comes from the
    module-level ``seeds``. Progress messages are written to ``logfile``.

    :param chrom: Key identifying which entry of the per-chromosome
        settings to use.
    :return: True once the VCF has been written.
    """
    treefile = args.tree_file[chrom]
    # The output is opened first, as before, so an unwritable path fails
    # before any simulation work; the context manager guarantees that the
    # file is flushed and closed even if a later step raises.
    with open(args.vcffile[chrom], "w") as vcf:
        mut_rate = args.mut_rate[chrom]
        seed = seeds[chrom]
        logfile.write("Simulating mutations on" + treefile + "\n")
        ts = msprime.load(treefile)
        rng = msprime.RandomGenerator(seed)
        nodes = msprime.NodeTable()
        edgesets = msprime.EdgesetTable()
        sites = msprime.SiteTable()
        mutations = msprime.MutationTable()
        migrations = msprime.MigrationTable()
        ts.dump_tables(nodes=nodes, edgesets=edgesets, migrations=migrations)
        mutgen = msprime.MutationGenerator(rng, mut_rate)
        mutgen.generate(nodes, edgesets, sites, mutations)
        logfile.write("Saving to" + args.vcffile[chrom] + "\n")
        mutated_ts = msprime.load_tables(
            nodes=nodes, edgesets=edgesets, sites=sites, mutations=mutations)
        mutated_ts.write_vcf(vcf, ploidy=1)
    return True
def get_tree_sequence(self, rescale_positions=True, all_sites=False):
    """
    Returns the current state of the build tree sequence. All samples and
    ancestors will have the sample node flag set.

    :param rescale_positions: If True, edge coordinates (which index into
        the variant sites) are mapped to positions taken from the sample
        data, and the sequence length comes from the sample data (or the
        last position plus one when it is missing or too small). If False,
        sites are placed at 0, 1, 2, ... and the sequence length is the
        number of sites (at least 1).
    :param all_sites: If True, the singleton sites (each with one mutation
        over the corresponding sample node) and the invariant sites from
        the sample data are appended to the inferred sites.
    :return: The tree sequence loaded from the sorted tables.
    """
    # TODO Change the API here to ask whether we want a final tree sequence
    # or not. In the latter case we also need to translate the ancestral
    # and derived states to the input values.
    tsb = self.tree_sequence_builder
    flags, time = tsb.dump_nodes()
    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags, time=time)

    left, right, parent, child = tsb.dump_edges()
    if rescale_positions:
        position = self.sample_data.position[:]
        sequence_length = self.sample_data.sequence_length
        if sequence_length is None or sequence_length < position[-1]:
            sequence_length = position[-1] + 1
        # Subset down to the variants.
        position = position[self.sample_data.variant_site[:]]
        # Coordinate lookup table: the first entry is forced to 0 and the
        # extra final entry is the sequence length.
        x = np.hstack([position, [sequence_length]])
        x[0] = 0
        left = x[left]
        right = x[right]
    else:
        position = np.arange(tsb.num_sites)
        sequence_length = max(1, tsb.num_sites)
    edges = msprime.EdgeTable()
    edges.set_columns(left=left, right=right, parent=parent, child=child)

    # Every state is a single character, so the offsets are 0, 1, ..., n.
    sites = msprime.SiteTable()
    sites.set_columns(
        position=position,
        ancestral_state=np.zeros(tsb.num_sites, dtype=np.int8) + ord('0'),
        ancestral_state_offset=np.arange(tsb.num_sites + 1, dtype=np.uint32))
    mutations = msprime.MutationTable()
    site, node, derived_state, mutation_parent = tsb.dump_mutations()
    # Shift the dumped state values by the character code of '0'.
    derived_state += ord('0')
    mutations.set_columns(
        site=site, node=node, derived_state=derived_state,
        derived_state_offset=np.arange(
            tsb.num_mutations + 1, dtype=np.uint32),
        parent=mutation_parent)
    if all_sites:
        # Append the sites and mutations for each singleton.
        num_singletons = self.sample_data.num_singleton_sites
        singleton_site = self.sample_data.singleton_site[:]
        singleton_sample = self.sample_data.singleton_sample[:]
        pos = self.sample_data.position[:]
        new_sites = np.arange(
            len(sites), len(sites) + num_singletons, dtype=np.int32)
        sites.append_columns(
            position=pos[singleton_site],
            ancestral_state=np.zeros(num_singletons, dtype=np.int8) + ord('0'),
            ancestral_state_offset=np.arange(
                num_singletons + 1, dtype=np.uint32))
        mutations.append_columns(
            site=new_sites, node=self.sample_ids[singleton_sample],
            derived_state=np.zeros(num_singletons, dtype=np.int8) + ord('1'),
            derived_state_offset=np.arange(
                num_singletons + 1, dtype=np.uint32))
        # Get the invariant sites
        num_invariants = self.sample_data.num_invariant_sites
        invariant_site = self.sample_data.invariant_site[:]
        sites.append_columns(
            position=pos[invariant_site],
            ancestral_state=np.zeros(num_invariants, dtype=np.int8) + ord('0'),
            ancestral_state_offset=np.arange(
                num_invariants + 1, dtype=np.uint32))

    msprime.sort_tables(nodes, edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        sequence_length=sequence_length)
def _load_legacy_hdf5_v2(root, remove_duplicate_positions):
    """
    Builds a tree sequence from the groups of a legacy version 2 file.

    Every coalescence record in ``root["trees"]`` is turned into two edges
    sharing the record's parent node and interval, one for each entry of
    the record's "children". Nodes with an ID below the smallest record
    node are flagged as samples.
    """
    # Get the coalescence records
    trees_group = root["trees"]
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    provenances.add_row(
        timestamp=old_timestamp,
        record=_get_v2_provenance("generate_trees", trees_group.attrs))

    num_rows = trees_group["node"].shape[0]
    num_edges = 2 * num_rows
    # Each record value is written to an even slot and the odd slot after it.
    parent = np.zeros(num_edges, dtype=np.int32)
    parent[0::2] = trees_group["node"]
    parent[1::2] = trees_group["node"]
    left = np.zeros(num_edges, dtype=np.float64)
    left[0::2] = trees_group["left"]
    left[1::2] = trees_group["left"]
    right = np.zeros(num_edges, dtype=np.float64)
    right[0::2] = trees_group["right"]
    right[1::2] = trees_group["right"]
    child = np.array(trees_group["children"], dtype=np.int32).flatten()
    edges = msprime.EdgeTable()
    edges.set_columns(left=left, right=right, parent=parent, child=child)

    cr_node = np.array(trees_group["node"], dtype=np.int32)
    num_nodes = max(np.max(child), np.max(cr_node)) + 1
    sample_size = np.min(cr_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE
    population = np.zeros(num_nodes, dtype=np.int32)
    time = np.zeros(num_nodes, dtype=np.float64)
    time[cr_node] = np.array(trees_group["time"])
    population[cr_node] = np.array(trees_group["population"], dtype=np.int32)
    if "samples" in root:
        samples_group = root["samples"]
        population[:sample_size] = samples_group["population"]
        if "time" in samples_group:
            time[:sample_size] = samples_group["time"]
    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags, population=population, time=time)

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        mutations_group = root["mutations"]
        _convert_hdf5_mutations(
            mutations_group, sites, mutations, remove_duplicate_positions)
        provenances.add_row(
            timestamp=old_timestamp,
            record=_get_v2_provenance(
                "generate_mutations", mutations_group.attrs))
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=provenances)
population=nodes['population'], time=nodes['generation']) es = msprime.EdgeTable() es.set_columns(left=edges['left'], right=edges['right'], parent=edges['parent'], child=edges['child']) st = msprime.SiteTable() st.set_columns(position=mutas['position'], ancestral_state=np.zeros(len(mutas['position']), np.int8), ancestral_state_length=np.ones(len(mutas['position']), np.uint32)) mt = msprime.MutationTable() mt.set_columns(site=np.arange(len(mutas['node_id']), dtype=np.int32), node=mutas['node_id'], derived_state=np.ones(len(mutas['node_id']), np.int8), derived_state_length=np.ones(len(mutas['node_id']), np.uint32)) # Sort msprime.sort_tables(nodes=nt, edges=es, sites=st, mutations=mt) print("num total mutations: ", st.num_rows) # Simplify: this is where the magic happens ## PLR: since these tables aren't valid, you gotta use simplify_tables, not load them into a tree sequence nt_c = nt.copy() es_c = es.copy() st_c = st.copy()
def _load_legacy_hdf5_v10(root, remove_duplicate_positions=False):
    """
    Builds a tree sequence by copying the columns stored in the groups of a
    legacy version 10 file into the corresponding tables.

    Migration, site, mutation and provenance columns are only copied when
    the group contains its first column; metadata columns are optional
    throughout. An upgrade provenance record is appended at the end.
    """
    # We cannot have duplicate positions in v10, so this parameter is ignored

    def optional_metadata(group):
        # Metadata columns may be absent; None is passed on in that case.
        if "metadata" in group:
            return group["metadata"], group["metadata_offset"]
        return None, None

    nodes_group = root["nodes"]
    nodes = msprime.NodeTable()
    metadata, metadata_offset = optional_metadata(nodes_group)
    nodes.set_columns(
        flags=nodes_group["flags"],
        population=nodes_group["population"],
        time=nodes_group["time"],
        metadata=metadata,
        metadata_offset=metadata_offset)

    edges_group = root["edges"]
    edges = msprime.EdgeTable()
    edges.set_columns(
        left=edges_group["left"],
        right=edges_group["right"],
        parent=edges_group["parent"],
        child=edges_group["child"])

    migrations_group = root["migrations"]
    migrations = msprime.MigrationTable()
    if "left" in migrations_group:
        migrations.set_columns(
            left=migrations_group["left"],
            right=migrations_group["right"],
            node=migrations_group["node"],
            source=migrations_group["source"],
            dest=migrations_group["dest"],
            time=migrations_group["time"])

    sites_group = root["sites"]
    sites = msprime.SiteTable()
    if "position" in sites_group:
        metadata, metadata_offset = optional_metadata(sites_group)
        sites.set_columns(
            position=sites_group["position"],
            ancestral_state=sites_group["ancestral_state"],
            ancestral_state_offset=sites_group["ancestral_state_offset"],
            metadata=metadata,
            metadata_offset=metadata_offset)

    mutations_group = root["mutations"]
    mutations = msprime.MutationTable()
    if "site" in mutations_group:
        metadata, metadata_offset = optional_metadata(mutations_group)
        mutations.set_columns(
            site=mutations_group["site"],
            node=mutations_group["node"],
            parent=mutations_group["parent"],
            derived_state=mutations_group["derived_state"],
            derived_state_offset=mutations_group["derived_state_offset"],
            metadata=metadata,
            metadata_offset=metadata_offset)

    provenances_group = root["provenances"]
    provenances = msprime.ProvenanceTable()
    if "timestamp" in provenances_group:
        timestamp = provenances_group["timestamp"]
        timestamp_offset = provenances_group["timestamp_offset"]
        if "record" in provenances_group:
            record = provenances_group["record"]
            record_offset = provenances_group["record_offset"]
        else:
            # No stored records: fall back to arrays shaped like the
            # timestamp columns, with all-zero offsets.
            record = np.empty_like(timestamp)
            record_offset = np.zeros_like(timestamp_offset)
        provenances.set_columns(
            timestamp=timestamp,
            timestamp_offset=timestamp_offset,
            record=record,
            record_offset=record_offset)
    provenances.add_row(_get_upgrade_provenance(root))

    return msprime.load_tables(
        nodes=nodes, edges=edges, migrations=migrations, sites=sites,
        mutations=mutations, provenances=provenances)