Exemple #1
0
def insert_branch_mutations(ts, mutations_per_branch=1):
    """
    Returns a copy of the specified tree sequence with mutations on every
    branch in every tree.

    One site is created per tree, positioned at the left end of the tree's
    interval, with ancestral state '0'. Every node that has a parent then
    receives ``mutations_per_branch`` mutations at that site. Each mutation
    flips the state between '0' and '1' and records, as its ``parent``, the
    previous mutation met on the path towards the root (-1 if there is none).

    :param ts: The tree sequence to copy.
    :param int mutations_per_branch: The number of mutations to place on each
        branch. With zero, sites are still created but carry no mutations.
    :return: A tree sequence with the nodes and edges of ``ts`` and the newly
        generated sites and mutations.
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site = len(sites)
        sites.add_row(position=tree.interval[0], ancestral_state='0')
        for root in tree.roots:
            # Allelic state and most recent mutation seen at each visited node.
            state = {root: 0}
            mutation = {root: -1}
            stack = [root]
            while stack:
                u = stack.pop()
                stack.extend(tree.children(u))
                v = tree.parent(u)
                if v == msprime.NULL_NODE:
                    continue
                state[u] = state[v]
                # Start from the parent's latest mutation so that descendants
                # can always look this node up, even when no mutation is
                # placed on this branch (mutations_per_branch == 0).
                mutation[u] = mutation[v]
                for _ in range(mutations_per_branch):
                    state[u] = (state[u] + 1) % 2
                    parent = mutation[u]
                    mutation[u] = len(mutations)
                    mutations.add_row(
                        site=site, node=u, derived_state=str(state[u]),
                        parent=parent)
    tables = ts.tables
    add_provenance(tables.provenances, "insert_branch_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
Exemple #2
0
def jukes_cantor(ts, num_sites, mu, multiple_per_node=True, seed=None):
    """
    Returns a copy of the specified tree sequence with Jukes-Cantor mutations
    applied at the specified rate at the specified number of sites. Site
    positions are drawn uniformly along the sequence.
    """
    random.seed(seed)
    positions = sorted(
        ts.sequence_length * random.random() for _ in range(num_sites))
    sites = msprime.SiteTable(num_sites)
    mutations = msprime.MutationTable(num_sites)
    tree_iter = ts.trees()
    current_tree = next(tree_iter)
    for position in positions:
        # Positions are visited in increasing order, so the tree iterator
        # only ever needs to move forwards.
        while current_tree.interval[1] <= position:
            current_tree = next(tree_iter)
        generate_site_mutations(
            current_tree, position, mu, sites, mutations,
            multiple_per_node=multiple_per_node)
    tables = ts.dump_tables()
    add_provenance(tables.provenances, "jukes_cantor")
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=sites,
        mutations=mutations,
        provenances=tables.provenances)
Exemple #3
0
def insert_multichar_mutations(ts, seed=1, max_len=10):
    """
    Returns a copy of the specified tree sequence with one mutation, whose
    ancestral and derived states may span several characters, placed on a
    randomly chosen branch in every tree.
    """
    rng = random.Random(seed)
    alphabet = ["A", "C", "T", "G"]

    def random_state():
        # A single random letter repeated between 0 and max_len times.
        return rng.choice(alphabet) * rng.randint(0, max_len)

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site_id = len(sites)
        ancestral_state = random_state()
        sites.add_row(position=tree.interval[0], ancestral_state=ancestral_state)
        # Any node except the root has a branch above it to mutate.
        candidates = list(tree.nodes())
        candidates.remove(tree.root)
        node = rng.choice(candidates)
        # Redraw until the derived state differs from the ancestral one.
        derived_state = random_state()
        while derived_state == ancestral_state:
            derived_state = random_state()
        mutations.add_row(site=site_id, node=node, derived_state=derived_state)
    tables = ts.tables
    add_provenance(tables.provenances, "insert_multichar_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
Exemple #4
0
def _load_legacy_hdf5_v3(root, remove_duplicate_positions):
    """
    Converts the contents of a version 3 legacy HDF5 file into a tree
    sequence.

    The coalescence records under ``root["trees"]["records"]`` are expanded
    into one edge per (record, child) pair, node and (optionally) mutation
    data are copied into tables, existing provenance records are carried over
    with a placeholder timestamp, and an upgrade provenance record is added.

    :param root: The root group of the legacy file, indexable by name.
    :param remove_duplicate_positions: Forwarded unchanged to
        ``_convert_hdf5_mutations``.
    :return: The tree sequence returned by ``msprime.load_tables``.
    """
    # get the trees group for the records and samples
    trees_group = root["trees"]
    nodes_group = trees_group["nodes"]
    time = np.array(nodes_group["time"])

    breakpoints = np.array(trees_group["breakpoints"])
    records_group = trees_group["records"]
    left_indexes = np.array(records_group["left"])
    right_indexes = np.array(records_group["right"])
    record_node = np.array(records_group["node"], dtype=np.int32)
    num_nodes = time.shape[0]
    # Every node with an ID below the smallest record parent is flagged as a
    # sample.
    sample_size = np.min(record_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE

    # Each record contributes one edge per child, so repeat the record's
    # interval and parent node once for every child it lists. The record
    # coordinates are indexes into the breakpoints array.
    children_length = np.array(records_group["num_children"], dtype=np.uint32)
    repeats = children_length.astype(np.intp)
    left = np.repeat(breakpoints[left_indexes], repeats).astype(
        np.float64, copy=False)
    right = np.repeat(breakpoints[right_indexes], repeats).astype(
        np.float64, copy=False)
    parent = np.repeat(record_node, repeats)

    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags,
                      time=nodes_group["time"],
                      population=nodes_group["population"])
    edges = msprime.EdgeTable()
    edges.set_columns(left=left,
                      right=right,
                      parent=parent,
                      child=records_group["children"])
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        _convert_hdf5_mutations(root["mutations"], sites, mutations,
                                remove_duplicate_positions)
    # Legacy provenance records carry no timestamp of their own, so they are
    # all stamped with the minimum representable datetime.
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    if "provenance" in root:
        for record in root["provenance"]:
            provenances.add_row(timestamp=old_timestamp, record=record)
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(nodes=nodes,
                        edges=edges,
                        sites=sites,
                        mutations=mutations)
    return msprime.load_tables(nodes=nodes,
                               edges=edges,
                               sites=sites,
                               mutations=mutations,
                               provenances=provenances)
Exemple #5
0
def make_tree_add_mutations(nodes, edges, mutrate):
    """
    Generates mutations at rate ``mutrate`` over the given node and edge
    tables using a random generator seeded with 42.

    Returns a tuple of the tree sequence loaded from the tables and the
    populated site table.
    """
    site_table = msprime.SiteTable()
    mutation_table = msprime.MutationTable()
    generator = msprime.MutationGenerator(msprime.RandomGenerator(42), mutrate)
    generator.generate(nodes, edges, site_table, mutation_table)
    tree_sequence = msprime.load_tables(
        nodes=nodes, edgesets=edges, sites=site_table, mutations=mutation_table)
    return (tree_sequence, site_table)
Exemple #6
0
def general_mutation_example():
    """
    Simulates a small tree sequence and returns a copy of it carrying two
    hand-written sites (with metadata), each with one mutation over node 0.
    """
    ts = msprime.simulate(10, recombination_rate=1, length=10, random_seed=2)
    node_table = msprime.NodeTable()
    edge_table = msprime.EdgeTable()
    ts.dump_tables(nodes=node_table, edges=edge_table)

    site_table = msprime.SiteTable()
    site_table.add_row(position=0, ancestral_state="A", metadata=b"{}")
    site_table.add_row(position=1, ancestral_state="C", metadata=b"{'id':1}")

    mutation_table = msprime.MutationTable()
    mutation_table.add_row(site=0, node=0, derived_state="T")
    mutation_table.add_row(site=1, node=0, derived_state="G")

    return msprime.load_tables(
        nodes=node_table,
        edges=edge_table,
        sites=site_table,
        mutations=mutation_table)
Exemple #7
0
 def test_sites(self):
     # Pickled metadata attached to a site must survive loading the tables
     # into a tree sequence unchanged.
     original = ExampleMetadata(one="node1", two="node2")
     payload = pickle.dumps(original)
     node_table = msprime.NodeTable()
     edge_table = msprime.EdgeTable()
     site_table = msprime.SiteTable()
     mutation_table = msprime.MutationTable()
     site_table.add_row(position=0.1, ancestral_state="A", metadata=payload)
     ts = msprime.load_tables(
         nodes=node_table,
         edges=edge_table,
         sites=site_table,
         mutations=mutation_table,
         sequence_length=1)
     loaded_site = ts.site(0)
     self.assertEqual(loaded_site.position, 0.1)
     self.assertEqual(loaded_site.ancestral_state, "A")
     self.assertEqual(loaded_site.metadata, payload)
     restored = pickle.loads(loaded_site.metadata)
     self.assertEqual(restored.one, original.one)
     self.assertEqual(restored.two, original.two)
Exemple #8
0
def permute_nodes(ts, node_map):
    """
    Returns a copy of the specified tree sequence in which node ``j`` of the
    original becomes node ``node_map[j]``. Edges and mutations are rewritten
    to refer to the new node IDs, and the tables are sorted before loading.
    """
    num_nodes = ts.num_nodes
    # Invert the map: for each new node ID, the original node it came from.
    reverse_map = [0] * len(node_map)
    for original_id in range(num_nodes):
        reverse_map[node_map[original_id]] = original_id

    original_nodes = list(ts.nodes())
    new_nodes = msprime.NodeTable()
    for new_id in range(num_nodes):
        source = original_nodes[reverse_map[new_id]]
        new_nodes.add_row(
            flags=source.flags,
            metadata=source.metadata,
            population=source.population,
            time=source.time)

    new_edges = msprime.EdgeTable()
    for edge in ts.edges():
        new_edges.add_row(
            left=edge.left,
            right=edge.right,
            parent=node_map[edge.parent],
            child=node_map[edge.child])

    new_sites = msprime.SiteTable()
    new_mutations = msprime.MutationTable()
    for site in ts.sites():
        new_sites.add_row(
            position=site.position, ancestral_state=site.ancestral_state)
        for mutation in site.mutations:
            new_mutations.add_row(
                site=site.id,
                derived_state=mutation.derived_state,
                node=node_map[mutation.node])

    msprime.sort_tables(
        nodes=new_nodes,
        edges=new_edges,
        sites=new_sites,
        mutations=new_mutations)
    provenances = ts.dump_tables().provenances
    add_provenance(provenances, "permute_nodes")
    return msprime.load_tables(
        nodes=new_nodes,
        edges=new_edges,
        sites=new_sites,
        mutations=new_mutations,
        provenances=provenances)
Exemple #9
0
def insert_branch_sites(ts):
    """
    Returns a copy of the specified tree sequence in which every branch of
    every tree carries its own site with a single '0' -> '1' mutation. The
    sites of a tree are spaced evenly from the left end of its interval.
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        node_ids = list(tree.nodes())
        start, end = tree.interval
        # Spacing is based on the node count, so all sites stay inside the
        # tree's interval.
        step = (end - start) / len(node_ids)
        position = start
        for node in node_ids:
            if tree.parent(node) == msprime.NULL_NODE:
                # Roots have no branch above them.
                continue
            site_id = sites.add_row(position=position, ancestral_state='0')
            mutations.add_row(site=site_id, node=node, derived_state='1')
            position += step
    tables = ts.tables
    add_provenance(tables.provenances, "insert_branch_sites")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
Exemple #10
0
 def test_mutations(self):
     # Pickled metadata attached to a mutation must survive loading the
     # tables into a tree sequence unchanged.
     original = ExampleMetadata(one="node1", two="node2")
     payload = pickle.dumps(original)
     node_table = msprime.NodeTable()
     edge_table = msprime.EdgeTable()
     site_table = msprime.SiteTable()
     mutation_table = msprime.MutationTable()
     node_table.add_row(time=0)
     site_table.add_row(position=0.1, ancestral_state="A")
     mutation_table.add_row(
         site=0, node=0, derived_state="T", metadata=payload)
     ts = msprime.load_tables(
         nodes=node_table,
         edges=edge_table,
         sites=site_table,
         mutations=mutation_table,
         sequence_length=1)
     loaded = ts.site(0).mutations[0]
     self.assertEqual(loaded.site, 0)
     self.assertEqual(loaded.node, 0)
     self.assertEqual(loaded.derived_state, "T")
     self.assertEqual(loaded.metadata, payload)
     restored = pickle.loads(loaded.metadata)
     self.assertEqual(restored.one, original.one)
     self.assertEqual(restored.two, original.two)
def ts_private_mutations_only(ts):
    """
    Returns a new tree sequence built from the tables of ``ts`` with its
    sites and mutations replaced: each sample gets one site at a random
    position in [0, 1) with ancestral state '0' and a single mutation to '1'
    over that sample.
    """
    table_dict = ts.dump_tables().asdict()

    site_table = msprime.SiteTable()
    mutation_table = msprime.MutationTable()
    # Sorted so that sites are added in increasing position order.
    positions = sorted(np.random.random() for _ in range(ts.num_samples))
    for index, sample in enumerate(ts.samples()):
        site_table.add_row(positions[index], '0')
        mutation_table.add_row(index, sample, '1')

    table_dict['sites'] = site_table.asdict()
    table_dict['mutations'] = mutation_table.asdict()

    collection = msprime.tskit.tables.TableCollection.fromdict(table_dict)
    return collection.tree_sequence()
def write_vcf(chrom):
    """
    Simulates mutations on the tree sequence stored for ``chrom`` and writes
    the result as a haploid VCF.

    Input and output paths, the mutation rate and the random seed are read
    from the module-level ``args`` and ``seeds`` objects; progress messages
    are written to the module-level ``logfile``.

    :param chrom: Key identifying the chromosome in ``args`` and ``seeds``.
    :return: True once the VCF has been written.
    """
    treefile = args.tree_file[chrom]
    # The output file is opened before any work is done, and the context
    # manager guarantees it is flushed and closed even if a later step fails.
    with open(args.vcffile[chrom], "w") as vcf:
        mut_rate = args.mut_rate[chrom]
        seed = seeds[chrom]
        logfile.write("Simulating mutations on " + treefile + "\n")
        ts = msprime.load(treefile)
        rng = msprime.RandomGenerator(seed)
        nodes = msprime.NodeTable()
        edgesets = msprime.EdgesetTable()
        sites = msprime.SiteTable()
        mutations = msprime.MutationTable()
        migrations = msprime.MigrationTable()
        ts.dump_tables(nodes=nodes, edgesets=edgesets, migrations=migrations)
        mutgen = msprime.MutationGenerator(rng, mut_rate)
        mutgen.generate(nodes, edgesets, sites, mutations)
        logfile.write("Saving to " + args.vcffile[chrom] + "\n")
        mutated_ts = msprime.load_tables(nodes=nodes,
                                         edgesets=edgesets,
                                         sites=sites,
                                         mutations=mutations)
        mutated_ts.write_vcf(vcf, ploidy=1)

    return True
Exemple #13
0
    def get_tree_sequence(self, rescale_positions=True, all_sites=False):
        """
        Returns the current state of the built tree sequence. All samples and
        ancestors will have the sample node flag set.

        :param bool rescale_positions: If True, the edge coordinates dumped
            by the tree sequence builder are used as indexes into the variant
            site positions from the sample data, and the sequence length is
            taken from the sample data (or set to the last position plus one
            if it is missing or too small). If False, site indexes are used
            directly as positions.
        :param bool all_sites: If True, the singleton and invariant sites
            recorded in the sample data are appended to the site table, with
            one mutation over the carrying sample for each singleton.
        :return: The tree sequence loaded from the assembled, sorted tables.
        """
        # TODO Change the API here to ask whether we want a final tree sequence
        # or not. In the latter case we also need to translate the ancestral
        # and derived states to the input values.
        tsb = self.tree_sequence_builder
        flags, time = tsb.dump_nodes()
        nodes = msprime.NodeTable()
        nodes.set_columns(flags=flags, time=time)

        left, right, parent, child = tsb.dump_edges()
        if rescale_positions:
            position = self.sample_data.position[:]
            sequence_length = self.sample_data.sequence_length
            if sequence_length is None or sequence_length < position[-1]:
                sequence_length = position[-1] + 1
            # Subset down to the variants.
            position = position[self.sample_data.variant_site[:]]
            # Lookup table from site index to coordinate: index 0 maps to the
            # start of the sequence and the extra final entry maps to its end.
            x = np.hstack([position, [sequence_length]])
            x[0] = 0
            left = x[left]
            right = x[right]
        else:
            position = np.arange(tsb.num_sites)
            sequence_length = max(1, tsb.num_sites)

        edges = msprime.EdgeTable()
        edges.set_columns(left=left, right=right, parent=parent, child=child)

        # Every state is a single character, so the offsets are 0, 1, ..., n.
        sites = msprime.SiteTable()
        sites.set_columns(
            position=position,
            ancestral_state=np.zeros(tsb.num_sites, dtype=np.int8) + ord('0'),
            ancestral_state_offset=np.arange(tsb.num_sites + 1,
                                             dtype=np.uint32))
        mutations = msprime.MutationTable()
        site, node, derived_state, parent = tsb.dump_mutations()
        # Shift the dumped numeric states so they become the ASCII codes of
        # the corresponding digit characters.
        derived_state += ord('0')
        mutations.set_columns(site=site,
                              node=node,
                              derived_state=derived_state,
                              derived_state_offset=np.arange(
                                  tsb.num_mutations + 1, dtype=np.uint32),
                              parent=parent)
        if all_sites:
            # NOTE(review): these positions always come from the sample data,
            # even when rescale_positions is False -- confirm this is intended.
            # Append the sites and mutations for each singleton.
            num_singletons = self.sample_data.num_singleton_sites
            singleton_site = self.sample_data.singleton_site[:]
            singleton_sample = self.sample_data.singleton_sample[:]
            pos = self.sample_data.position[:]
            new_sites = np.arange(len(sites),
                                  len(sites) + num_singletons,
                                  dtype=np.int32)
            sites.append_columns(
                position=pos[singleton_site],
                ancestral_state=np.zeros(num_singletons, dtype=np.int8) +
                ord('0'),
                ancestral_state_offset=np.arange(num_singletons + 1,
                                                 dtype=np.uint32))
            mutations.append_columns(
                site=new_sites,
                node=self.sample_ids[singleton_sample],
                derived_state=np.zeros(num_singletons, dtype=np.int8) +
                ord('1'),
                derived_state_offset=np.arange(num_singletons + 1,
                                               dtype=np.uint32))
            # Get the invariant sites
            num_invariants = self.sample_data.num_invariant_sites
            invariant_site = self.sample_data.invariant_site[:]
            sites.append_columns(
                position=pos[invariant_site],
                ancestral_state=np.zeros(num_invariants, dtype=np.int8) +
                ord('0'),
                ancestral_state_offset=np.arange(num_invariants + 1,
                                                 dtype=np.uint32))

        msprime.sort_tables(nodes, edges, sites=sites, mutations=mutations)
        return msprime.load_tables(nodes=nodes,
                                   edges=edges,
                                   sites=sites,
                                   mutations=mutations,
                                   sequence_length=sequence_length)
Exemple #14
0
def _load_legacy_hdf5_v2(root, remove_duplicate_positions):
    """
    Converts the contents of a version 2 legacy HDF5 file into a tree
    sequence: two edges are written per coalescence record, node tables are
    reconstructed from the records (and the optional samples group),
    mutations are converted if present, and provenance records are added for
    each legacy step plus the upgrade itself.
    """
    trees_group = root["trees"]
    legacy_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    provenances.add_row(
        timestamp=legacy_timestamp,
        record=_get_v2_provenance("generate_trees", trees_group.attrs))

    # Two consecutive edge rows per record, sharing the record's parent node
    # and interval; the flattened children column supplies the child IDs.
    num_records = trees_group["node"].shape[0]
    num_edges = 2 * num_records
    parent = np.zeros(num_edges, dtype=np.int32)
    parent[0::2] = trees_group["node"]
    parent[1::2] = trees_group["node"]
    left = np.zeros(num_edges, dtype=np.float64)
    left[0::2] = trees_group["left"]
    left[1::2] = trees_group["left"]
    right = np.zeros(num_edges, dtype=np.float64)
    right[0::2] = trees_group["right"]
    right[1::2] = trees_group["right"]
    child = np.array(trees_group["children"], dtype=np.int32).flatten()
    edges = msprime.EdgeTable()
    edges.set_columns(left=left, right=right, parent=parent, child=child)

    # Node IDs are implied by the records: the largest ID referenced sets the
    # table size and IDs below the smallest record parent are samples.
    record_node = np.array(trees_group["node"], dtype=np.int32)
    num_nodes = max(np.max(child), np.max(record_node)) + 1
    sample_size = np.min(record_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE
    population = np.zeros(num_nodes, dtype=np.int32)
    time = np.zeros(num_nodes, dtype=np.float64)
    record_population = np.array(trees_group["population"], dtype=np.int32)
    record_time = np.array(trees_group["time"])
    time[record_node] = record_time
    population[record_node] = record_population
    if "samples" in root:
        samples_group = root["samples"]
        population[:sample_size] = samples_group["population"]
        if "time" in samples_group:
            time[:sample_size] = samples_group["time"]
    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags, population=population, time=time)

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        mutations_group = root["mutations"]
        _convert_hdf5_mutations(
            mutations_group, sites, mutations, remove_duplicate_positions)
        provenances.add_row(
            timestamp=legacy_timestamp,
            record=_get_v2_provenance(
                "generate_mutations", mutations_group.attrs))
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes,
        edges=edges,
        sites=sites,
        mutations=mutations,
        provenances=provenances)
                   population=nodes['population'],
                   time=nodes['generation'])

    es = msprime.EdgeTable()
    es.set_columns(left=edges['left'],
                   right=edges['right'],
                   parent=edges['parent'],
                   child=edges['child'])

    st = msprime.SiteTable()
    st.set_columns(position=mutas['position'],
                   ancestral_state=np.zeros(len(mutas['position']), np.int8),
                   ancestral_state_length=np.ones(len(mutas['position']),
                                                  np.uint32))

    mt = msprime.MutationTable()
    mt.set_columns(site=np.arange(len(mutas['node_id']), dtype=np.int32),
                   node=mutas['node_id'],
                   derived_state=np.ones(len(mutas['node_id']), np.int8),
                   derived_state_length=np.ones(len(mutas['node_id']),
                                                np.uint32))

    # Sort
    msprime.sort_tables(nodes=nt, edges=es, sites=st, mutations=mt)
    print("num total mutations: ", st.num_rows)

    # Simplify: this is where the magic happens
    ## PLR: since these tables aren't valid, you gotta use simplify_tables, not load them into a tree sequence
    nt_c = nt.copy()
    es_c = es.copy()
    st_c = st.copy()
Exemple #16
0
def _load_legacy_hdf5_v10(root, remove_duplicate_positions=False):
    """
    Copies the node, edge, migration, site, mutation and provenance groups of
    a version 10 legacy HDF5 file into tables, appends an upgrade provenance
    record and returns the resulting tree sequence.

    ``remove_duplicate_positions`` is accepted for signature compatibility
    only: v10 files cannot contain duplicate positions, so it is ignored.
    """
    def optional_metadata(group):
        # Metadata columns are only stored when present; report them as a
        # (metadata, metadata_offset) pair, or (None, None) when absent.
        if "metadata" in group:
            return group["metadata"], group["metadata_offset"]
        return None, None

    nodes_group = root["nodes"]
    nodes = msprime.NodeTable()
    node_metadata, node_metadata_offset = optional_metadata(nodes_group)
    nodes.set_columns(
        flags=nodes_group["flags"],
        population=nodes_group["population"],
        time=nodes_group["time"],
        metadata=node_metadata,
        metadata_offset=node_metadata_offset)

    edges_group = root["edges"]
    edges = msprime.EdgeTable()
    edges.set_columns(
        left=edges_group["left"],
        right=edges_group["right"],
        parent=edges_group["parent"],
        child=edges_group["child"])

    # The remaining groups may be empty, in which case their columns are
    # missing and the corresponding table is left empty.
    migrations_group = root["migrations"]
    migrations = msprime.MigrationTable()
    if "left" in migrations_group:
        migrations.set_columns(
            left=migrations_group["left"],
            right=migrations_group["right"],
            node=migrations_group["node"],
            source=migrations_group["source"],
            dest=migrations_group["dest"],
            time=migrations_group["time"])

    sites_group = root["sites"]
    sites = msprime.SiteTable()
    if "position" in sites_group:
        site_metadata, site_metadata_offset = optional_metadata(sites_group)
        sites.set_columns(
            position=sites_group["position"],
            ancestral_state=sites_group["ancestral_state"],
            ancestral_state_offset=sites_group["ancestral_state_offset"],
            metadata=site_metadata,
            metadata_offset=site_metadata_offset)

    mutations_group = root["mutations"]
    mutations = msprime.MutationTable()
    if "site" in mutations_group:
        mutation_metadata, mutation_metadata_offset = optional_metadata(
            mutations_group)
        mutations.set_columns(
            site=mutations_group["site"],
            node=mutations_group["node"],
            parent=mutations_group["parent"],
            derived_state=mutations_group["derived_state"],
            derived_state_offset=mutations_group["derived_state_offset"],
            metadata=mutation_metadata,
            metadata_offset=mutation_metadata_offset)

    provenances_group = root["provenances"]
    provenances = msprime.ProvenanceTable()
    if "timestamp" in provenances_group:
        timestamp = provenances_group["timestamp"]
        timestamp_offset = provenances_group["timestamp_offset"]
        if "record" not in provenances_group:
            # No record column stored: substitute placeholders shaped like
            # the timestamp columns, with all-zero offsets.
            record = np.empty_like(timestamp)
            record_offset = np.zeros_like(timestamp_offset)
        else:
            record = provenances_group["record"]
            record_offset = provenances_group["record_offset"]
        provenances.set_columns(
            timestamp=timestamp,
            timestamp_offset=timestamp_offset,
            record=record,
            record_offset=record_offset)
    provenances.add_row(_get_upgrade_provenance(root))

    return msprime.load_tables(
        nodes=nodes,
        edges=edges,
        migrations=migrations,
        sites=sites,
        mutations=mutations,
        provenances=provenances)