Beispiel #1
0
def insert_branch_mutations(ts, mutations_per_branch=1):
    """
    Returns a copy of the specified tree sequence with
    ``mutations_per_branch`` mutations on every branch in every tree.

    One site is created per tree, positioned at the left end of the tree's
    interval, with ancestral state '0'. Walking down from each root, every
    non-root node receives ``mutations_per_branch`` mutations at that site.
    Each mutation flips the state inherited from above between '0' and '1'
    and records the previous mutation on the path to the root as its parent.

    If ``mutations_per_branch`` is 0 the sites are still created, but no
    mutations are added.
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site = len(sites)
        sites.add_row(position=tree.interval[0], ancestral_state='0')
        for root in tree.roots:
            # state[u]: allelic state (0 or 1) at node u for this site.
            # mutation[u]: ID of the latest mutation at or above u (-1: none).
            state = {root: 0}
            mutation = {root: -1}
            stack = [root]
            while stack:
                u = stack.pop()
                stack.extend(tree.children(u))
                v = tree.parent(u)
                if v == msprime.NULL_NODE:
                    continue
                state[u] = state[v]
                # Inherit the parent's entry first so that descendants can
                # always look this node up, even when no mutation is added
                # on this branch (mutations_per_branch == 0).
                mutation[u] = mutation[v]
                for _ in range(mutations_per_branch):
                    state[u] = (state[u] + 1) % 2
                    parent = mutation[u]
                    mutation[u] = len(mutations)
                    mutations.add_row(
                        site=site, node=u, derived_state=str(state[u]),
                        parent=parent)
    tables = ts.tables
    add_provenance(tables.provenances, "insert_branch_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
Beispiel #2
0
 def __init__(self, ts, sample, filter_zero_mutation_sites=True):
     """
     Stores the input tree sequence and parameters, creates the tables and
     bookkeeping structures held on the instance, registers every requested
     sample through ``insert_sample`` and indexes the input mutations by
     the node they sit on.

     :param ts: The input tree sequence.
     :param sample: The input node IDs to treat as samples.
     :param filter_zero_mutation_sites: Stored on the instance; not used
         further in this method.
     """
     self.ts = ts
     self.n = len(sample)
     self.sequence_length = ts.sequence_length
     self.filter_zero_mutation_sites = filter_zero_mutation_sites
     self.num_mutations = ts.num_mutations
     self.input_sites = list(ts.sites())
     # A maps input node IDs to the extant ancestor chain. Once the algorithm
     # has processed the ancestors, they are removed from the map.
     self.A = {}
     # Each table is created exactly once (the mutation table used to be
     # constructed twice, with the first instance thrown away).
     self.node_table = msprime.NodeTable(ts.num_nodes)
     self.edge_table = msprime.EdgeTable(ts.num_edges)
     self.site_table = msprime.SiteTable(ts.num_sites)
     self.mutation_table = msprime.MutationTable(ts.num_mutations)
     self.edge_buffer = []
     self.node_id_map = {}
     self.mutation_node_map = [-1 for _ in range(self.num_mutations)]
     self.samples = set(sample)
     for sample_id in sample:
         self.insert_sample(sample_id)
     # We keep a map of input nodes to mutations: for every input node, the
     # list of (site position, mutation ID) pairs of the mutations on it.
     self.mutation_map = [[] for _ in range(ts.num_nodes)]
     # Fetch the input tables once rather than once per column.
     input_tables = ts.tables
     position = input_tables.sites.position
     site = input_tables.mutations.site
     node = input_tables.mutations.node
     for mutation_id in range(ts.num_mutations):
         site_position = position[site[mutation_id]]
         self.mutation_map[node[mutation_id]].append(
             (site_position, mutation_id))
Beispiel #3
0
def insert_multichar_mutations(ts, seed=1, max_len=10):
    """
    Returns a copy of the specified tree sequence with multiple character
    mutations on a randomly chosen branch in every tree.

    One site is created per tree at the left end of the tree's interval. Its
    ancestral state is a randomly chosen letter from "ACTG" repeated between
    0 and ``max_len`` times. A single mutation with a different state, drawn
    the same way, is placed on a randomly chosen non-root node.

    :param ts: The input tree sequence.
    :param seed: Seed for the private random number generator.
    :param max_len: Maximum length of the generated allele strings.
    :raises ValueError: If ``max_len`` is less than 1. With ``max_len == 0``
        every state would be the empty string, so a derived state differing
        from the ancestral state could never be drawn.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")
    rng = random.Random(seed)
    letters = ["A", "C", "T", "G"]
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site = len(sites)
        ancestral_state = rng.choice(letters) * rng.randint(0, max_len)
        sites.add_row(position=tree.interval[0], ancestral_state=ancestral_state)
        # Candidate nodes are all nodes of the tree except its root.
        nodes = list(tree.nodes())
        nodes.remove(tree.root)
        u = rng.choice(nodes)
        # Redraw until the derived state differs from the ancestral state.
        derived_state = ancestral_state
        while ancestral_state == derived_state:
            derived_state = rng.choice(letters) * rng.randint(0, max_len)
        mutations.add_row(site=site, node=u, derived_state=derived_state)
    tables = ts.tables
    add_provenance(tables.provenances, "insert_multichar_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
Beispiel #4
0
def jukes_cantor(ts, num_sites, mu, multiple_per_node=True, seed=None):
    """
    Returns a copy of the specified tree sequence with Jukes-Cantor mutations
    applied at the specified rate at the specified number of sites. Site
    positions are chosen uniformly along the sequence.

    The module-level ``random`` generator is seeded with ``seed`` before the
    positions are drawn.
    """
    random.seed(seed)
    positions = sorted(
        ts.sequence_length * random.random() for _ in range(num_sites))
    sites = msprime.SiteTable(num_sites)
    mutations = msprime.MutationTable(num_sites)
    # Positions are sorted, so the trees only ever need to be advanced.
    tree_iter = ts.trees()
    tree = next(tree_iter)
    for position in positions:
        while position >= tree.interval[1]:
            tree = next(tree_iter)
        generate_site_mutations(
            tree, position, mu, sites, mutations,
            multiple_per_node=multiple_per_node)
    tables = ts.dump_tables()
    add_provenance(tables.provenances, "jukes_cantor")
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=sites,
        mutations=mutations,
        provenances=tables.provenances)
Beispiel #5
0
def test():
    """
    Collects mutation counts from two sources and returns them as a pair of
    lists: 1000 tree sequences produced by ``wfrec`` with mutations added by
    an msprime ``MutationGenerator``, and 1000 replicates of
    ``msprime.simulate``.
    """
    np.random.seed(42)
    msp_rng = msprime.RandomGenerator(84)
    wfrec_counts = []
    for replicate in range(1000):
        print(replicate)
        ts = wfrec(100, 100, 1000, 100)
        tables = ts.tables
        sites = msprime.SiteTable()
        mutations = msprime.MutationTable()
        mutgen = msprime.MutationGenerator(msp_rng, 100. / 4000.)
        mutgen.generate(tables.nodes, tables.edges, sites, mutations)
        ts = ts.load_tables(
            nodes=tables.nodes,
            edges=tables.edges,
            sites=sites,
            mutations=mutations)
        wfrec_counts.append(ts.num_mutations)
    simulate_counts = [
        replicate_ts.num_mutations
        for replicate_ts in msprime.simulate(
            100,
            recombination_rate=0,
            mutation_rate=25,
            num_replicates=1000)
    ]
    return wfrec_counts, simulate_counts
def get_common_mutations_ts(args, tree_sequence, log):
    """
    Returns a copy of ``tree_sequence`` restricted to sites whose frequency
    lies strictly between ``args.maf`` and ``1 - args.maf``.

    The frequency of a site is the number of leaves below the node of its
    first mutation divided by the sample size. Only that first mutation is
    copied to the new tree sequence.
    """
    kept_sites = msprime.SiteTable()
    kept_mutations = msprime.MutationTable()

    # Get the mutations > MAF.
    n_haps = tree_sequence.get_sample_size()
    log.log('Determining sites > MAF cutoff {m}'.format(m=args.maf))

    for tree in tree_sequence.trees():
        for site in tree.sites():
            first_mutation = site.mutations[0]
            f = tree.get_num_leaves(first_mutation.node) / n_haps
            if not (f > args.maf and f < 1 - args.maf):
                continue
            new_site_id = kept_sites.add_row(
                position=site.position,
                ancestral_state=site.ancestral_state)
            kept_mutations.add_row(
                site=new_site_id,
                node=first_mutation.node,
                derived_state=first_mutation.derived_state)
    tables = tree_sequence.dump_tables()
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=kept_sites,
        mutations=kept_mutations)
Beispiel #7
0
def strip_singletons(ts):
    """
    Returns a copy of the specified tree sequence with singletons removed.

    A site is kept only if the sum of its genotypes over all samples is
    greater than 1. For a kept site the first mutation is copied without a
    parent; every further mutation is copied with its parent set to that
    first mutation if it had any parent in the input, and to -1 otherwise.
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for variant in ts.variants():
        if np.sum(variant.genotypes) <= 1:
            continue
        site_id = sites.add_row(
            position=variant.site.position,
            ancestral_state=variant.site.ancestral_state)
        assert len(variant.site.mutations) >= 1
        mutation = variant.site.mutations[0]
        parent_id = mutations.add_row(site=site_id,
                                      node=mutation.node,
                                      derived_state=mutation.derived_state)
        for error in variant.site.mutations[1:]:
            # Any mutation that had a parent is re-attached to the first
            # mutation of the site in the output table.
            parent = parent_id if error.parent != -1 else -1
            mutations.add_row(site=site_id,
                              node=error.node,
                              derived_state=error.derived_state,
                              parent=parent)
    tables = ts.dump_tables()
    return msprime.load_tables(nodes=tables.nodes,
                               edges=tables.edges,
                               sites=sites,
                               mutations=mutations)
Beispiel #8
0
 def __init__(self, ts, sample, filter_zero_mutation_sites=True):
     """
     Stores the input tree sequence and parameters, creates the tables and
     bookkeeping structures held on the instance, records every requested
     sample as a node with ancestry over the whole sequence, and indexes
     the input mutations by the node they sit on.

     :param ts: The input tree sequence.
     :param sample: The input node IDs to treat as samples.
     :param filter_zero_mutation_sites: Stored on the instance; not used
         further in this method.
     """
     self.ts = ts
     self.n = len(sample)
     self.sequence_length = ts.sequence_length
     self.filter_zero_mutation_sites = filter_zero_mutation_sites
     self.num_mutations = ts.num_mutations
     self.input_sites = list(ts.sites())
     # Per-input-node slots, initially empty.
     self.A_head = [None for _ in range(ts.num_nodes)]
     self.A_tail = [None for _ in range(ts.num_nodes)]
     # Each table is created exactly once (the mutation table used to be
     # constructed twice, with the first instance thrown away).
     self.node_table = msprime.NodeTable(ts.num_nodes)
     self.edge_table = msprime.EdgeTable(ts.num_edges)
     self.site_table = msprime.SiteTable(ts.num_sites)
     self.mutation_table = msprime.MutationTable(ts.num_mutations)
     self.edge_buffer = {}
     # One entry per input node, initialised to -1.
     self.node_id_map = np.zeros(ts.num_nodes, dtype=np.int32) - 1
     self.mutation_node_map = [-1 for _ in range(self.num_mutations)]
     self.samples = set(sample)
     for sample_id in sample:
         output_id = self.record_node(sample_id, is_sample=True)
         self.add_ancestry(sample_id, 0, self.sequence_length, output_id)
     # We keep a map of input nodes to mutations: for every input node, the
     # list of (site position, mutation ID) pairs of the mutations on it.
     self.mutation_map = [[] for _ in range(ts.num_nodes)]
     # Fetch the input tables once rather than once per column.
     input_tables = ts.tables
     position = input_tables.sites.position
     site = input_tables.mutations.site
     node = input_tables.mutations.node
     for mutation_id in range(ts.num_mutations):
         site_position = position[site[mutation_id]]
         self.mutation_map[node[mutation_id]].append(
             (site_position, mutation_id))
Beispiel #9
0
def strip_singletons(ts, maf):
    """
    Returns a copy of the specified tree sequence keeping only the sites
    whose mutation is carried by more than one sample and whose frequency
    exceeds ``maf``.

    The frequency is the number of leaves below the mutation's node divided
    by the sample size. Every site must carry exactly one mutation.

    :param ts: The input tree sequence.
    :param maf: Frequency threshold; a site is kept only if its frequency
        is strictly greater than this value.
    :return: The filtered tree sequence.
    """
    sample_size = ts.get_sample_size()
    kept_sites = msprime.SiteTable()
    kept_mutations = msprime.MutationTable()
    for tree in ts.trees():
        for site in tree.sites():
            assert len(site.mutations) == 1  # Only supports infinite sites muts.
            mutation = site.mutations[0]
            frequency = tree.get_num_leaves(mutation.node) / sample_size
            keep = tree.num_samples(mutation.node) > 1 and frequency > maf
            if not keep:
                continue
            new_site_id = kept_sites.add_row(
                position=site.position,
                ancestral_state=site.ancestral_state)
            kept_mutations.add_row(
                site=new_site_id,
                node=mutation.node,
                derived_state=mutation.derived_state)
    tables = ts.dump_tables()
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=kept_sites,
        mutations=kept_mutations)
Beispiel #10
0
def _load_legacy_hdf5_v3(root, remove_duplicate_positions):
    """
    Builds a tree sequence from the groups of a legacy HDF5 file.

    Every record in ``root["trees"]["records"]`` is expanded into one edge
    per child: the record's left and right breakpoint indexes are resolved
    through ``root["trees"]["breakpoints"]`` and, together with the record's
    node as parent, repeated ``num_children`` times. Sites and mutations are
    converted by ``_convert_hdf5_mutations`` when a "mutations" group is
    present. Existing provenance records are copied with a placeholder
    timestamp, and an upgrade record is appended.

    :param root: The opened HDF5 root group.
    :param remove_duplicate_positions: Forwarded to
        ``_convert_hdf5_mutations``.
    :return: The tree sequence returned by ``msprime.load_tables`` after
        sorting the tables.
    """
    # get the trees group for the records and samples
    trees_group = root["trees"]
    nodes_group = trees_group["nodes"]
    time = np.array(nodes_group["time"])

    breakpoints = np.array(trees_group["breakpoints"])
    records_group = trees_group["records"]
    left_indexes = np.array(records_group["left"])
    right_indexes = np.array(records_group["right"])
    record_node = np.array(records_group["node"], dtype=np.int32)
    num_nodes = time.shape[0]
    # Nodes with an ID below the smallest record node are flagged as samples.
    sample_size = np.min(record_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE

    children_length = np.array(records_group["num_children"], dtype=np.uint32)
    # Expand each record into one row per child in a single vectorised step.
    # The repeat counts are widened to a signed type that np.repeat can
    # safely cast to its index type.
    repeats = children_length.astype(np.int64)
    left = np.repeat(breakpoints[left_indexes], repeats).astype(np.float64)
    right = np.repeat(breakpoints[right_indexes], repeats).astype(np.float64)
    parent = np.repeat(record_node, repeats)
    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags,
                      time=nodes_group["time"],
                      population=nodes_group["population"])
    edges = msprime.EdgeTable()
    edges.set_columns(left=left,
                      right=right,
                      parent=parent,
                      child=records_group["children"])
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        _convert_hdf5_mutations(root["mutations"], sites, mutations,
                                remove_duplicate_positions)
    # Legacy provenance records carry no timestamp, so a fixed placeholder
    # (the minimum datetime) is used for all of them.
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    if "provenance" in root:
        for record in root["provenance"]:
            provenances.add_row(timestamp=old_timestamp, record=record)
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(nodes=nodes,
                        edges=edges,
                        sites=sites,
                        mutations=mutations)
    return msprime.load_tables(nodes=nodes,
                               edges=edges,
                               sites=sites,
                               mutations=mutations,
                               provenances=provenances)
Beispiel #11
0
def make_tree_add_mutations(nodes, edges, mutrate):
    """
    Generates mutations at rate ``mutrate`` on the given node and edge
    tables, using a random generator seeded with 42, and loads the result.

    :return: A tuple ``(tree_sequence, site_table)``.
    """
    site_table = msprime.SiteTable()
    mutation_table = msprime.MutationTable()
    generator = msprime.MutationGenerator(
        msprime.RandomGenerator(42), mutrate)
    generator.generate(nodes, edges, site_table, mutation_table)
    tree_sequence = msprime.load_tables(
        nodes=nodes,
        edgesets=edges,
        sites=site_table,
        mutations=mutation_table)
    return (tree_sequence, site_table)
Beispiel #12
0
def general_mutation_example():
    """
    Returns a simulated tree sequence (10 samples, length 10, seed 2) whose
    sites and mutations are replaced by two hand-written sites at positions
    0 and 1, each with metadata and one mutation on node 0.
    """
    ts = msprime.simulate(10, recombination_rate=1, length=10, random_seed=2)
    nodes = msprime.NodeTable()
    edges = msprime.EdgeTable()
    ts.dump_tables(nodes=nodes, edges=edges)

    # (position, ancestral state, metadata) for each site.
    site_rows = [
        (0, "A", b"{}"),
        (1, "C", b"{'id':1}"),
    ]
    sites = msprime.SiteTable()
    for position, ancestral_state, metadata in site_rows:
        sites.add_row(
            position=position, ancestral_state=ancestral_state,
            metadata=metadata)

    # One mutation per site, in site order, all on node 0.
    mutations = msprime.MutationTable()
    for site_id, derived_state in enumerate(("T", "G")):
        mutations.add_row(site=site_id, node=0, derived_state=derived_state)

    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
Beispiel #13
0
 def test_sites(self):
     """
     Checks that pickled metadata attached to a site is returned unchanged
     by the loaded tree sequence and unpickles to an equivalent object.
     """
     original = ExampleMetadata(one="node1", two="node2")
     payload = pickle.dumps(original)
     site_table = msprime.SiteTable()
     site_table.add_row(position=0.1, ancestral_state="A", metadata=payload)
     ts = msprime.load_tables(
         nodes=msprime.NodeTable(),
         edges=msprime.EdgeTable(),
         sites=site_table,
         mutations=msprime.MutationTable(),
         sequence_length=1)
     loaded_site = ts.site(0)
     self.assertEqual(loaded_site.position, 0.1)
     self.assertEqual(loaded_site.ancestral_state, "A")
     self.assertEqual(loaded_site.metadata, payload)
     restored = pickle.loads(loaded_site.metadata)
     self.assertEqual(restored.one, original.one)
     self.assertEqual(restored.two, original.two)
Beispiel #14
0
def permute_nodes(ts, node_map):
    """
    Returns a copy of the specified tree sequence in which node ``j`` of the
    input becomes node ``node_map[j]``. Edges and mutations are rewritten to
    use the new node IDs, and the tables are sorted before loading.
    """
    # Inverse of node_map: for each new node ID, the original node ID.
    reverse_map = [0] * len(node_map)
    for old_id in range(ts.num_nodes):
        reverse_map[node_map[old_id]] = old_id

    # Emit the nodes in their new order.
    old_nodes = list(ts.nodes())
    new_nodes = msprime.NodeTable()
    for new_id in range(ts.num_nodes):
        source = old_nodes[reverse_map[new_id]]
        new_nodes.add_row(
            flags=source.flags,
            metadata=source.metadata,
            population=source.population,
            time=source.time)

    new_edges = msprime.EdgeTable()
    for edge in ts.edges():
        new_edges.add_row(
            left=edge.left,
            right=edge.right,
            parent=node_map[edge.parent],
            child=node_map[edge.child])

    new_sites = msprime.SiteTable()
    new_mutations = msprime.MutationTable()
    for site in ts.sites():
        new_sites.add_row(
            position=site.position,
            ancestral_state=site.ancestral_state)
        for mutation in site.mutations:
            new_mutations.add_row(
                site=site.id,
                derived_state=mutation.derived_state,
                node=node_map[mutation.node])

    msprime.sort_tables(
        nodes=new_nodes,
        edges=new_edges,
        sites=new_sites,
        mutations=new_mutations)
    provenances = ts.dump_tables().provenances
    add_provenance(provenances, "permute_nodes")
    return msprime.load_tables(
        nodes=new_nodes,
        edges=new_edges,
        sites=new_sites,
        mutations=new_mutations,
        provenances=provenances)
Beispiel #15
0
 def get_mutations_over_roots_tree(self):
     """
     Returns the first tree of a simulated, decapitated tree sequence that
     carries one site and one mutation per node, so that at least one
     mutation sits on a node without a parent.
     """
     base_ts = tsutil.decapitate(msprime.simulate(15, random_seed=1), 20)
     tables = base_ts.tables
     sites = msprime.SiteTable()
     mutations = msprime.MutationTable()
     # Consecutive sites are spaced evenly, starting at position 0.
     spacing = 1.0 / (base_ts.num_nodes + 1)
     position = 0
     for node in range(base_ts.num_nodes):
         site_id = sites.add_row(position, ancestral_state="0")
         position += spacing
         mutations.add_row(site_id, node=node, derived_state="1")
     mutated_ts = msprime.load_tables(
         nodes=tables.nodes, edges=tables.edges,
         sites=sites, mutations=mutations)
     tree = mutated_ts.first()
     assert any(
         tree.parent(mut.node) == msprime.NULL_NODE
         for mut in tree.mutations())
     return tree
def ts_private_mutations_only(ts):
    """
    Returns a new tree sequence whose sites and mutations are replaced by
    one site per sample, each carrying a single '0' -> '1' mutation on that
    sample's node. Site positions are drawn with ``np.random.random`` and
    assigned to the samples in increasing order.
    """
    table_dict = ts.dump_tables().asdict()

    mutation_table = msprime.MutationTable()
    site_table = msprime.SiteTable()
    positions = sorted([np.random.random() for _ in range(ts.num_samples)])
    for site_id, sample_node in enumerate(ts.samples()):
        site_table.add_row(positions[site_id], '0')
        mutation_table.add_row(site_id, sample_node, '1')

    table_dict['sites'] = site_table.asdict()
    table_dict['mutations'] = mutation_table.asdict()

    collection = msprime.tskit.tables.TableCollection.fromdict(table_dict)
    return collection.tree_sequence()
Beispiel #17
0
 def test_mutations(self):
     """
     Checks that pickled metadata attached to a mutation is returned
     unchanged by the loaded tree sequence and unpickles to an equivalent
     object.
     """
     original = ExampleMetadata(one="node1", two="node2")
     payload = pickle.dumps(original)
     node_table = msprime.NodeTable()
     site_table = msprime.SiteTable()
     mutation_table = msprime.MutationTable()
     node_table.add_row(time=0)
     site_table.add_row(position=0.1, ancestral_state="A")
     mutation_table.add_row(
         site=0, node=0, derived_state="T", metadata=payload)
     ts = msprime.load_tables(
         nodes=node_table,
         edges=msprime.EdgeTable(),
         sites=site_table,
         mutations=mutation_table,
         sequence_length=1)
     loaded_mutation = ts.site(0).mutations[0]
     self.assertEqual(loaded_mutation.site, 0)
     self.assertEqual(loaded_mutation.node, 0)
     self.assertEqual(loaded_mutation.derived_state, "T")
     self.assertEqual(loaded_mutation.metadata, payload)
     restored = pickle.loads(loaded_mutation.metadata)
     self.assertEqual(restored.one, original.one)
     self.assertEqual(restored.two, original.two)
Beispiel #18
0
def insert_branch_sites(ts):
    """
    Returns a copy of the specified tree sequence with a site on every branch
    of every tree.

    Within a tree the sites start at the left end of its interval and are
    spaced by the interval length divided by the number of nodes in the
    tree. Each site has ancestral state '0' and one mutation to '1' on the
    node below the branch.
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        left, right = tree.interval
        tree_nodes = list(tree.nodes())
        spacing = (right - left) / len(tree_nodes)
        position = left
        for node in tree_nodes:
            # Roots have no branch above them, so they get no site.
            if tree.parent(node) == msprime.NULL_NODE:
                continue
            site_id = sites.add_row(position=position, ancestral_state='0')
            mutations.add_row(site=site_id, node=node, derived_state='1')
            position += spacing
    tables = ts.tables
    add_provenance(tables.provenances, "insert_branch_sites")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
def set_mutations_in_tree(tree_sequence, p_causal):
    """
    Returns a copy of ``tree_sequence`` that keeps each site independently
    with probability ``p_causal``, together with the number of mutations in
    the result.

    For a kept site only its first mutation is copied.

    :return: A tuple ``(new_tree_sequence, m_causal)``.
    """
    kept_sites = msprime.SiteTable()
    kept_mutations = msprime.MutationTable()

    # Get the causal mutations: one random draw per input site.
    for site in tree_sequence.sites():
        if not np.random.random_sample() < p_causal:
            continue
        new_site_id = kept_sites.add_row(
            position=site.position, ancestral_state=site.ancestral_state)
        first_mutation = site.mutations[0]
        kept_mutations.add_row(
            site=new_site_id,
            node=first_mutation.node,
            derived_state=first_mutation.derived_state)

    tables = tree_sequence.dump_tables()
    new_tree_sequence = msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=kept_sites,
        mutations=kept_mutations)
    return new_tree_sequence, new_tree_sequence.get_num_mutations()
def write_vcf(chrom):
    """
    Loads the tree file configured for ``chrom``, generates mutations on it
    and writes the result as a VCF file.

    The tree file, VCF path and mutation rate are read from the
    module-level ``args``, the random seed from ``seeds``, and progress
    messages are written to ``logfile``.

    :param chrom: Key used to index the per-chromosome settings.
    :return: True once the VCF has been written.
    """
    treefile = args.tree_file[chrom]
    # The context manager guarantees that the output file is flushed and
    # closed, even if one of the steps below raises.
    with open(args.vcffile[chrom], "w") as vcf:
        mut_rate = args.mut_rate[chrom]
        seed = seeds[chrom]
        logfile.write("Simulating mutations on" + treefile + "\n")
        ts = msprime.load(treefile)
        rng = msprime.RandomGenerator(seed)
        nodes = msprime.NodeTable()
        edgesets = msprime.EdgesetTable()
        sites = msprime.SiteTable()
        mutations = msprime.MutationTable()
        migrations = msprime.MigrationTable()
        ts.dump_tables(nodes=nodes, edgesets=edgesets, migrations=migrations)
        mutgen = msprime.MutationGenerator(rng, mut_rate)
        mutgen.generate(nodes, edgesets, sites, mutations)
        logfile.write("Saving to" + args.vcffile[chrom] + "\n")
        mutated_ts = msprime.load_tables(nodes=nodes,
                                         edgesets=edgesets,
                                         sites=sites,
                                         mutations=mutations)
        mutated_ts.write_vcf(vcf, ploidy=1)

    return True
Beispiel #21
0
def _load_legacy_hdf5_v10(root, remove_duplicate_positions=False):
    """
    Builds a tree sequence from the groups of a legacy HDF5 file by copying
    the stored columns of each group into the corresponding msprime table.

    Migrations, sites, mutations and provenances are only filled when their
    group contains data; metadata columns are optional. An upgrade
    provenance record is appended before loading.

    :param root: The opened HDF5 root group.
    :param remove_duplicate_positions: Ignored; this format cannot contain
        duplicate positions.
    :return: The tree sequence returned by ``msprime.load_tables``.
    """

    def optional_metadata(group):
        # Returns the (metadata, metadata_offset) columns of a group, or
        # (None, None) when the group stores no metadata.
        if "metadata" in group:
            return group["metadata"], group["metadata_offset"]
        return None, None

    # We cannot have duplicate positions in v10, so this parameter is ignored
    nodes_group = root["nodes"]
    nodes = msprime.NodeTable()
    node_metadata, node_metadata_offset = optional_metadata(nodes_group)
    nodes.set_columns(
        flags=nodes_group["flags"],
        population=nodes_group["population"],
        time=nodes_group["time"],
        metadata=node_metadata,
        metadata_offset=node_metadata_offset)

    edges_group = root["edges"]
    edges = msprime.EdgeTable()
    edges.set_columns(
        left=edges_group["left"],
        right=edges_group["right"],
        parent=edges_group["parent"],
        child=edges_group["child"])

    migrations_group = root["migrations"]
    migrations = msprime.MigrationTable()
    if "left" in migrations_group:
        migrations.set_columns(
            left=migrations_group["left"],
            right=migrations_group["right"],
            node=migrations_group["node"],
            source=migrations_group["source"],
            dest=migrations_group["dest"],
            time=migrations_group["time"])

    sites_group = root["sites"]
    sites = msprime.SiteTable()
    if "position" in sites_group:
        site_metadata, site_metadata_offset = optional_metadata(sites_group)
        sites.set_columns(
            position=sites_group["position"],
            ancestral_state=sites_group["ancestral_state"],
            ancestral_state_offset=sites_group["ancestral_state_offset"],
            metadata=site_metadata,
            metadata_offset=site_metadata_offset)

    mutations_group = root["mutations"]
    mutations = msprime.MutationTable()
    if "site" in mutations_group:
        mutation_metadata, mutation_metadata_offset = optional_metadata(
            mutations_group)
        mutations.set_columns(
            site=mutations_group["site"],
            node=mutations_group["node"],
            parent=mutations_group["parent"],
            derived_state=mutations_group["derived_state"],
            derived_state_offset=mutations_group["derived_state_offset"],
            metadata=mutation_metadata,
            metadata_offset=mutation_metadata_offset)

    provenances_group = root["provenances"]
    provenances = msprime.ProvenanceTable()
    if "timestamp" in provenances_group:
        timestamp = provenances_group["timestamp"]
        timestamp_offset = provenances_group["timestamp_offset"]
        if "record" not in provenances_group:
            # No records stored: substitute placeholder columns shaped like
            # the timestamp columns.
            record = np.empty_like(timestamp)
            record_offset = np.zeros_like(timestamp_offset)
        else:
            record = provenances_group["record"]
            record_offset = provenances_group["record_offset"]
        provenances.set_columns(
            timestamp=timestamp,
            timestamp_offset=timestamp_offset,
            record=record,
            record_offset=record_offset)
    provenances.add_row(_get_upgrade_provenance(root))

    return msprime.load_tables(
        nodes=nodes,
        edges=edges,
        migrations=migrations,
        sites=sites,
        mutations=mutations,
        provenances=provenances)
Beispiel #22
0
def _load_legacy_hdf5_v2(root, remove_duplicate_positions):
    """
    Builds a tree sequence from the groups of a version 2 legacy HDF5 file.

    :param root: The opened HDF5 root group. It must contain a ``trees``
        group and may contain ``samples`` and ``mutations`` groups.
    :param remove_duplicate_positions: Passed straight through to
        ``_convert_hdf5_mutations``.
    :return: The value returned by ``msprime.load_tables`` for the
        converted (and sorted) tables.
    """
    trees_group = root["trees"]
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    provenances.add_row(
        timestamp=old_timestamp,
        record=_get_v2_provenance("generate_trees", trees_group.attrs))

    # Every coalescence record is expanded into two consecutive edges that
    # share the record's parent, left and right values; the flattened
    # ``children`` dataset supplies the child of each edge.
    cr_node = np.array(trees_group["node"], dtype=np.int32)
    child = np.array(trees_group["children"], dtype=np.int32).flatten()
    edges = msprime.EdgeTable()
    edges.set_columns(
        left=np.repeat(np.array(trees_group["left"], dtype=np.float64), 2),
        right=np.repeat(np.array(trees_group["right"], dtype=np.float64), 2),
        parent=np.repeat(cr_node, 2),
        child=child)

    # Node IDs run from 0 up to the largest ID mentioned by any record.
    # Everything below the smallest record node is treated as a sample.
    num_nodes = max(np.max(child), np.max(cr_node)) + 1
    sample_size = np.min(cr_node)

    node_flags = np.zeros(num_nodes, dtype=np.uint32)
    node_flags[:sample_size] = msprime.NODE_IS_SAMPLE

    node_population = np.zeros(num_nodes, dtype=np.int32)
    node_population[cr_node] = np.array(
        trees_group["population"], dtype=np.int32)

    node_time = np.zeros(num_nodes, dtype=np.float64)
    node_time[cr_node] = np.array(trees_group["time"])

    # Sample populations (and times, when stored) live in a separate group.
    if "samples" in root:
        samples_group = root["samples"]
        node_population[:sample_size] = samples_group["population"]
        if "time" in samples_group:
            node_time[:sample_size] = samples_group["time"]

    nodes = msprime.NodeTable()
    nodes.set_columns(
        flags=node_flags, population=node_population, time=node_time)

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        mutations_group = root["mutations"]
        _convert_hdf5_mutations(
            mutations_group, sites, mutations, remove_duplicate_positions)
        provenances.add_row(
            timestamp=old_timestamp,
            record=_get_v2_provenance(
                "generate_mutations", mutations_group.attrs))
    provenances.add_row(_get_upgrade_provenance(root))

    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=provenances)
# Log that simplification has finished, followed by a timestamp line
# (locale time, locale date and timezone name) and a separator.
logfile.write("Simplified; now writing to treefile (if specified).\n")
logfile.write(time.strftime('%X %x %Z') + "\n")
logfile.write("----------\n")
logfile.flush()

# The simplified tree sequence is only written out when a path was supplied.
if args.treefile is not None:
    minimal_ts.dump(args.treefile)

# The mutation seed is taken directly from the command-line seed.
mut_seed = args.seed
logfile.write("Generating mutations with seed " + str(mut_seed) + "\n")
logfile.flush()

# Copy the nodes and edgesets of the simplified tree sequence into fresh
# tables, then run the mutation generator over them at rate args.mut_rate.
# NOTE(review): generate() presumably fills `sites` and `mutations` in
# place, since its return value is ignored -- confirm against the msprime
# version in use.
rng = msprime.RandomGenerator(mut_seed)
nodes = msprime.NodeTable()
edgesets = msprime.EdgesetTable()
sites = msprime.SiteTable()
mutations = msprime.MutationTable()
minimal_ts.dump_tables(nodes=nodes, edgesets=edgesets)
mutgen = msprime.MutationGenerator(rng, args.mut_rate)
mutgen.generate(nodes, edgesets, sites, mutations)

# Debugging aid (disabled): dump every table to the log file.
# print(nodes, file=logfile)
# print(edgesets, file=logfile)
# print(sites, file=logfile)
# print(mutations, file=logfile)

# Reassemble a tree sequence from the original topology plus the newly
# generated sites and mutations.
mutated_ts = msprime.load_tables(nodes=nodes,
                                 edgesets=edgesets,
                                 sites=sites,
                                 mutations=mutations)
Beispiel #24
0
    def get_tree_sequence(self, rescale_positions=True, all_sites=False):
        """
        Returns the current state of the build tree sequence. All samples and
        ancestors will have the sample node flag set.

        :param bool rescale_positions: If True, edge coordinates produced by
            the tree sequence builder are treated as site indexes and mapped
            onto the positions stored in ``self.sample_data``. Otherwise
            sites are placed at 0, 1, ..., num_sites - 1.
        :param bool all_sites: If True, the singleton sites (each with one
            mutation over the corresponding sample node) and the invariant
            sites recorded in ``self.sample_data`` are appended as well.
        :return: The value returned by ``msprime.load_tables`` for the
            sorted tables.
        """
        # TODO Change the API here to ask whether we want a final tree sequence
        # or not. In the latter case we also need to translate the ancestral
        # and derived states to the input values.
        tsb = self.tree_sequence_builder
        flags, time = tsb.dump_nodes()
        nodes = msprime.NodeTable()
        nodes.set_columns(flags=flags, time=time)

        left, right, parent, child = tsb.dump_edges()
        if rescale_positions:
            position = self.sample_data.position[:]
            sequence_length = self.sample_data.sequence_length
            if sequence_length is None or sequence_length < position[-1]:
                sequence_length = position[-1] + 1
            # Subset down to the variants.
            position = position[self.sample_data.variant_site[:]]
            # x maps a site index to a coordinate: index 0 is pinned to 0 and
            # the extra final entry maps to the end of the sequence.
            x = np.hstack([position, [sequence_length]])
            x[0] = 0
            left = x[left]
            right = x[right]
        else:
            position = np.arange(tsb.num_sites)
            sequence_length = max(1, tsb.num_sites)

        edges = msprime.EdgeTable()
        edges.set_columns(left=left, right=right, parent=parent, child=child)

        # Every state is the single byte '0' or '1', hence the offsets are
        # simply 0, 1, ..., n.
        sites = msprime.SiteTable()
        sites.set_columns(
            position=position,
            ancestral_state=np.zeros(tsb.num_sites, dtype=np.int8) + ord('0'),
            ancestral_state_offset=np.arange(tsb.num_sites + 1,
                                             dtype=np.uint32))
        mutations = msprime.MutationTable()
        # The builder hands back freshly dumped columns, so nothing needs to
        # be preallocated here. Note this rebinds ``parent`` from the edge
        # parents to the mutation parents.
        site, node, derived_state, parent = tsb.dump_mutations()
        # Convert the numeric derived states to their ASCII digit.
        derived_state += ord('0')
        mutations.set_columns(site=site,
                              node=node,
                              derived_state=derived_state,
                              derived_state_offset=np.arange(
                                  tsb.num_mutations + 1, dtype=np.uint32),
                              parent=parent)
        if all_sites:
            # Append the sites and mutations for each singleton.
            num_singletons = self.sample_data.num_singleton_sites
            singleton_site = self.sample_data.singleton_site[:]
            singleton_sample = self.sample_data.singleton_sample[:]
            pos = self.sample_data.position[:]
            # IDs the singleton sites will receive once appended below.
            new_sites = np.arange(len(sites),
                                  len(sites) + num_singletons,
                                  dtype=np.int32)
            sites.append_columns(
                position=pos[singleton_site],
                ancestral_state=np.zeros(num_singletons, dtype=np.int8) +
                ord('0'),
                ancestral_state_offset=np.arange(num_singletons + 1,
                                                 dtype=np.uint32))
            mutations.append_columns(
                site=new_sites,
                node=self.sample_ids[singleton_sample],
                derived_state=np.zeros(num_singletons, dtype=np.int8) +
                ord('1'),
                derived_state_offset=np.arange(num_singletons + 1,
                                               dtype=np.uint32))
            # Get the invariant sites
            num_invariants = self.sample_data.num_invariant_sites
            invariant_site = self.sample_data.invariant_site[:]
            sites.append_columns(
                position=pos[invariant_site],
                ancestral_state=np.zeros(num_invariants, dtype=np.int8) +
                ord('0'),
                ancestral_state_offset=np.arange(num_invariants + 1,
                                                 dtype=np.uint32))

        # Appended sites are not in position order, so sort before loading.
        msprime.sort_tables(nodes, edges, sites=sites, mutations=mutations)
        return msprime.load_tables(nodes=nodes,
                                   edges=edges,
                                   sites=sites,
                                   mutations=mutations,
                                   sequence_length=sequence_length)
    # Construct and populate msprime's tables
    # Every node receives the flag value 1.
    # NOTE(review): presumably this is msprime.NODE_IS_SAMPLE, i.e. all
    # nodes are marked as samples -- confirm.
    flags = np.empty([len(nodes)], dtype=np.uint32)
    flags.fill(1)
    nt = msprime.NodeTable()
    nt.set_columns(flags=flags,
                   population=nodes['population'],
                   time=nodes['generation'])

    # Edge columns are copied across field by field.
    es = msprime.EdgeTable()
    es.set_columns(left=edges['left'],
                   right=edges['right'],
                   parent=edges['parent'],
                   child=edges['child'])

    # One site per entry of mutas['position']; each ancestral state is a
    # single byte with value 0 (the raw integer, not the character '0').
    st = msprime.SiteTable()
    st.set_columns(position=mutas['position'],
                   ancestral_state=np.zeros(len(mutas['position']), np.int8),
                   ancestral_state_length=np.ones(len(mutas['position']),
                                                  np.uint32))

    # Mutation i is attached to site i, with a one-byte derived state of
    # value 1.
    # NOTE(review): assumes mutas['node_id'] and mutas['position'] have the
    # same length, so that every site index is valid -- confirm.
    mt = msprime.MutationTable()
    mt.set_columns(site=np.arange(len(mutas['node_id']), dtype=np.int32),
                   node=mutas['node_id'],
                   derived_state=np.ones(len(mutas['node_id']), np.int8),
                   derived_state_length=np.ones(len(mutas['node_id']),
                                                np.uint32))

    # Sort
    msprime.sort_tables(nodes=nt, edges=es, sites=st, mutations=mt)
    # The count reported here is the number of rows in the site table.
    print("num total mutations: ", st.num_rows)
Beispiel #26
0
    def run(self, ngens):
        """
        Runs the forward simulation for ``ngens`` generations and returns the
        recorded genealogy as a table collection.

        In each generation every individual is replaced when
        ``random.random() > self.survival``. A replacement picks two parents
        uniformly from the current population and a breakpoint from
        ``self.random_breakpoint()``; the offspring inherits ``[0, bp)`` from
        the first parent and ``[bp, 1.0)`` from the second. Time is counted
        backwards: the initial nodes are at time ``ngens`` and the last
        generation is at time 0.

        :param int ngens: The number of generations to simulate.
        :return: An ``msprime.TableCollection`` in which the nodes of the
            final population carry ``msprime.NODE_IS_SAMPLE``.
        """
        nodes = msprime.NodeTable()
        edges = msprime.EdgeTable()
        migrations = msprime.MigrationTable()
        sites = msprime.SiteTable()
        mutations = msprime.MutationTable()
        provenances = msprime.ProvenanceTable()
        if self.deep_history:
            # initial population
            init_ts = msprime.simulate(self.N, recombination_rate=1.0)
            init_ts.dump_tables(nodes=nodes, edges=edges)
            # Push the simulated history back so it precedes generation ngens.
            nodes.set_columns(time=nodes.time + ngens, flags=nodes.flags)
        else:
            for _ in range(self.N):
                nodes.add_row(time=ngens)

        pop = list(range(self.N))
        for t in range(ngens - 1, -1, -1):
            if self.debug:
                print("t:", t)
                print("pop:", pop)

            dead = [random.random() > self.survival for _ in pop]
            num_dead = sum(dead)
            # sample these first so that all parents are from the previous gen
            new_parents = [(random.choice(pop), random.choice(pop))
                           for _ in range(num_dead)]
            if self.debug:
                print("Replacing", num_dead, "individuals.")
            parents = iter(new_parents)
            for j, is_dead in enumerate(dead):
                if not is_dead:
                    continue
                # this is: offspring ID, lparent, rparent, breakpoint
                offspring = nodes.num_rows
                nodes.add_row(time=t)
                lparent, rparent = next(parents)
                bp = self.random_breakpoint()
                if self.debug:
                    print("--->", offspring, lparent, rparent, bp)
                pop[j] = offspring
                # Zero-length edges are skipped when the breakpoint falls on
                # either end of the unit interval.
                if bp > 0.0:
                    edges.add_row(left=0.0,
                                  right=bp,
                                  parent=lparent,
                                  child=offspring)
                if bp < 1.0:
                    edges.add_row(left=bp,
                                  right=1.0,
                                  parent=rparent,
                                  child=offspring)

        if self.debug:
            print("Done! Final pop:")
            print(pop)
        # Build the membership set once; testing against the list directly
        # would cost O(N) for every node.
        survivors = set(pop)
        flags = [(msprime.NODE_IS_SAMPLE if u in survivors else 0)
                 for u in range(nodes.num_rows)]
        nodes.set_columns(time=nodes.time, flags=flags)
        if self.debug:
            print("Done.")
            print("Nodes:")
            print(nodes)
            print("Edges:")
            print(edges)
        return msprime.TableCollection(nodes, edges, migrations, sites,
                                       mutations, provenances)