Example #1
0
 def finalise(self, simplify=True, stabilise_node_ordering=False):
     """
     Return the tree sequence obtained from
     ``get_tree_sequence(all_sites=True)``, optionally simplified with
     respect to ``self.sample_ids``.

     :param bool simplify: If True, simplify the tree sequence (with
         ``filter_zero_mutation_sites=False``) before returning it.
     :param bool stabilise_node_ordering: Only consulted when ``simplify``
         is True. If True, the times of nodes that share an integer time
         are perturbed to be distinct before simplifying.
     :return: The (possibly simplified) tree sequence.
     """
     logger.info("Finalising tree sequence")
     ts = self.get_tree_sequence(all_sites=True)
     if not simplify:
         return ts

     logger.info("Running simplify on {} nodes and {} edges".format(
         ts.num_nodes, ts.num_edges))
     if stabilise_node_ordering:
         # Give every node a distinct time so that node IDs are stable after
         # simplifying (reversing the IDs within a time slice might achieve
         # the same). Used when comparing tree sequences produced by perfect
         # inference.
         tables = ts.tables
         node_time = tables.nodes.time
         for t in range(1, int(node_time[0])):
             tied = np.flatnonzero(node_time == t)
             count = tied.size
             # Spread the tied nodes over [t, t + 1) in decreasing order.
             node_time[tied] += np.arange(count)[::-1] / count
         tables.nodes.set_columns(flags=tables.nodes.flags, time=node_time)
         msprime.sort_tables(**tables.asdict())
         ts = msprime.load_tables(**tables.asdict())
     ts = ts.simplify(
         samples=self.sample_ids, filter_zero_mutation_sites=False)
     logger.info(
         "Finished simplify; now have {} nodes and {} edges".format(
             ts.num_nodes, ts.num_edges))
     return ts
Example #2
0
def wright_fisher(N, delta, L, T):
    """
    Direct implementation of Algorithm W.

    Starting from ``N`` individuals born at time ``T``, each generation
    replaces every individual with probability ``delta`` by a new node whose
    genome ``[0, L)`` is inherited from two randomly chosen members of the
    previous generation, split at a uniformly drawn breakpoint.

    :return: The tree sequence loaded from the sorted node and edge tables;
        the individuals alive at the end are flagged as samples.
    """
    edges = msprime.EdgeTable()
    # tau[u] is the birth time of node u.
    tau = [T] * N
    P = list(range(N))
    t = T
    n = N
    while t > 0:
        t -= 1
        next_generation = list(P)
        for j in range(N):
            if random.random() < delta:
                next_generation[j] = n
                tau.append(t)
                a = random.randint(0, N - 1)
                b = random.randint(0, N - 1)
                x = random.uniform(0, L)
                edges.add_row(0, x, P[a], n)
                edges.add_row(x, L, P[b], n)
                n += 1
        P = next_generation
    nodes = msprime.NodeTable()
    alive = set(P)
    for u, birth_time in enumerate(tau):
        nodes.add_row(time=birth_time, flags=int(u in alive))
    msprime.sort_tables(nodes=nodes, edges=edges)
    return msprime.load_tables(nodes=nodes, edges=edges)
Example #3
0
 def test_one_generation_no_deep_history(self):
     """
     One generation without deep history gives 2N nodes; after simplifying,
     the roots of each tree partition the set of samples.
     """
     N = 20
     tables = wf_sim(
         N=N, ngens=1, deep_history=False, seed=self.random_seed)
     self.assertEqual(tables.nodes.num_rows, 2 * N)
     self.assertGreater(tables.edges.num_rows, 0)
     self.assertEqual(tables.sites.num_rows, 0)
     self.assertEqual(tables.mutations.num_rows, 0)
     self.assertEqual(tables.migrations.num_rows, 0)

     nodes, edges = tables.nodes, tables.edges
     is_sample = nodes.flags == msprime.NODE_IS_SAMPLE
     samples = np.flatnonzero(is_sample).astype(np.int32)
     msprime.sort_tables(nodes=nodes, edges=edges)
     msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
     self.assertGreater(tables.nodes.num_rows, 0)
     self.assertGreater(tables.edges.num_rows, 0)

     ts = msprime.load_tables(nodes=nodes, edges=edges)
     for tree in ts.trees():
         seen = set()
         for root in tree.roots:
             below_root = set(tree.samples(root))
             # No sample may sit below more than one root.
             self.assertEqual(len(below_root & seen), 0)
             seen |= below_root
         self.assertEqual(seen, set(ts.samples()))
Example #4
0
def single_childify(ts):
    """
    Builds a new equivalent tree sequence which contains an extra node in the
    middle of all existing branches.

    Every edge ``parent -> child`` of ``ts`` is replaced by two edges
    ``parent -> u`` and ``u -> child`` over the same interval, where ``u`` is
    a newly added node whose time is halfway between the child and parent
    times. Sites and mutations are carried over from ``ts``.

    :param ts: The tree sequence to transform.
    :return: The new tree sequence, with a provenance record added for this
        operation.
    """
    tables = ts.dump_tables()
    edges = tables.edges
    nodes = tables.nodes
    sites = tables.sites
    mutations = tables.mutations

    # Times of the original nodes; only original node IDs are looked up here.
    time = nodes.time[:]
    edges.reset()
    for edge in ts.edges():
        # Insert a new node in between the parent and child.
        u = len(nodes)
        t = time[edge.child] + (time[edge.parent] - time[edge.child]) / 2
        nodes.add_row(time=t)
        edges.add_row(
            left=edge.left, right=edge.right, parent=u, child=edge.child)
        edges.add_row(
            left=edge.left, right=edge.right, parent=edge.parent, child=u)
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    # Record this function's own name (previously a copy-pasted label from a
    # different utility was written here).
    add_provenance(tables.provenances, "single_childify")
    new_ts = msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
    return new_ts
Example #5
0
 def test_with_recurrent_mutations(self):
     """
     Haplotypes keep one entry per site before and after simplifying a
     simulation carrying Jukes-Cantor mutations.
     """
     # actually with only ONE site, at 0.0
     N = 10
     ngens = 100
     tables = wf_sim(
         N=N, ngens=ngens, deep_history=False, seed=self.random_seed)
     msprime.sort_tables(**tables.asdict())
     ts = msprime.load_tables(**tables.asdict())
     ts = tsutil.jukes_cantor(ts, 1, 10, seed=self.random_seed)
     tables = ts.tables
     self.assertEqual(tables.sites.num_rows, 1)
     self.assertGreater(tables.mutations.num_rows, 0)
     is_sample = tables.nodes.flags == msprime.NODE_IS_SAMPLE
     samples = np.flatnonzero(is_sample).astype(np.int32)

     # Before simplify: one character per haplotype.
     for hap in ts.haplotypes():
         self.assertEqual(len(hap), 1)

     # After simplify: the single site and its mutations must survive.
     msprime.simplify_tables(
         samples=samples,
         nodes=tables.nodes,
         edges=tables.edges,
         sites=tables.sites,
         mutations=tables.mutations)
     self.assertGreater(tables.nodes.num_rows, 0)
     self.assertGreater(tables.edges.num_rows, 0)
     self.assertEqual(tables.sites.num_rows, 1)
     self.assertGreater(tables.mutations.num_rows, 0)
     ts = msprime.load_tables(**tables.asdict())
     self.assertEqual(ts.sample_size, N)
     for hap in ts.haplotypes():
         self.assertEqual(len(hap), ts.num_sites)
Example #6
0
def _load_legacy_hdf5_v3(root, remove_duplicate_positions):
    """
    Convert the contents of a legacy (version 3) HDF5 file into a tree
    sequence.

    :param root: The opened HDF5 root group; the "trees" group is required,
        "mutations" and "provenance" are read only if present.
    :param remove_duplicate_positions: Passed through unchanged to
        ``_convert_hdf5_mutations``.
    :return: The tree sequence loaded from the converted, sorted tables.
    """
    # get the trees group for the records and samples
    trees_group = root["trees"]
    nodes_group = trees_group["nodes"]
    time = np.array(nodes_group["time"])

    breakpoints = np.array(trees_group["breakpoints"])
    records_group = trees_group["records"]
    left_indexes = np.array(records_group["left"])
    right_indexes = np.array(records_group["right"])
    record_node = np.array(records_group["node"], dtype=np.int32)
    num_nodes = time.shape[0]
    # Nodes with IDs below the smallest record (parent) node are flagged as
    # samples.
    sample_size = np.min(record_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE

    # Each record holds one (left, right, parent) triple shared by all of its
    # children; expand it to one edge row per child. np.repeat does in one
    # vectorised step what was previously a per-child Python loop.
    children_length = np.array(records_group["num_children"], dtype=np.uint32)
    repeats = children_length.astype(np.intp)
    left = np.repeat(breakpoints[left_indexes], repeats).astype(np.float64)
    right = np.repeat(breakpoints[right_indexes], repeats).astype(np.float64)
    parent = np.repeat(record_node, repeats)

    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags,
                      time=nodes_group["time"],
                      population=nodes_group["population"])
    edges = msprime.EdgeTable()
    edges.set_columns(left=left,
                      right=right,
                      parent=parent,
                      child=records_group["children"])
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        _convert_hdf5_mutations(root["mutations"], sites, mutations,
                                remove_duplicate_positions)
    # Legacy provenance records carry no timestamp, so the minimum
    # representable datetime is used as a placeholder.
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    if "provenance" in root:
        for record in root["provenance"]:
            provenances.add_row(timestamp=old_timestamp, record=record)
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(nodes=nodes,
                        edges=edges,
                        sites=sites,
                        mutations=mutations)
    return msprime.load_tables(nodes=nodes,
                               edges=edges,
                               sites=sites,
                               mutations=mutations,
                               provenances=provenances)
Example #7
0
 def test_overlapping_generations(self):
     """
     With survival 0.85 the simplified simulation has exactly one root in
     every tree.
     """
     tables = wf_sim(N=30, ngens=10, survival=0.85, seed=self.random_seed)
     self.assertGreater(tables.nodes.num_rows, 0)
     self.assertGreater(tables.edges.num_rows, 0)
     self.assertEqual(tables.sites.num_rows, 0)
     self.assertEqual(tables.mutations.num_rows, 0)
     self.assertEqual(tables.migrations.num_rows, 0)

     nodes, edges = tables.nodes, tables.edges
     msprime.sort_tables(nodes=nodes, edges=edges)
     is_sample = nodes.flags == msprime.NODE_IS_SAMPLE
     samples = np.flatnonzero(is_sample).astype(np.int32)
     msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
     simplified = msprime.load_tables(nodes=nodes, edges=edges)
     for tree in simplified.trees():
         self.assertEqual(tree.num_roots, 1)
Example #8
0
    def test_ts_with_root_mutations(self):
        """
        Round-trip sample data for a tree sequence that has a mutation placed
        over the root at the left end of every tree lacking a site there.
        """
        ts = self.get_example_ts(5, 3)
        t = ts.dump_tables()
        positions = {site.position for site in ts.sites()}
        for tree in ts.trees():
            left_end = tree.interval[0]
            if left_end in positions:
                continue
            new_site = t.sites.add_row(position=left_end, ancestral_state="0")
            t.mutations.add_row(
                site=new_site, node=tree.root, derived_state="1")
            positions.add(left_end)
        # At least one new site must have been added.
        self.assertGreater(len(positions), ts.num_sites)
        msprime.sort_tables(**t.asdict())
        ts = msprime.load_tables(**t.asdict())

        input_file = formats.SampleData.initialise(
            num_samples=ts.num_samples, sequence_length=ts.sequence_length)
        self.verify_data_round_trip(ts, input_file)
Example #9
0
    def test_ts_with_invariant_sites(self):
        """
        Round-trip sample data for a tree sequence that has extra sites with
        no mutations at positions 1, 1/2, ..., 1/10.
        """
        ts = self.get_example_ts(5, 3)
        t = ts.dump_tables()
        positions = {site.position for site in ts.sites()}
        for denominator in range(1, 11):
            pos = 1 / denominator
            if pos in positions:
                continue
            t.sites.add_row(position=pos, ancestral_state="0")
            positions.add(pos)
        # At least one new site must have been added.
        self.assertGreater(len(positions), ts.num_sites)
        msprime.sort_tables(**t.asdict())
        ts = msprime.load_tables(**t.asdict())

        input_file = formats.SampleData.initialise(
            num_samples=ts.num_samples, sequence_length=ts.sequence_length)
        self.verify_data_round_trip(ts, input_file)
        self.assertGreater(len(str(input_file)), 0)
Example #10
0
 def get_wf_sims(self, seed):
     """
     Generate example tree sequences from the WF simulator.

     One tree sequence is yielded for every combination of population size,
     survival probability, mutation rate and number of sites; each has
     Jukes-Cantor mutations added and is passed to ``verify_simulation``
     before being yielded.
     """
     population_sizes = [5, 10, 20]
     survival_probabilities = [0.0, 0.5, 0.9]
     mutation_rates = [0.01, 1.0]
     site_counts = [1, 2, 3]
     for N in population_sizes:
         for surv in survival_probabilities:
             for mut in mutation_rates:
                 for nloci in site_counts:
                     tables = wf_sim(N=N, ngens=N, survival=surv, seed=seed)
                     msprime.sort_tables(**tables.asdict())
                     bare_ts = msprime.load_tables(**tables.asdict())
                     ts = tsutil.jukes_cantor(
                         bare_ts, num_sites=nloci, mu=mut, seed=seed)
                     self.verify_simulation(ts, ngens=N)
                     yield ts
    def simplify(self, generation, ancestry):
        """
        Append the newly recorded nodes and edges from ``ancestry`` to the
        stored tables, then sort and simplify them.

        The wall-clock time spent preparing, appending, sorting and
        simplifying is accumulated in the corresponding private counters.

        :param generation: The current generation; used to shift the times
            of the nodes already stored.
        :param ancestry: Object providing ``prep_for_gc()`` and the ``nodes``,
            ``edges`` and ``samples`` buffers.
        :return: The tuple ``(True, number of rows in the node table)``.
        """
        # Shift the times of nodes stored by earlier calls.
        if self.__nodes.num_rows > 0:
            shifted = self.__nodes.time
            shifted += float(generation) - self.last_gc_time
            self.last_gc_time = generation
            self.__nodes.set_columns(
                flags=np.ones(self.__nodes.num_rows, dtype=np.uint32),
                population=self.__nodes.population,
                time=shifted)

        tic = time.time()
        ancestry.prep_for_gc()
        na = np.array(ancestry.nodes, copy=False)
        ea = np.array(ancestry.edges, copy=False)
        samples = np.array(ancestry.samples, copy=False)
        # Every appended node gets flag value 1.
        new_flags = np.ones(len(na), dtype=np.uint32)
        self.__time_prepping += (time.time() - tic)

        tic = time.time()
        self.__nodes.append_columns(flags=new_flags,
                                    population=na['population'],
                                    time=na['generation'])
        self.__edges.append_columns(left=ea['left'],
                                    right=ea['right'],
                                    parent=ea['parent'],
                                    children=ea['child'],
                                    children_length=[1] * len(ea))
        self.__time_appending += (time.time() - tic)

        tic = time.time()
        msprime.sort_tables(nodes=self.__nodes, edgesets=self.__edges)
        self.__time_sorting += (time.time() - tic)

        tic = time.time()
        msprime.simplify_tables(samples=samples.tolist(),
                                nodes=self.__nodes,
                                edgesets=self.__edges)
        self.__time_simplifying += (time.time() - tic)
        return (True, self.__nodes.num_rows)
    def test4(self):
        """
        A mutation over node 3 at position 0.4 must not be carried by either
        of the samples 1 and 2 after simplification.
        """
        self.n.set_columns(time=[1, 0, 0, 2],
                           flags=[msprime.NODE_IS_SAMPLE] * 4)

        # (parent, child, left, right)
        edge_rows = [
            (0, 1, 0, 0.4),
            (0, 1, 0.6, 1.0),
            (0, 2, 0, 1),
            (3, 0, 0, 0.4),
        ]
        for parent, child, left, right in edge_rows:
            self.e.add_row(parent=parent, child=child, left=left, right=right)

        self.s.add_row(position=0.4, ancestral_state='0')
        self.m.add_row(site=0, node=3, derived_state='1')

        msprime.sort_tables(
            nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
        msprime.simplify_tables(
            nodes=self.n, edges=self.e, sites=self.s, mutations=self.m,
            samples=[1, 2])
        ts = msprime.load_tables(
            nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
        genotypes = ts.genotype_matrix()
        self.assertEqual(genotypes[0:].sum(), 0)
Example #13
0
def permute_nodes(ts, node_map):
    """
    Returns a copy of the specified tree sequence such that the nodes are
    permuted according to the specified map.

    ``node_map[u]`` is the new ID of the node whose ID is ``u`` in ``ts``.
    A provenance record for this operation is added to the result.
    """
    # inverse[k] is the original ID of the node that receives new ID k.
    inverse = [0] * len(node_map)
    for old_id in range(ts.num_nodes):
        inverse[node_map[old_id]] = old_id

    originals = list(ts.nodes())
    new_nodes = msprime.NodeTable()
    for new_id in range(ts.num_nodes):
        source = originals[inverse[new_id]]
        new_nodes.add_row(
            flags=source.flags,
            metadata=source.metadata,
            population=source.population,
            time=source.time)

    new_edges = msprime.EdgeTable()
    for edge in ts.edges():
        new_edges.add_row(
            left=edge.left,
            right=edge.right,
            parent=node_map[edge.parent],
            child=node_map[edge.child])

    new_sites = msprime.SiteTable()
    new_mutations = msprime.MutationTable()
    for site in ts.sites():
        new_sites.add_row(
            position=site.position, ancestral_state=site.ancestral_state)
        for mutation in site.mutations:
            new_mutations.add_row(
                site=site.id,
                derived_state=mutation.derived_state,
                node=node_map[mutation.node])

    msprime.sort_tables(
        nodes=new_nodes, edges=new_edges, sites=new_sites,
        mutations=new_mutations)
    provenances = ts.dump_tables().provenances
    add_provenance(provenances, "permute_nodes")
    return msprime.load_tables(
        nodes=new_nodes, edges=new_edges, sites=new_sites,
        mutations=new_mutations, provenances=provenances)
Example #14
0
    def simplify(self, generation, tracker):
        """
        Take the new data held by ``tracker``, append it to the stored
        tables, then sort and simplify.

        :param generation: The current generation; the stored node times are
            shifted by its distance from ``self.last_gc_time``.
        :param tracker: Object providing ``nodes``, ``edges``, ``samples``
            and ``convert_time()``.
        :return: length of the simplified node table, which is the next ID
            to use
        """
        # Shift the times of the nodes already stored.
        # NOTE(review): self.last_gc_time is not updated in this method;
        # presumably the caller does that -- confirm.
        elapsed = generation - self.last_gc_time
        self.nodes.set_columns(flags=self.nodes.flags,
                               population=self.nodes.population,
                               time=self.nodes.time + elapsed)

        # Flags for the new nodes, all set to 1. An array is much faster to
        # build than a list.
        new_flags = np.ones(len(tracker.nodes), dtype=np.uint32)

        # Convert time from forwards to backwards.
        tracker.convert_time()

        # Append the tracker's records to the internal tables.
        self.nodes.append_columns(flags=new_flags,
                                  population=tracker.nodes['population'],
                                  time=tracker.nodes['generation'])
        edge_count = len(tracker.edges)
        self.edges.append_columns(left=tracker.edges['left'],
                                  right=tracker.edges['right'],
                                  parent=tracker.edges['parent'],
                                  children=tracker.edges['child'],
                                  children_length=[1] * edge_count)

        # Sort and simplify.
        msprime.sort_tables(nodes=self.nodes, edgesets=self.edges)
        msprime.simplify_tables(samples=tracker.samples.tolist(),
                                nodes=self.nodes,
                                edgesets=self.edges)
        # The node table length can be used as the next offspring ID.
        return self.nodes.num_rows
Example #15
0
 def test_non_overlapping_generations(self):
     """
     With survival 0.0 the simplified simulation gives single-rooted trees
     whose leaves are the samples and whose internal nodes all have more
     than one child.
     """
     tables = wf_sim(N=10, ngens=10, survival=0.0, seed=self.random_seed)
     self.assertGreater(tables.nodes.num_rows, 0)
     self.assertGreater(tables.edges.num_rows, 0)
     self.assertEqual(tables.sites.num_rows, 0)
     self.assertEqual(tables.mutations.num_rows, 0)
     self.assertEqual(tables.migrations.num_rows, 0)

     nodes, edges = tables.nodes, tables.edges
     msprime.sort_tables(nodes=nodes, edges=edges)
     is_sample = nodes.flags == msprime.NODE_IS_SAMPLE
     samples = np.flatnonzero(is_sample).astype(np.int32)
     msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
     ts = msprime.load_tables(nodes=nodes, edges=edges)
     for tree in ts.trees():
         # Exactly one root, whose leaves are precisely the samples.
         self.assertEqual(tree.num_roots, 1)
         leaves = set(tree.leaves(tree.root))
         self.assertEqual(leaves, set(ts.samples()))
         # Simplification must leave no unary internal nodes.
         internal_nodes = [u for u in tree.nodes() if tree.is_internal(u)]
         for u in internal_nodes:
             self.assertGreater(len(tree.children(u)), 1)
Example #16
0
def wright_fisher(N, T, simplify_interval=1):
    """
    An implementation of algorithm W that simplifies the tables whenever the
    current time is a multiple of ``simplify_interval`` (every generation by
    default), in order to measure the number of edges in the tree sequence
    representing the history as a function of time.

    For simplicity the genome length is L = 1 and the probability of death
    is delta = 1, so the whole population is replaced each generation.

    :return: A tuple ``(ts, S)`` where ``ts`` is the tree sequence loaded
        from the final tables and ``S[k]`` is the number of edges after the
        ``k``-th simulated generation.
    """
    L = 1
    edges = msprime.EdgeTable()
    nodes = msprime.NodeTable()
    P = list(range(N))
    for _ in range(N):
        nodes.add_row(time=T, flags=1)
    S = np.zeros(T, dtype=int)
    for t in range(T - 1, -1, -1):
        offspring = list(P)
        for j in range(N):
            n = len(nodes)
            nodes.add_row(time=t, flags=1)
            offspring[j] = n
            a = random.randint(0, N - 1)
            b = random.randint(0, N - 1)
            x = random.uniform(0, L)
            edges.add_row(0, x, P[a], n)
            edges.add_row(x, L, P[b], n)
        P = offspring
        if t % simplify_interval == 0:
            msprime.sort_tables(nodes=nodes, edges=edges)
            msprime.simplify_tables(offspring, nodes, edges)
            # After simplifying, the current generation is referred to by
            # the IDs 0..N-1.
            P = list(range(N))
        S[T - t - 1] = len(edges)
    # t = 0 always triggers a simplify, so no special case is needed here.
    return msprime.load_tables(nodes=nodes, edges=edges), S
Example #17
0
 def test_many_generations_no_deep_history(self):
     """
     100 generations without deep history give N * (ngens + 1) nodes, and
     the simplified result has a single root in every tree.
     """
     N = 10
     ngens = 100
     tables = wf_sim(
         N=N, ngens=ngens, deep_history=False, seed=self.random_seed)
     self.assertEqual(tables.nodes.num_rows, N * (ngens + 1))
     self.assertGreater(tables.edges.num_rows, 0)
     self.assertEqual(tables.sites.num_rows, 0)
     self.assertEqual(tables.mutations.num_rows, 0)
     self.assertEqual(tables.migrations.num_rows, 0)

     nodes, edges = tables.nodes, tables.edges
     is_sample = nodes.flags == msprime.NODE_IS_SAMPLE
     samples = np.flatnonzero(is_sample).astype(np.int32)
     msprime.sort_tables(nodes=nodes, edges=edges)
     msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
     self.assertGreater(tables.nodes.num_rows, 0)
     self.assertGreater(tables.edges.num_rows, 0)

     # This assumes that everything has coalesced, giving single-root trees.
     simplified = msprime.load_tables(nodes=nodes, edges=edges)
     for tree in simplified.trees():
         self.assertEqual(tree.num_roots, 1)
Example #18
0
def insert_perfect_mutations(ts, delta=None):
    """
    Returns a copy of the specified tree sequence where the left and right
    coordinates of all edges are marked by mutations. This *should* be sufficient
    information to recover the tree sequence exactly.

    This has to be fudged slightly because we cannot have two sites with
    precisely the same coordinates. We work around this by having sites at
    some very small delta from the correct location.

    Any sites and mutations already present in ``ts`` are discarded.

    :param ts: The tree sequence to copy and annotate.
    :param delta: Spacing between consecutive sites placed around a
        breakpoint. If None, the spacing is recomputed for every tree
        interval from the interval length and the number of edges changing
        there (and is zero before the first interval has been processed).
    :return: A new tree sequence loaded from the sorted, annotated tables.
    :raises ValueError: If an edge difference is found that cannot be the
        result of a single recombination event.
    """
    tables = ts.dump_tables()
    tables.sites.clear()
    tables.mutations.clear()

    # Per-node state of the current tree, updated from the edge diffs.
    num_children = np.zeros(ts.num_nodes, dtype=int)
    parent = np.zeros(ts.num_nodes, dtype=int) - 1

    current_delta = 0
    if delta is not None:
        current_delta = delta

    for (left, right), edges_out, edges_in in ts.edge_diffs():
        # Snapshot of the child counts in the previous tree.
        last_num_children = list(num_children)
        children_in = set()
        children_out = set()
        parents_in = set()
        parents_out = set()
        for e in edges_out:
            # print("out:", e)
            parent[e.child] = -1
            num_children[e.parent] -= 1
            children_out.add(e.child)
            parents_out.add(e.parent)
        for e in edges_in:
            # print("in:", e)
            parent[e.child] = e.parent
            num_children[e.parent] += 1
            children_in.add(e.child)
            parents_in.add(e.parent)
        # Find the root of the current tree by walking up from node 0.
        root = 0
        while parent[root] != -1:
            root = parent[root]
        # If we have more than 4 edges in the diff, or we have a 2 edge diff
        # that is not a root change this must be a multiple recombination.
        if len(edges_out) > 4 or (len(edges_out) == 2
                                  and root not in parents_in):
            raise ValueError("Multiple recombination detected")
        # Mark the right-hand end of the outgoing edges with sites placed
        # just to the left of the breakpoint, stepping leftwards.
        # We use the value of delta from the previous iteration
        x = left - current_delta
        for u in list(children_out - children_in) + list(children_in
                                                         & children_out):
            # Only nodes that were internal in the previous tree are marked.
            if last_num_children[u] > 0:
                site_id = tables.sites.add_row(position=x, ancestral_state="0")
                tables.mutations.add_row(site=site_id,
                                         node=u,
                                         derived_state="1")
                x -= current_delta

        # Now update delta for this interval.
        if delta is None:
            # Upper bound on the number of sites placed in this interval, so
            # that consecutive sites stay within (left, right).
            max_nodes = 2 * (len(children_out) +
                             len(children_in)) + len(parents_in) + 1
            current_delta = (right - left) / max_nodes
        # Mark the left-hand end of the incoming edges with sites starting
        # at the breakpoint itself, stepping rightwards.
        x = left
        for c in list(children_in - children_out) + list(children_in
                                                         & children_out):
            # Only nodes that are internal in the current tree are marked.
            if num_children[c] > 0:
                site_id = tables.sites.add_row(position=x, ancestral_state="0")
                tables.mutations.add_row(site=site_id,
                                         node=c,
                                         derived_state="1")
                x += current_delta

        # It seems wrong that we have to mark every parent, since a few of these
        # will already have been marked out by the children.
        for u in parents_in:
            # Nodes without a parent in the current tree are skipped.
            if parent[u] != -1:
                # print("marking in parent", u, "at", x)
                site_id = tables.sites.add_row(position=x, ancestral_state="0")
                tables.mutations.add_row(site=site_id,
                                         node=u,
                                         derived_state="1")
                x += current_delta

    msprime.sort_tables(**tables.asdict())
    return msprime.load_tables(**tables.asdict())
        flags=nt.flags,  #[2 * popsize:],
        population=nt.population,  #[2 * popsize:],
        time=nt.time + ngens + 1)
    node_offset = nt.num_rows

    nt.append_columns(flags=flags,
                      population=nodes['population'] + node_offset,
                      time=nodes['generation'])

    es.append_columns(left=edges['left'],
                      right=edges['right'],
                      parent=edges['parent'] + node_offset,
                      child=edges['child'] + node_offset)

    # Sort
    msprime.sort_tables(nodes=nt, edges=es)

    # Simplify: this is where the magic happens
    # PLR: since these tables aren't valid, you gotta use simplify_tables, not load them into a tree sequence
    msprime.simplify_tables(samples=samples.tolist(), nodes=nt, edges=es)

    # Create a tree sequence
    x = msprime.load_tables(nodes=nt, edges=es)

    # Lets look at the MRCAS.
    # This is where things go badly:
    MRCAS = [t.get_time(t.get_root()) for t in x.trees()]
    print(MRCAS)

    # Throw down some mutations
    # onto a sample of size nsam
Example #20
0
def wfrec(nsam, rho, nsites, theta):
    """
    Simulate the history of ``nsam`` chromosomes backwards in time with
    coalescence and recombination events, recording it in msprime tables.

    Each lineage is represented by an interval tree over ``[0, nsites)``.
    At every step either two lineages coalesce (at rate ``n * (n - 1)``) or
    one lineage is split by recombination (at rate proportional to its
    number of links), until a single lineage remains.

    :param nsam: Number of sampled chromosomes.
    :param rho: Recombination parameter; divided by ``nsites - 1`` to give
        the per-link rate.
    :param nsites: Length of the region.
    :param theta: Not used by this implementation.
    :return: The tree sequence loaded from the sorted node and edge tables.
    """
    samples = [it.IntervalTree([it.Interval(0, nsites)])
               for _ in range(nsam)]

    # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int``
    # selects the same dtype.
    links = np.array([sumIntervalTree(i) for i in samples], dtype=int)
    nlinks = links.sum()

    n = nsam
    rbp = rho / float(nsites - 1)
    t = 0.0

    nodes = msprime.NodeTable()
    edges = msprime.EdgeTable()

    nodes.set_columns(time=np.zeros(nsam),
                      flags=np.ones(nsam, dtype=np.uint32))

    # sample_indexes[k] is the node ID of the lineage stored in samples[k].
    sample_indexes = list(range(len(samples)))
    next_index = len(sample_indexes)

    while (n > 1):
        rcoal = float(n * (n - 1))
        rrec = rbp * float(nlinks)

        iscoal = bool(np.random.random_sample(1)[0] < rcoal / (rcoal + rrec))
        t += np.random.exponential(4. / (rcoal + rrec), 1)[0]
        assert len(samples) == len(links), "sample/link error"
        if iscoal:
            chroms = np.sort(np.random.choice(n, 2, replace=False))
            c1 = chroms[0]
            c2 = chroms[1]

            # NOTE(review): the new ancestor is flagged as a sample --
            # confirm that this is intended.
            nodes.add_row(time=t, flags=msprime.NODE_IS_SAMPLE)
            # NOTE(review): the edges for BOTH children are built from the
            # intervals of samples[c1]; the intervals of samples[c2] are not
            # consulted here -- confirm that this is intended.
            for i in samples[c1]:
                edges.add_row(left=i[0],
                              right=i[1],
                              parent=next_index,
                              child=sample_indexes[c1])
                edges.add_row(left=i[0],
                              right=i[1],
                              parent=next_index,
                              child=sample_indexes[c2])
            newchrom = it.IntervalTree()
            # Merge intervals of the two chromosomes
            # and remove overlaps
            for i in samples[c1]:
                newchrom.append(i)
            for i in samples[c2]:
                newchrom.append(i)
            newchrom.merge_overlaps()
            # c2 > c1 (chroms is sorted), so pop the larger index first to
            # keep the smaller one valid.
            samples.pop(c2)
            samples.pop(c1)
            samples.append(newchrom)
            sample_indexes.pop(c2)
            sample_indexes.pop(c1)
            sample_indexes.append(next_index)
            next_index += 1
            n -= 1
        else:
            # Pick a chrom proportional to
            # its total size:
            chrom = np.random.choice(len(sample_indexes),
                                     1,
                                     p=links / links.sum())[0]
            mnpos = min(
                [i for j in samples[chrom] for i in j if i is not None])
            mxpos = max(
                [i for j in samples[chrom] for i in j if i is not None])
            pos = np.random.randint(mnpos, mxpos)
            # Split the lineage at pos: the right-hand part becomes a new
            # lineage and is removed from the original one.
            samples[chrom].chop(pos, pos)
            tc = it.IntervalTree([i for i in samples[chrom] if i[0] >= pos])
            samples[chrom].remove_overlap(pos, nsites)
            samples.append(tc)
            # NOTE(review): a new node ID is handed out here without adding
            # a row to ``nodes`` or any edges -- the parent/child assertions
            # at the end of the function guard against the resulting
            # mismatch. Confirm the intended bookkeeping.
            sample_indexes.append(next_index)
            next_index += 1
            n += 1

        assert all([len(i) > 0 for i in samples]), "empty IntervalTree"
        assert len(samples) == len(sample_indexes), "sample/sample_index error"
        links = np.array([sumIntervalTree(i) for i in samples], dtype=int)
        nlinks = links.sum()
        assert len(samples) == len(links), "sample/link error 2"
    for i in range(len(edges)):
        assert edges[i].parent < len(nodes), "parent error"
        assert edges[i].child < len(nodes), "child error"
    msprime.sort_tables(nodes=nodes, edges=edges)
    return msprime.load_tables(nodes=nodes, edges=edges)
Example #21
0
 def writer():
     """
     Call ``msprime.sort_tables`` on the tables returned by
     ``tables.asdict()``; ``tables`` is taken from the enclosing scope.
     """
     msprime.sort_tables(**tables.asdict())
    st = msprime.SiteTable()
    st.set_columns(position=mutas['position'],
                   ancestral_state=np.zeros(len(mutas['position']), np.int8),
                   ancestral_state_length=np.ones(len(mutas['position']),
                                                  np.uint32))

    mt = msprime.MutationTable()
    mt.set_columns(site=np.arange(len(mutas['node_id']), dtype=np.int32),
                   node=mutas['node_id'],
                   derived_state=np.ones(len(mutas['node_id']), np.int8),
                   derived_state_length=np.ones(len(mutas['node_id']),
                                                np.uint32))

    # Sort
    msprime.sort_tables(nodes=nt, edges=es, sites=st, mutations=mt)
    print("num total mutations: ", st.num_rows)

    # Simplify: this is where the magic happens
    ## PLR: since these tables aren't valid, you gotta use simplify_tables, not load them into a tree sequence
    nt_c = nt.copy()
    es_c = es.copy()
    st_c = st.copy()
    mt_c = mt.copy()
    msprime.simplify_tables(samples=samples.tolist(),
                            nodes=nt_c,
                            edges=es_c,
                            sites=st_c,
                            mutations=mt_c)
    print("num simplified mutations: ", st_c.num_rows)
    # Create a tree sequence
        ri=struct.unpack('d',f.read(8))
        p.append(pi[0])
        c.append(ci[0])
        l.append(li[0])
        r.append(ri[0])

# Load the edge columns collected above into the edge table.
edges.set_columns(parent=p,child=c,left=l,right=r)


# Third command-line argument; only referenced by the commented-out sample
# selection below.
N=int(sys.argv[3])
#samples=[i for i in range(len(times)-2*N,len(times))] 
# Use every 132nd node index as a sample.
samples=[i for i in range(0,len(times),132)]
ts=None

# Time the sort and simplify steps separately.
A=time.time()
msprime.sort_tables(nodes=nodes,edges=edges)
B=time.time()
# NOTE(review): the return value of simplify_tables is stored in `ts` but is
# not used anywhere below.
ts=msprime.simplify_tables(nodes=nodes,edges=edges,samples=samples)
C=time.time()

print("Sorting: ",B-A,"seconds")
print("Simplifying: ",C-B,"seconds")


# Write the simplified edges (fourth argument) and node times (fifth
# argument) as plain text, one row per line.
with open(sys.argv[4],'w') as f:
    for i in edges:
        # NOTE(review): the format string has four fields but five arguments
        # are passed, so the parent's time is silently dropped -- confirm
        # whether a fifth field was intended.
        f.write("{} {} {:.6f} {:.6f}\n".format(i.parent,i.child,i.left,i.right,nodes[i.parent].time))
with open(sys.argv[5],'w') as f:
    for i in nodes:
        f.write("{}\n".format(i.time))
# Practice with the simplify API
import msprime

# Node columns: seven nodes, every flag set to True, all in population 0.
sv = [True, True, True, True, True, True, True]
tv = [0.0, 0.0, 0.0, 0.4, 0.5, 0.7, 1.0]
pv = [0, 0, 0, 0, 0, 0, 0]
# The table is built once (a second, discarded NodeTable() used to be
# constructed before the column lists).
n = msprime.NodeTable()
n.set_columns(flags=sv, population=pv, time=tv)
print(n)

# Edgeset columns; entry k of each list describes one (left, right, parent,
# child) record.
left = [0.2, 0.2, 0.0, 0.0, 0.2, 0.2, 0.8, 0.8, 0.8, 0.8, 0.0, 0.0]
right = [0.8, 0.8, 0.2, 0.2, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
parent = [3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 6, 6]
# children = [(0,2),(1,2),(1,3),(0,4),(0,4)]
children = [0, 2, 1, 2, 1, 3, 1, 2, 0, 4, 0, 4]

e = msprime.EdgesetTable()

# Each row carries a single child, passed as a one-element tuple.
for lft, rgt, par, chd in zip(left, right, parent, children):
    e.add_row(left=lft, right=rgt, parent=par, children=(chd, ))

print(e)
# Sort, load into a tree sequence, simplify down to samples 0, 1 and 2, then
# dump the result back into the same tables and show them.
msprime.sort_tables(nodes=n, edgesets=e)
x = msprime.load_tables(nodes=n, edgesets=e)
x = x.simplify(samples=[0, 1, 2])
x.dump_tables(nodes=n, edgesets=e)
print(n)
print(e)
# make some fake nodes
Example #25
0
def _load_legacy_hdf5_v2(root, remove_duplicate_positions):
    """
    Convert the groups of a legacy (version 2) HDF5 file into a tree sequence.

    The ``trees`` group holds coalescence records; each record is expanded
    into two edges (one per child). Node flags, populations and times are
    reconstructed from the records and, if present, the ``samples`` group.
    Sites and mutations are converted from the optional ``mutations`` group.

    :param root: Mapping-like HDF5 root providing the ``trees`` group and,
        optionally, ``samples`` and ``mutations`` groups.
    :param remove_duplicate_positions: Passed through unchanged to
        ``_convert_hdf5_mutations``.
    :return: The value returned by ``msprime.load_tables`` for the sorted
        node, edge, site, mutation and provenance tables.
    """
    trees = root["trees"]
    legacy_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    provenances.add_row(
        timestamp=legacy_timestamp,
        record=_get_v2_provenance("generate_trees", trees.attrs))

    # Every coalescence record contributes two consecutive edges that share
    # the record's parent and interval; the children come from flattening
    # the per-record children array.
    cr_node = np.array(trees["node"], dtype=np.int32)
    child = np.array(trees["children"], dtype=np.int32).flatten()
    edges = msprime.EdgeTable()
    edges.set_columns(
        left=np.repeat(np.array(trees["left"], dtype=np.float64), 2),
        right=np.repeat(np.array(trees["right"], dtype=np.float64), 2),
        parent=np.repeat(cr_node, 2),
        child=child)

    # Node ids below the smallest record parent id are flagged as samples.
    num_nodes = max(np.max(child), np.max(cr_node)) + 1
    sample_size = np.min(cr_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE

    # Internal nodes take their population and time from the records.
    population = np.zeros(num_nodes, dtype=np.int32)
    population[cr_node] = np.array(trees["population"], dtype=np.int32)
    node_time = np.zeros(num_nodes, dtype=np.float64)
    node_time[cr_node] = np.array(trees["time"])

    # Sample nodes take theirs from the samples group, when it is present.
    if "samples" in root:
        samples = root["samples"]
        population[:sample_size] = samples["population"]
        if "time" in samples:
            node_time[:sample_size] = samples["time"]

    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags, population=population, time=node_time)

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        mutations_group = root["mutations"]
        _convert_hdf5_mutations(
            mutations_group, sites, mutations, remove_duplicate_positions)
        provenances.add_row(
            timestamp=legacy_timestamp,
            record=_get_v2_provenance(
                "generate_mutations", mutations_group.attrs))
    provenances.add_row(_get_upgrade_provenance(root))

    tables = dict(nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    msprime.sort_tables(**tables)
    return msprime.load_tables(provenances=provenances, **tables)
Example #26
0
    def get_tree_sequence(self, rescale_positions=True, all_sites=False):
        """
        Returns the current state of the build tree sequence. All samples and
        ancestors will have the sample node flag set.

        :param bool rescale_positions: If True, the left and right values
            dumped by the tree sequence builder are used as indexes into the
            variant site positions from ``self.sample_data`` (with the first
            coordinate forced to 0 and the sequence length appended as the
            last coordinate). If False, the dumped values are used unchanged
            and sites are placed at positions ``0 .. num_sites - 1``.
        :param bool all_sites: If True, a site and mutation for every
            singleton and a site for every invariant recorded in
            ``self.sample_data`` are appended to the tables.
        :return: The value returned by ``msprime.load_tables`` for the
            sorted node, edge, site and mutation tables.
        """
        # TODO Change the API here to ask whether we want a final tree sequence
        # or not. In the latter case we also need to translate the ancestral
        # and derived states to the input values.
        tsb = self.tree_sequence_builder
        flags, time = tsb.dump_nodes()
        nodes = msprime.NodeTable()
        nodes.set_columns(flags=flags, time=time)

        left, right, parent, child = tsb.dump_edges()
        if rescale_positions:
            position = self.sample_data.position[:]
            sequence_length = self.sample_data.sequence_length
            if sequence_length is None or sequence_length < position[-1]:
                sequence_length = position[-1] + 1
            # Subset down to the variants.
            position = position[self.sample_data.variant_site[:]]
            # Lookup table from site index to coordinate; the extra final
            # entry lets an index of num_sites map to the sequence length.
            x = np.hstack([position, [sequence_length]])
            x[0] = 0
            left = x[left]
            right = x[right]
        else:
            position = np.arange(tsb.num_sites)
            sequence_length = max(1, tsb.num_sites)

        edges = msprime.EdgeTable()
        edges.set_columns(left=left, right=right, parent=parent, child=child)

        # Every state is a single character, so the offsets are 0, 1, ..., n.
        sites = msprime.SiteTable()
        sites.set_columns(
            position=position,
            ancestral_state=np.full(tsb.num_sites, ord('0'), dtype=np.int8),
            ancestral_state_offset=np.arange(tsb.num_sites + 1,
                                             dtype=np.uint32))

        mutations = msprime.MutationTable()
        # The dumped arrays are used directly; no placeholder arrays are
        # allocated beforehand.
        site, node, derived_state, mutation_parent = tsb.dump_mutations()
        # Shift the dumped state values into ASCII digit characters.
        derived_state += ord('0')
        mutations.set_columns(site=site,
                              node=node,
                              derived_state=derived_state,
                              derived_state_offset=np.arange(
                                  tsb.num_mutations + 1, dtype=np.uint32),
                              parent=mutation_parent)
        if all_sites:
            # Append the sites and mutations for each singleton.
            num_singletons = self.sample_data.num_singleton_sites
            singleton_site = self.sample_data.singleton_site[:]
            singleton_sample = self.sample_data.singleton_sample[:]
            pos = self.sample_data.position[:]
            # The new sites get the ids following the existing site rows.
            new_sites = np.arange(len(sites),
                                  len(sites) + num_singletons,
                                  dtype=np.int32)
            sites.append_columns(
                position=pos[singleton_site],
                ancestral_state=np.full(num_singletons, ord('0'),
                                        dtype=np.int8),
                ancestral_state_offset=np.arange(num_singletons + 1,
                                                 dtype=np.uint32))
            mutations.append_columns(
                site=new_sites,
                node=self.sample_ids[singleton_sample],
                derived_state=np.full(num_singletons, ord('1'),
                                      dtype=np.int8),
                derived_state_offset=np.arange(num_singletons + 1,
                                               dtype=np.uint32))
            # Get the invariant sites
            num_invariants = self.sample_data.num_invariant_sites
            invariant_site = self.sample_data.invariant_site[:]
            sites.append_columns(
                position=pos[invariant_site],
                ancestral_state=np.full(num_invariants, ord('0'),
                                        dtype=np.int8),
                ancestral_state_offset=np.arange(num_invariants + 1,
                                                 dtype=np.uint32))

        msprime.sort_tables(nodes, edges, sites=sites, mutations=mutations)
        return msprime.load_tables(nodes=nodes,
                                   edges=edges,
                                   sites=sites,
                                   mutations=mutations,
                                   sequence_length=sequence_length)
Example #27
0
    def simplify(self, generation, ancestry):
        """
        Merge the newly recorded ancestry into the node and edge tables and
        simplify them with respect to the current samples.

        :param generation: Current generation; existing node times are
            shifted by ``float(generation) - self.last_gc_time``.
        :param ancestry: Object providing ``nodes``, ``edges`` and
            ``samples`` buffers together with ``acquire()`` and
            ``release()``. It is held from ``acquire()`` until the tables
            have been simplified and is released even if an error occurs.
        :return: The tuple ``(True, number of rows in the node table)``.
        """
        # update node times:
        if self.__nodes.num_rows > 0:
            tc = self.__nodes.time
            dt = float(generation) - self.last_gc_time
            tc += dt
            self.last_gc_time = generation
            flags = np.ones(self.__nodes.num_rows, dtype=np.uint32)
            self.__nodes.set_columns(flags=flags,
                                     population=self.__nodes.population,
                                     time=tc)

        before = time.process_time()
        # Acquire mutex
        ancestry.acquire()
        try:
            self.reverse_time(ancestry.nodes)
            # copy=False: these arrays are built without forcing a copy of
            # the ancestry buffers, so ancestry stays acquired while they
            # are in use.
            na = np.array(ancestry.nodes, copy=False)
            ea = np.array(ancestry.edges, copy=False)
            new_min_id = na['id'][0]
            new_max_id = na['id'][-1]
            # Shift the new ids so that they follow on from the existing
            # node table rows.
            delta = new_min_id - len(self.__nodes)
            if delta != 0:
                self.update_indexes(ancestry.edges, ancestry.samples, delta,
                                    new_min_id, new_max_id)
            samples = np.array(ancestry.samples, copy=False)
            flags = np.ones(len(na), dtype=np.uint32)
            self.__time_prepping += time.process_time() - before

            before = time.process_time()
            self.__nodes.append_columns(flags=flags,
                                        population=na['population'],
                                        time=na['generation'])
            # Copy the already sorted edges to local arrays
            left = self.__edges.left[:]
            right = self.__edges.right[:]
            parent = self.__edges.parent[:]
            child = self.__edges.child[:]
            # Get the new edges and reverse them. After this, we know that
            # all edges are correctly sorted with respect to time. We then
            # sort each time slice individually, reducing the overall cost
            # of the sort.
            new_left = ea['left'][::-1]
            new_right = ea['right'][::-1]
            new_parent = ea['parent'][::-1]
            new_child = ea['child'][::-1]

            parent_time = self.__nodes.time[new_parent]
            breakpoints = np.where(parent_time[1:] != parent_time[:-1])[0] + 1
            self.__edges.reset()
            self.__time_appending += time.process_time() - before

            before = time.process_time()
            start = 0
            # The final slice must end at the number of new edges. An end of
            # -1 would give the slice [start:-1] and silently drop the last
            # new edge.
            num_new_edges = len(new_parent)
            for end in itertools.chain(breakpoints, [num_new_edges]):
                if end == start:
                    # Only possible when there are no new edges at all.
                    continue
                assert np.all(parent_time[start:end] == parent_time[start])
                self.__edges.append_columns(left=new_left[start:end],
                                            right=new_right[start:end],
                                            parent=new_parent[start:end],
                                            child=new_child[start:end])
                msprime.sort_tables(nodes=self.__nodes,
                                    edges=self.__edges,
                                    edge_start=start)
                start = end
            self.__time_sorting += time.process_time() - before

            # Append the old sorted edges to the table.
            self.__edges.append_columns(left=left,
                                        right=right,
                                        parent=parent,
                                        child=child)
            before = time.process_time()
            msprime.simplify_tables(samples=samples.tolist(),
                                    nodes=self.__nodes,
                                    edges=self.__edges)
        finally:
            # Release any locks on the ancestry object
            ancestry.release()
        self.__last_edge_start = len(self.__edges)
        self.__time_simplifying += time.process_time() - before
        self.__process = True
        return (True, self.__nodes.num_rows)
Example #28
0
 def writer(thread_index, results):
     msprime.sort_tables(**tables.asdict())