Beispiel #1
0
    def test_sweep_model_change_time_complete(self):
        """
        Compare a short sweep with and without a trailing Hudson phase.

        With ``"hudson"`` appended to the model list every tree must have a
        single root; the sweep model on its own must leave at least one tree
        with more than one root.
        """
        sweep = msprime.SweepGenicSelection(
            position=5,
            start_frequency=0.69,
            end_frequency=0.72,
            s=0.1,
            dt=1e-6,
        )
        # Arguments shared by both simulations below.
        shared = dict(
            population_size=1000,
            sequence_length=10,
            recombination_rate=2,
            random_seed=2,
        )

        # Short sweep that doesn't coalesce, followed by a Hudson phase
        # to finish up the coalescent.
        completed = msprime.sim_ancestry(10, model=[sweep, "hudson"], **shared)
        assert all(tree.num_roots == 1 for tree in completed.trees())

        # The sweep alone, this time on a continuous genome.
        sweep_only = msprime.sim_ancestry(
            10,
            model=sweep,
            discrete_genome=False,
            **shared,
        )
        assert any(tree.num_roots > 1 for tree in sweep_only.trees())
Beispiel #2
0
    def test_random_seed(self):
        """Equal seeds must reproduce the simulation; a different seed must not."""
        reference = msprime.sim_ancestry(10, random_seed=1)
        repeat = msprime.sim_ancestry(10, random_seed=1)
        self.assertTrue(tree_sequences_equal(reference, repeat))

        other_seed = msprime.sim_ancestry(10, random_seed=2)
        self.assertFalse(tree_sequences_equal(reference, other_seed))
Beispiel #3
0
 def test_sweep_coalescence_same_seed(self):
     """Two sweep simulations with the same seed must be equal up to provenance."""
     sweep = msprime.SweepGenicSelection(
         position=0.5,
         start_frequency=0.6,
         end_frequency=0.7,
         s=0.1,
         dt=1e-6,
     )
     first, second = (
         msprime.sim_ancestry(5, model=sweep, random_seed=2) for _ in range(2)
     )
     assert first.equals(second, ignore_provenance=True)
Beispiel #4
0
def test_msprime(selenium):
    """Round-trip a seeded simulation through a file and compare it with a re-run."""
    import msprime
    import tskit

    trees_path = "/tmp/msprime.trees"

    # basic test: dump, reload, then compare against an identically seeded run
    msprime.sim_ancestry(10, random_seed=42).dump(trees_path)
    reloaded = tskit.load(trees_path)
    rerun = msprime.sim_ancestry(10, random_seed=42)
    reloaded.tables.assert_equals(rerun.tables, ignore_provenance=True)
Beispiel #5
0
 def _run_long_sequence_length_gene_conversion(self):
     """
     Run one seeded gene-conversion simulation over a sequence of length 1e8.

     The resulting tree sequence is discarded; nothing is returned.
     """
     # sim_ancestry() spells these arguments samples / sequence_length /
     # population_size. The sample_size / length / Ne spellings belong to
     # the legacy simulate() function and are rejected here.
     # NOTE(review): ``samples`` counts individuals, so confirm whether
     # ploidy=1 is wanted to keep exactly 100 sample nodes.
     msprime.sim_ancestry(
         samples=100,
         sequence_length=1e8,
         population_size=10**4,
         gene_conversion_rate=1e-8,
         # 100Kb tract length.
         gene_conversion_tract_length=100 * 1e3,
         random_seed=43,
     )
Beispiel #6
0
 def test_population_size(self):
     """Check the default population size and its effect on node times."""
     seed = 2
     unit = msprime.sim_ancestry(10, population_size=1, random_seed=seed)
     # Defaults to 1
     implicit = msprime.sim_ancestry(10, random_seed=seed)
     self.assertTrue(tree_sequences_equal(unit, implicit))

     scaled = msprime.sim_ancestry(10, population_size=100, random_seed=seed)
     # Acts as a simple scaling factor on times.
     self.assertEqual(unit.tables.edges, scaled.tables.edges)
     unit_times = unit.tables.nodes.time
     scaled_times = scaled.tables.nodes.time
     self.assertTrue(np.allclose(100 * unit_times, scaled_times))
Beispiel #7
0
 def test_model_end_broken(self):
     """A sweep model combined with ``end_time`` must raise ``RuntimeError``."""
     # Checking that we're correctly detecting the fact that
     # sweeps are non-reentrant.
     sweep = msprime.SweepGenicSelection(
         position=0.5,
         start_frequency=0.1,
         end_frequency=0.9,
         s=0.01,
         dt=0.01,
     )
     expected_error = pytest.raises(
         RuntimeError, match="does not support interruption"
     )
     with expected_error:
         msprime.sim_ancestry(10, model=sweep, end_time=0.0001)
Beispiel #8
0
 def test_hudson_time_scale(self):
     """
     Compare ``n * ploidy`` samples without a ploidy argument against
     ``n`` samples with ``ploidy`` given: same edges, node times scaled
     by ``ploidy``.
     """
     n = 10
     seed = 1234
     for ploidy in (1, 2, 3, 7):
         # Default ploidy is 1
         flat = msprime.sim_ancestry(n * ploidy, random_seed=seed)
         grouped = msprime.sim_ancestry(n, ploidy=ploidy, random_seed=seed)
         flat_tables = flat.tables
         grouped_tables = grouped.tables
         times_match = np.allclose(
             flat_tables.nodes.time * ploidy, grouped_tables.nodes.time
         )
         self.assertTrue(times_match)
         self.assertEqual(flat_tables.edges, grouped_tables.edges)
Beispiel #9
0
 def test_model(self):
     """An explicit ``"hudson"`` model matches the default; ``"dtwf"`` does not."""
     common = dict(population_size=100, random_seed=2)
     default_model = msprime.sim_ancestry(10, **common)

     hudson = msprime.sim_ancestry(10, model="hudson", **common)
     self.assertTrue(tree_sequences_equal(default_model, hudson))

     dtwf = msprime.sim_ancestry(10, model="dtwf", **common)
     self.assertFalse(tree_sequences_equal(default_model, dtwf))
Beispiel #10
0
    def test_4_species_run(self):
        """
        Simulate from a parsed four-species tree and check sample placement.

        Samples are requested by population index, then by population name,
        and finally by name in a different key order.
        """
        newick = "(((human:5.6,chimpanzee:5.6):3.0,gorilla:8.6):9.4,orangutan:18.0)"
        spec = species_trees.parse_species_tree(
            newick,
            time_units="myr",
            initial_size=10000,
            generation_time=20,
        )

        # Take one sample from each population
        one_each = {index: 1 for index in range(4)}
        ts = msprime.sim_ancestry(samples=one_each, demography=spec, ploidy=1)

        assert ts.num_trees == 1
        assert ts.num_samples == 4
        assert ts.num_populations == 7
        sample_pops = [ts.node(u).population for u in ts.samples()]
        assert sample_pops == [0, 1, 2, 3]

        pop_names = [pop.metadata["name"] for pop in ts.populations()]
        assert pop_names == [
            "human",
            "chimpanzee",
            "gorilla",
            "orangutan",
            "pop_4",
            "pop_5",
            "pop_6",
        ]

        # Use the population names to get the samples
        by_name = {"human": 4, "gorilla": 2}
        ts = msprime.sim_ancestry(samples=by_name, demography=spec)
        assert ts.num_trees == 1
        assert ts.num_samples == 12
        sample_pops = [ts.node(u).population for u in ts.samples()]
        assert sample_pops == [0] * 8 + [2] * 4

        # Order of keywords is respected
        reordered = {"gorilla": 2, "human": 4}
        ts = msprime.sim_ancestry(samples=reordered, demography=spec)
        assert ts.num_trees == 1
        assert ts.num_samples == 12
        sample_pops = [ts.node(u).population for u in ts.samples()]
        assert sample_pops == [2] * 4 + [0] * 8
Beispiel #11
0
 def test_incorrect_num_labels(self):
     """A sweep model with ``num_labels`` of 1, 3 or 10 must raise ``LibraryError``."""
     sweep = msprime.SweepGenicSelection(
         position=0.5,
         start_frequency=0.1,
         end_frequency=0.9,
         s=0.01,
         dt=0.01,
     )
     for label_count in (1, 3, 10):
         # Not the best error, but this shouldn't be exposed to the user anyway.
         expected_error = pytest.raises(
             _msprime.LibraryError, match="configuration is not supported"
         )
         with expected_error:
             msprime.sim_ancestry(10, model=sweep, num_labels=label_count)
Beispiel #12
0
 def finish_simulation(self, from_ts, recombination_rate=0, seed=1):
     """
     Run ``msprime.sim_ancestry`` with ``from_ts`` as the initial state,
     starting at time 1, and return the resulting tree sequence.
     """
     completed = msprime.sim_ancestry(
         start_time=1,
         initial_state=from_ts,
         random_seed=seed,
         recombination_rate=recombination_rate,
     )
     return completed
Beispiel #13
0
 def test_defaults(self):
     """Check the shape of a simulation run with only a sample count."""
     sample_count = 10
     ts = msprime.sim_ancestry(sample_count)
     self.assertEqual(ts.num_samples, sample_count)
     # One tree, no sites, unit sequence length.
     self.assertEqual(ts.num_trees, 1)
     self.assertEqual(ts.num_sites, 0)
     self.assertEqual(ts.sequence_length, 1)
Beispiel #14
0
 def test_encode_simulation_models(self):
     """A list of model objects must be recorded entry by entry in provenance."""
     models = [
         msprime.StandardCoalescent(duration=10),
         msprime.DiscreteTimeWrightFisher(duration=10),
         msprime.SmcApproxCoalescent(duration=10),
         msprime.StandardCoalescent(),
     ]
     ts = msprime.sim_ancestry(10, model=models, random_seed=1234)
     parameters = self.decode(ts.provenance(0).record).parameters

     # (class path, duration) expected at each position of the model list.
     expected = [
         ("msprime.ancestry.StandardCoalescent", 10),
         ("msprime.ancestry.DiscreteTimeWrightFisher", 10),
         ("msprime.ancestry.SmcApproxCoalescent", 10),
         ("msprime.ancestry.StandardCoalescent", None),
     ]
     for index, (class_path, duration) in enumerate(expected):
         assert parameters.model[index] == {
             "__class__": class_path,
             "duration": duration,
         }
Beispiel #15
0
 def test_many_sweeps_regular_times_model_change(self):
     """
     Alternate ten short Hudson phases with ten sweeps, finish with Hudson,
     and check that every tree ends up with a single root.
     """
     models = []
     for position in range(10):
         # Start each sweep after 0.01 generations of Hudson
         models.append(msprime.StandardCoalescent(duration=0.01))
         models.append(
             msprime.SweepGenicSelection(
                 position=position,
                 start_frequency=0.69,
                 end_frequency=0.7,
                 s=0.1,
                 dt=1e-6,
             )
         )
     # Complete the simulation with Hudson
     models.append("hudson")

     ts = msprime.sim_ancestry(
         3,
         model=models,
         population_size=1000,
         sequence_length=10,
         recombination_rate=0.2,
         random_seed=2,
     )
     assert all(tree.num_roots == 1 for tree in ts.trees())
def run_arg_sim():
    """
    Simulate full ARGs over 20 sequence lengths and tabulate their overhead.

    For each length (0.1 to 5 megabases) the function records the fraction
    of nodes whose flags exceed 1 and the byte-size ratio of the simplified
    tables to the full ARG. Each row is printed, then the collected table
    is printed and written to ``data/arg.csv``.
    """
    lengths = []
    size_ratios = []
    arg_fractions = []
    for megabases in np.linspace(0.1, 5, 20):
        seq_len = int(megabases * 1_000_000)
        full_arg = msprime.sim_ancestry(
            100,
            population_size=10_000,
            sequence_length=seq_len,
            recombination_rate=1e-8,
            random_seed=42,
            record_full_arg=True,
        )
        # Samples have flags == 1 and ordinary coalescent nodes have
        # flags == 0. So, anything > 1 is an ARG node.
        is_arg_node = full_arg.tables.nodes.flags > 1
        arg_fraction = np.sum(is_arg_node) / full_arg.num_nodes

        simplified = full_arg.simplify()
        size_ratio = simplified.tables.nbytes / full_arg.nbytes

        lengths.append(seq_len)
        size_ratios.append(size_ratio)
        arg_fractions.append(arg_fraction)
        print(seq_len, arg_fraction, size_ratio)

    df = pd.DataFrame(
        {"L": lengths, "size_ratio": size_ratios, "arg_nodes": arg_fractions}
    )
    print(df)
    df.to_csv("data/arg.csv")
Beispiel #17
0
 def test_generate_nucleotides_keep(self):
     """
     Check that ``generate_nucleotides`` keeps existing nucleotides by default.

     Nucleotides are generated for type-1 mutations, type-2 mutations are
     layered on top, and nucleotides are generated again: the type-1 entries
     must retain the nucleotide they were given the first time.
     """
     ts = msprime.sim_ancestry(4, sequence_length=10, population_size=10)
     ts = pyslim.annotate_defaults(ts, model_type='nonWF', slim_generation=1)
     first_muts = msprime.sim_mutations(
         ts,
         model=msprime.SLiMMutationModel(type=1),
         rate=0.1,
         random_seed=23,
     )
     # NOTE(review): this writes "out.trees" into the working directory and
     # nothing in this test reads it back; looks like leftover debugging.
     first_muts.dump("out.trees")
     first_nucs = pyslim.generate_nucleotides(first_muts, seed=10, keep=False)
     assert first_nucs.num_mutations > 0
     self.verify_generate_nucleotides(first_nucs, check_transitions=False)

     second_model = msprime.SLiMMutationModel(
         type=2,
         next_id=first_nucs.num_mutations,
     )
     second_muts = msprime.sim_mutations(
         first_nucs,
         model=second_model,
         rate=0.1,
         random_seed=24,
     )
     # keep defaults to True
     second_nucs = pyslim.generate_nucleotides(second_muts, seed=12)
     assert second_nucs.num_mutations > first_nucs.num_mutations

     # Map each derived-state entry to the nucleotide it got in the first pass.
     original_nucleotide = {}
     for mut in first_nucs.mutations():
         entries = mut.derived_state.split(",")
         for entry, md in zip(entries, mut.metadata['mutation_list']):
             original_nucleotide[entry] = md['nucleotide']

     for mut in second_nucs.mutations():
         entries = mut.derived_state.split(",")
         for entry, md in zip(entries, mut.metadata['mutation_list']):
             if md['mutation_type'] != 1:
                 assert md['nucleotide'] in [0, 1, 2, 3]
                 continue
             assert entry in original_nucleotide
             assert original_nucleotide[entry] == md['nucleotide']

     regenerated = pyslim.generate_nucleotides(second_muts, keep=False, seed=15)
     self.verify_generate_nucleotides(regenerated, check_transitions=False)
Beispiel #18
0
 def test_wf_hudson_different_specifications(self):
     """
     Specify a DTWF-then-Hudson simulation three ways and compare the results.

     The ``sim_ancestry`` model list is compared against two legacy
     ``simulate`` calls that switch model via ``SimulationModelChange``.
     """
     Ne = 100
     t = 100
     ts1 = msprime.sim_ancestry(
         samples=5,
         population_size=Ne,
         model=[msprime.DiscreteTimeWrightFisher(duration=t), "hudson"],
         recombination_rate=0.1,
         sequence_length=1,
         discrete_genome=False,
         random_seed=2,
     )

     # Arguments shared by the two legacy simulate() calls.
     legacy_args = dict(
         sample_size=10,
         recombination_rate=0.1,
         Ne=Ne,
         model="dtwf",
         random_seed=2,
     )
     explicit_change = msprime.SimulationModelChange(t, "hudson")
     ts2 = msprime.simulate(demographic_events=[explicit_change], **legacy_args)
     implicit_change = msprime.SimulationModelChange(t)
     ts3 = msprime.simulate(demographic_events=[implicit_change], **legacy_args)

     # Not worth trying to puzzle out the slight differences in tables
     # between the old and new form. The edges are the same, good enough.
     assert ts1.tables.edges == ts2.tables.edges
     assert ts2.equals(ts3, ignore_provenance=True)
Beispiel #19
0
 def test_current_ts(self):
     """Parsing the ``sim_mutations`` provenance must yield the input tree sequence."""
     ancestry = msprime.sim_ancestry(5, random_seed=1)
     mutated = msprime.sim_mutations(ancestry)
     mutation_record = mutated.provenance(1)
     command, prov = msprime.provenance.parse_provenance(
         mutation_record, ancestry
     )
     assert command == "sim_mutations"
     assert prov["tree_sequence"] == ancestry
def run_msprime(*,
                sample_size,
                L,
                gc_rate,
                gc_tract_length,
                ret_breakpoints=True):
    """
    Run a haploid gene-conversion simulation and report its tree count.

    Returns the number of trees. When ``ret_breakpoints`` is true it returns
    ``(treenumber, breakpointnumber)`` instead, where the breakpoint count
    comes from a second, separately run simulator built from the same
    arguments.
    """
    sim_args = dict(
        samples=sample_size,
        sequence_length=L,
        ploidy=1,
        gene_conversion_rate=gc_rate,
        gene_conversion_tract_length=gc_tract_length,
    )
    treenumber = msprime.sim_ancestry(**sim_args).num_trees

    if not ret_breakpoints:
        return treenumber

    # We use an internal msprime API here because we want to get at the
    # number of breakpoints, not the distinct trees.
    simulator = msprime.ancestry._parse_sim_ancestry(**sim_args)
    simulator.run()
    breakpointnumber = simulator.num_breakpoints
    return treenumber, breakpointnumber
Beispiel #21
0
 def test_all_fields(self):
     """
     Build a tree sequence with JSON metadata on every table that takes it,
     plus top-level metadata, and pass it to ``self.verify``.
     """
     demography = msprime.Demography()
     for pop_name, pop_size in (("A", 10_000), ("B", 5_000), ("C", 1_000)):
         demography.add_population(name=pop_name, initial_size=pop_size)
     demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C")
     ts = msprime.sim_ancestry(
         samples={"A": 1, "B": 1},
         demography=demography,
         random_seed=42,
         record_migrations=True,
     )
     ts = msprime.sim_mutations(ts, rate=1, random_seed=42)

     tables = ts.dump_tables()
     for name, table in tables.table_name_map.items():
         # These two tables are left without metadata.
         if name in ["provenances", "edges"]:
             continue
         table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
         metadatas = [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))]
         metadata, metadata_offset = tskit.pack_strings(metadatas)
         # Rewrite every column, swapping in the packed metadata.
         columns = table.asdict()
         columns["metadata"] = metadata
         columns["metadata_offset"] = metadata_offset
         table.set_columns(**columns)

     tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
     tables.metadata = "Test metadata"
     self.verify(tables.tree_sequence())
Beispiel #22
0
 def test_upgrade_provenance(self):
     """
     For each old provenance example, check version detection and that
     ``upgrade_slim_provenance`` appends a version "0.4" record carrying
     the same model type and generation.
     """
     ts = msprime.sim_ancestry(10)
     for record_text in old_provenance_examples:
         record = json.loads(record_text)
         prov = tskit.Provenance(
             id=0,
             timestamp='2018-08-25T14:59:13',
             record=json.dumps(record),
         )
         is_slim, version = pyslim.slim_provenance_version(prov)
         assert is_slim
         expected_version = (
             "0.1" if 'file_version' in record
             else record['slim']['file_version']
         )
         assert version == expected_version

         tables = ts.dump_tables()
         tables.provenances.add_row(json.dumps(record))
         pyslim.upgrade_slim_provenance(tables)  # modifies the tables
         new_ts = tables.tree_sequence()
         assert new_ts.num_provenances == 3
         upgraded = new_ts.provenance(2)
         is_slim, version = pyslim.slim_provenance_version(upgraded)
         assert is_slim
         assert version == "0.4"

         new_record = json.loads(upgraded.record)
         # Records with a top-level 'model_type' keep both fields at the top
         # level; the others nest them under 'parameters' and 'slim'.
         if 'model_type' in record:
             old_params = record
             old_slim = record
         else:
             old_params = record['parameters']
             old_slim = record['slim']
         assert old_params['model_type'] == new_record['parameters']['model_type']
         assert old_slim['generation'] == new_record['slim']["generation"]
Beispiel #23
0
 def test_many_populations(self, helper_functions, tmp_path):
     """
     Add five populations, each with one two-node individual, annotate the
     result with SLiM defaults, and restart it through ``restart_WF.slim``.
     """
     # test we can add more than one population
     base_ts = msprime.sim_ancestry(
         5,
         population_size=10,
         sequence_length=100,
         random_seed=455,
     )
     tables = base_ts.dump_tables()
     for k in range(5):
         pop_md = pyslim.default_slim_metadata('population')
         pop_md['name'] = f"new_pop_num_{k}"
         pop_md['description'] = f"the {k}-th added pop"
         tables.populations.add_row(metadata=pop_md)
         individual = tables.individuals.add_row()
         # Two nodes per new individual.
         for _ in range(2):
             tables.nodes.add_row(
                 flags=1, time=0.0, individual=individual, population=k
             )
     ts = tables.tree_sequence()
     ts = pyslim.annotate_defaults(ts, model_type='WF', slim_generation=1)
     for ind in ts.individuals():
         assert ind.flags == pyslim.INDIVIDUAL_ALIVE
     sts = helper_functions.run_slim_restart(
         ts,
         "restart_WF.slim",
         tmp_path,
         WF=True,
     )
def great_apes(sample_size, initial_size):
    """
    Simulate ancestry under a four-species great-ape tree.

    Args:
        sample_size: Number of samples drawn from each of the populations
            with index 0 to 3.
        initial_size: Initial population size handed to the species-tree
            parser.

    Returns:
        The simulated tree sequence. A one-line summary of its sample and
        tree counts (in thousands) is printed first.
    """
    # The branch-length unit keyword is ``time_units`` in released msprime;
    # the older ``branch_length_units`` spelling is rejected.
    spec = msprime.species_trees.parse_species_tree(
        "(((human:5.6,chimp:5.6):3.0,gorilla:8.6):9.4,orangutan:18.0)",
        initial_size=initial_size,
        time_units="myr",
        generation_time=28,
    )

    species_ts: tskit.TreeSequence = msprime.sim_ancestry(
        sequence_length=1e6,
        samples={j: sample_size
                 for j in range(4)},
        demography=spec,
        recombination_rate=1e-8,
        random_seed=1,
    )

    print(
        species_ts.num_samples / 1e3,
        "thousand genomes, ",
        round(species_ts.num_trees / 1e3),
        "thousand trees",
    )

    return species_ts
Beispiel #25
0
 def test_sim_ancestry(self):
     """The first provenance record must describe the ``sim_ancestry`` call."""
     ts = msprime.sim_ancestry(5, random_seed=1)
     decoded = self.decode(ts.provenance(0).record)
     assert decoded.schema_version == "1.0.0"
     parameters = decoded.parameters
     assert parameters.command == "sim_ancestry"
     assert parameters.random_seed == 1
Beispiel #26
0
 def test_sim_ancestry(self):
     """The first provenance record must describe the ``sim_ancestry`` call."""
     ts = msprime.sim_ancestry(5, random_seed=1)
     decoded = self.decode(ts.provenance(0).record)
     self.assertEqual(decoded.schema_version, "1.0.0")
     parameters = decoded.parameters
     self.assertEqual(parameters.command, "sim_ancestry")
     self.assertEqual(parameters.random_seed, 1)
Beispiel #27
0
def simulate_ts(
    sample_size: int,
    length: int = 100,
    mutation_rate: float = 0.05,
    random_seed: int = 42,
) -> tskit.TreeSequence:
    """
    Simulate ancestry with recombination, add mutations, and return the
    resulting tskit TreeSequence.

    The same ``random_seed`` drives both the ancestry and the mutation step.

    Note the ancestry is currently simulated with ploidy=1 to minimise the
    update from an older version. We should update to simulate data under
    a range of ploidy values.
    """
    ancestry_ts = msprime.sim_ancestry(
        sample_size,
        sequence_length=length,
        recombination_rate=0.01,
        ploidy=1,
        random_seed=random_seed,
    )
    # Make sure we generate some data that's not all from the same tree
    assert ancestry_ts.num_trees > 1
    mutated_ts = msprime.sim_mutations(
        ancestry_ts,
        rate=mutation_rate,
        random_seed=random_seed,
    )
    return mutated_ts
Beispiel #28
0
 def test_repr_without_store_segments(self):
     """Without stored segments the repr must be the default object repr."""
     ts = msprime.sim_ancestry(2, random_seed=2)
     default_prefix = "<tskit.tables.IdentitySegments"

     with_pairs = ts.ibd_segments(store_pairs=True)
     assert repr(with_pairs).startswith(default_prefix)

     plain = ts.ibd_segments()
     assert repr(plain).startswith(default_prefix)
Beispiel #29
0
 def test_repr_store_segments(self):
     """With stored segments the result and each of its values have custom reprs."""
     ts = msprime.sim_ancestry(2, random_seed=2)
     segments = ts.ibd_segments(store_segments=True)
     assert repr(segments).startswith("IdentitySegments({")
     for segment_list in segments.values():
         assert repr(segment_list).startswith("IdentitySegmentList([")
Beispiel #30
0
 def test_ploidy(self):
     """The number of sample nodes must be the ploidy times the sample count."""
     sample_count = 10
     for ploidy in (1, 2, 3, 4):
         ts = msprime.sim_ancestry(sample_count, ploidy=ploidy)
         self.assertEqual(ts.num_samples, ploidy * sample_count)
         # Everything else keeps its default shape.
         self.assertEqual(ts.num_trees, 1)
         self.assertEqual(ts.num_sites, 0)
         self.assertEqual(ts.sequence_length, 1)