def test_sweep_model_change_time_complete(self):
    """Check root counts for a short sweep with and without a Hudson phase.

    The same sweep model is simulated twice: once followed by the "hudson"
    model, where every tree is expected to have a single root, and once on
    its own, where at least one tree is expected to keep several roots.
    """
    sweep_model = msprime.SweepGenicSelection(
        position=5,
        start_frequency=0.69,
        end_frequency=0.72,
        s=0.1,
        dt=1e-6,
    )

    # Sweep followed by Hudson to finish up the coalescent.
    finished = msprime.sim_ancestry(
        10,
        population_size=1000,
        sequence_length=10,
        recombination_rate=2,
        model=[sweep_model, "hudson"],
        random_seed=2,
    )
    finished_roots = [tree.num_roots for tree in finished.trees()]
    assert all(count == 1 for count in finished_roots)

    # The sweep alone, on a continuous genome.
    sweep_only = msprime.sim_ancestry(
        10,
        population_size=1000,
        recombination_rate=2,
        model=sweep_model,
        random_seed=2,
        sequence_length=10,
        discrete_genome=False,
    )
    sweep_only_roots = [tree.num_roots for tree in sweep_only.trees()]
    assert any(count > 1 for count in sweep_only_roots)
def test_random_seed(self):
    """Equal seeds give equal tree sequences; a different seed does not."""
    reference = msprime.sim_ancestry(10, random_seed=1)

    same_seed = msprime.sim_ancestry(10, random_seed=1)
    self.assertTrue(tree_sequences_equal(reference, same_seed))

    other_seed = msprime.sim_ancestry(10, random_seed=2)
    self.assertFalse(tree_sequences_equal(reference, other_seed))
def test_sweep_coalescence_same_seed(self):
    """Two sweep simulations with the same seed must be equal."""
    sweep = msprime.SweepGenicSelection(
        position=0.5,
        start_frequency=0.6,
        end_frequency=0.7,
        s=0.1,
        dt=1e-6,
    )
    first = msprime.sim_ancestry(5, model=sweep, random_seed=2)
    second = msprime.sim_ancestry(5, model=sweep, random_seed=2)
    # Provenance is excluded from the comparison.
    assert first.equals(second, ignore_provenance=True)
def test_msprime(selenium):
    """Round-trip a simulated tree sequence through a file and compare tables.

    A tree sequence is simulated, dumped to ``/tmp/msprime.trees``, loaded
    back with tskit and compared (ignoring provenance) against a second
    simulation that uses the same seed.
    """
    # The imports are deliberately local to the function body.
    import msprime
    import tskit

    # basic test
    simulated = msprime.sim_ancestry(10, random_seed=42)
    simulated.dump("/tmp/msprime.trees")
    reloaded = tskit.load("/tmp/msprime.trees")

    resimulated = msprime.sim_ancestry(10, random_seed=42)
    reloaded.tables.assert_equals(resimulated.tables, ignore_provenance=True)
def _run_long_sequence_length_gene_conversion(self):
    """Run one gene-conversion simulation over a long (1e8) sequence.

    The result is discarded; the method only exercises the simulation.

    The keyword arguments use the ``sim_ancestry`` names (``samples``,
    ``sequence_length``, ``population_size``). The previous spelling
    (``sample_size``, ``length``, ``Ne``) belongs to the legacy
    ``msprime.simulate`` interface.
    NOTE(review): ``samples=100`` counts individuals, so the number of
    sample nodes depends on the default ploidy of the installed msprime —
    confirm this matches the intended sample size.
    """
    msprime.sim_ancestry(
        samples=100,
        sequence_length=1e8,
        population_size=10**4,
        gene_conversion_rate=1e-8,
        # 100Kb tract length.
        gene_conversion_tract_length=100 * 1e3,
        random_seed=43,
    )
def test_population_size(self):
    """Check the default population size and its effect on node times."""
    unit_size = msprime.sim_ancestry(10, population_size=1, random_seed=2)

    # Leaving population_size out is expected to behave like population_size=1.
    default_size = msprime.sim_ancestry(10, random_seed=2)
    self.assertTrue(tree_sequences_equal(unit_size, default_size))

    # A larger population only rescales node times: edges stay the same and
    # times are multiplied by the population size.
    scaled = msprime.sim_ancestry(10, population_size=100, random_seed=2)
    self.assertEqual(unit_size.tables.edges, scaled.tables.edges)
    times_agree = np.allclose(
        100 * unit_size.tables.nodes.time,
        scaled.tables.nodes.time,
    )
    self.assertTrue(times_agree)
def test_model_end_broken(self):
    """A sweep stopped early by ``end_time`` must raise a RuntimeError.

    This checks that the non-reentrant nature of sweeps is detected.
    """
    sweep = msprime.SweepGenicSelection(
        position=0.5,
        start_frequency=0.1,
        end_frequency=0.9,
        s=0.01,
        dt=0.01,
    )
    expected_message = "does not support interruption"
    with pytest.raises(RuntimeError, match=expected_message):
        msprime.sim_ancestry(10, model=sweep, end_time=0.0001)
def test_hudson_time_scale(self):
    """Compare n * ploidy default-ploidy samples with n samples of that ploidy.

    For each ploidy the two simulations share a seed. Their edges must be
    equal, and node times of the explicit-ploidy run must equal the other
    run's node times multiplied by the ploidy.
    """
    n = 10
    seed = 1234
    for ploidy in (1, 2, 3, 7):
        # Per the original comment, the default ploidy here is 1.
        default_ploidy_ts = msprime.sim_ancestry(n * ploidy, random_seed=seed)
        explicit_ploidy_ts = msprime.sim_ancestry(
            n, ploidy=ploidy, random_seed=seed
        )
        default_tables = default_ploidy_ts.tables
        explicit_tables = explicit_ploidy_ts.tables

        times_agree = np.allclose(
            default_tables.nodes.time * ploidy,
            explicit_tables.nodes.time,
        )
        self.assertTrue(times_agree)
        self.assertEqual(default_tables.edges, explicit_tables.edges)
def test_model(self):
    """The default model matches "hudson" and differs from "dtwf"."""
    default_model = msprime.sim_ancestry(10, population_size=100, random_seed=2)

    hudson = msprime.sim_ancestry(
        10,
        population_size=100,
        model="hudson",
        random_seed=2,
    )
    self.assertTrue(tree_sequences_equal(default_model, hudson))

    dtwf = msprime.sim_ancestry(
        10,
        population_size=100,
        model="dtwf",
        random_seed=2,
    )
    self.assertFalse(tree_sequences_equal(default_model, dtwf))
def test_4_species_run(self):
    """Simulate under a four-species demography parsed from a Newick tree.

    Samples are requested three ways: by population index, by population
    name, and by name with the keys in a different order.
    """
    newick = "(((human:5.6,chimpanzee:5.6):3.0,gorilla:8.6):9.4,orangutan:18.0)"
    spec = species_trees.parse_species_tree(
        newick,
        time_units="myr",
        initial_size=10000,
        generation_time=20,
    )

    # Take one sample from each of populations 0..3.
    one_per_population = dict.fromkeys(range(4), 1)
    ts = msprime.sim_ancestry(
        samples=one_per_population, demography=spec, ploidy=1
    )
    assert ts.num_trees == 1
    assert ts.num_samples == 4
    assert ts.num_populations == 7
    for index, node_id in enumerate(ts.samples()):
        assert ts.node(node_id).population == index

    populations = list(ts.populations())
    expected_names = [
        "human",
        "chimpanzee",
        "gorilla",
        "orangutan",
        "pop_4",
        "pop_5",
        "pop_6",
    ]
    for index, expected_name in enumerate(expected_names):
        assert populations[index].metadata["name"] == expected_name

    # Use the population names to get the samples: the first 8 sample nodes
    # must be in population 0 and the remaining ones in population 2.
    by_name = {"human": 4, "gorilla": 2}
    ts = msprime.sim_ancestry(samples=by_name, demography=spec)
    assert ts.num_trees == 1
    assert ts.num_samples == 12
    for index, node_id in enumerate(ts.samples()):
        expected_population = 2 if index >= 8 else 0
        assert ts.node(node_id).population == expected_population

    # The order of the keys is respected: gorilla samples now come first.
    reversed_order = {"gorilla": 2, "human": 4}
    ts = msprime.sim_ancestry(samples=reversed_order, demography=spec)
    assert ts.num_trees == 1
    assert ts.num_samples == 12
    for index, node_id in enumerate(ts.samples()):
        expected_population = 0 if index >= 4 else 2
        assert ts.node(node_id).population == expected_population
def test_incorrect_num_labels(self):
    """A sweep run with ``num_labels`` of 1, 3 or 10 raises a LibraryError."""
    sweep = msprime.SweepGenicSelection(
        position=0.5,
        start_frequency=0.1,
        end_frequency=0.9,
        s=0.01,
        dt=0.01,
    )
    for label_count in (1, 3, 10):
        # Not the best error, but this shouldn't be exposed to the user anyway.
        with pytest.raises(
            _msprime.LibraryError, match="configuration is not supported"
        ):
            msprime.sim_ancestry(10, model=sweep, num_labels=label_count)
def finish_simulation(self, from_ts, recombination_rate=0, seed=1):
    """Continue a simulation from ``from_ts`` and return the result.

    ``from_ts`` is passed as the initial state, the start time is fixed at 1,
    and ``recombination_rate`` and ``seed`` are forwarded to
    ``msprime.sim_ancestry``.
    """
    options = {
        "initial_state": from_ts,
        "start_time": 1,
        "recombination_rate": recombination_rate,
        "random_seed": seed,
    }
    return msprime.sim_ancestry(**options)
def test_defaults(self):
    """Check basic properties of a simulation run with default arguments."""
    # NOTE(review): expecting exactly n sample nodes from sim_ancestry(n)
    # implies a default ploidy of 1 — confirm against the msprime version
    # in use.
    n = 10
    ts = msprime.sim_ancestry(n)
    expectations = [
        (ts.num_samples, n),
        (ts.num_trees, 1),
        (ts.num_sites, 0),
        (ts.sequence_length, 1),
    ]
    for observed, expected in expectations:
        self.assertEqual(observed, expected)
def test_encode_simulation_models(self):
    """A list of model objects is recorded in provenance with its durations."""
    models = [
        msprime.StandardCoalescent(duration=10),
        msprime.DiscreteTimeWrightFisher(duration=10),
        msprime.SmcApproxCoalescent(duration=10),
        msprime.StandardCoalescent(),
    ]
    ts = msprime.sim_ancestry(10, model=models, random_seed=1234)
    decoded = self.decode(ts.provenance(0).record)
    recorded_models = decoded.parameters.model

    # (class path, duration) expected at each position of the recorded list.
    expected = [
        ("msprime.ancestry.StandardCoalescent", 10),
        ("msprime.ancestry.DiscreteTimeWrightFisher", 10),
        ("msprime.ancestry.SmcApproxCoalescent", 10),
        ("msprime.ancestry.StandardCoalescent", None),
    ]
    for index, (class_path, duration) in enumerate(expected):
        assert recorded_models[index] == {
            "__class__": class_path,
            "duration": duration,
        }
def test_many_sweeps_regular_times_model_change(self):
    """Alternate ten short Hudson phases and sweeps, then finish with Hudson.

    Every tree of the resulting tree sequence must have a single root.
    """
    models = []
    for position in range(10):
        # Start each sweep after 0.01 generations of Hudson
        models.append(msprime.StandardCoalescent(duration=0.01))
        models.append(
            msprime.SweepGenicSelection(
                position=position,
                start_frequency=0.69,
                end_frequency=0.7,
                s=0.1,
                dt=1e-6,
            )
        )
    # Complete the simulation with Hudson
    models.append("hudson")

    ts = msprime.sim_ancestry(
        3,
        population_size=1000,
        sequence_length=10,
        recombination_rate=0.2,
        model=models,
        random_seed=2,
    )
    root_counts = [tree.num_roots for tree in ts.trees()]
    assert all(count == 1 for count in root_counts)
def run_arg_sim():
    """Simulate full ARGs for 20 sequence lengths and tabulate their overhead.

    For each length (0.1 to 5 megabases) this records the fraction of nodes
    whose flags exceed 1 and the ratio of the simplified tables' size to the
    full ARG's size. Each row is printed as it is computed, then the table
    is printed and written to ``data/arg.csv``.
    """
    lengths = []
    size_ratios = []
    arg_fractions = []
    for megabases in np.linspace(0.1, 5, 20):
        seq_length = int(megabases * 1_000_000)
        full_arg = msprime.sim_ancestry(
            100,
            population_size=10_000,
            sequence_length=seq_length,
            recombination_rate=1e-8,
            random_seed=42,
            record_full_arg=True,
        )
        # Samples have flags == 1 and ordinary coalescent nodes have
        # flags == 0. So, anything > 1 is an ARG node.
        is_arg_node = full_arg.tables.nodes.flags > 1
        lengths.append(seq_length)
        arg_fraction = np.sum(is_arg_node) / full_arg.num_nodes

        simplified = full_arg.simplify()
        size_ratio = simplified.tables.nbytes / full_arg.nbytes
        size_ratios.append(size_ratio)
        arg_fractions.append(arg_fraction)
        print(seq_length, arg_fraction, size_ratio)

    df = pd.DataFrame(
        {"L": lengths, "size_ratio": size_ratios, "arg_nodes": arg_fractions}
    )
    print(df)
    df.to_csv("data/arg.csv")
def test_generate_nucleotides_keep(self):
    """Check that ``generate_nucleotides`` keeps nucleotides already assigned.

    Type-1 mutations get nucleotides first. Type-2 mutations are then added
    and nucleotides generated again with the default ``keep``. Every type-1
    mutation must retain its nucleotide and every other mutation must get
    one of 0..3.
    """

    def slim_ids_with_metadata(mutation):
        # Pair each comma-separated id in the derived state with the
        # matching entry of the mutation's metadata list.
        return zip(
            mutation.derived_state.split(","),
            mutation.metadata['mutation_list'],
        )

    ts = msprime.sim_ancestry(4, sequence_length=10, population_size=10)
    ts = pyslim.annotate_defaults(ts, model_type='nonWF', slim_generation=1)
    first_model = msprime.SLiMMutationModel(type=1)
    mts1 = msprime.sim_mutations(
        ts, model=first_model, rate=0.1, random_seed=23
    )
    mts1.dump("out.trees")
    nts1 = pyslim.generate_nucleotides(mts1, seed=10, keep=False)
    assert nts1.num_mutations > 0
    self.verify_generate_nucleotides(nts1, check_transitions=False)

    second_model = msprime.SLiMMutationModel(
        type=2,
        next_id=nts1.num_mutations,
    )
    mts2 = msprime.sim_mutations(
        nts1, model=second_model, rate=0.1, random_seed=24
    )
    # keep defaults to True
    nts2 = pyslim.generate_nucleotides(mts2, seed=12)
    assert nts2.num_mutations > nts1.num_mutations

    # Nucleotide recorded for each id after the first round.
    first_round = {
        slim_id: entry['nucleotide']
        for mutation in nts1.mutations()
        for slim_id, entry in slim_ids_with_metadata(mutation)
    }
    for mutation in nts2.mutations():
        for slim_id, entry in slim_ids_with_metadata(mutation):
            if entry['mutation_type'] == 1:
                assert slim_id in first_round
                assert first_round[slim_id] == entry['nucleotide']
            else:
                assert entry['nucleotide'] in [0, 1, 2, 3]

    nts3 = pyslim.generate_nucleotides(mts2, keep=False, seed=15)
    self.verify_generate_nucleotides(nts3, check_transitions=False)
def test_wf_hudson_different_specifications(self):
    """Three ways of specifying a DTWF-then-Hudson simulation should agree.

    The new interface (a model list with a duration) is compared to the
    legacy ``simulate`` interface using a ``SimulationModelChange`` event,
    given both with and without an explicit "hudson" model.
    """
    Ne = 100
    t = 100
    new_style = msprime.sim_ancestry(
        samples=5,
        population_size=Ne,
        model=[msprime.DiscreteTimeWrightFisher(duration=t), "hudson"],
        recombination_rate=0.1,
        sequence_length=1,
        discrete_genome=False,
        random_seed=2,
    )

    # Keyword arguments shared by both legacy calls.
    legacy_options = {
        "sample_size": 10,
        "recombination_rate": 0.1,
        "Ne": Ne,
        "model": "dtwf",
        "random_seed": 2,
    }
    legacy_explicit = msprime.simulate(
        demographic_events=[msprime.SimulationModelChange(t, "hudson")],
        **legacy_options,
    )
    legacy_implicit = msprime.simulate(
        demographic_events=[msprime.SimulationModelChange(t)],
        **legacy_options,
    )

    # Not worth trying to puzzle out the slight differences in tables
    # between the old and new form. The edges are the same, good enough.
    assert new_style.tables.edges == legacy_explicit.tables.edges
    assert legacy_explicit.equals(legacy_implicit, ignore_provenance=True)
def test_current_ts(self):
    """Parsing a ``sim_mutations`` provenance yields the supplied tree sequence."""
    ancestry_ts = msprime.sim_ancestry(5, random_seed=1)
    mutated_ts = msprime.sim_mutations(ancestry_ts)

    # Provenance record 1 is the one parsed here as the sim_mutations call.
    record = mutated_ts.provenance(1)
    command, prov = msprime.provenance.parse_provenance(record, ancestry_ts)

    assert command == "sim_mutations"
    assert prov["tree_sequence"] == ancestry_ts
def run_msprime(*, sample_size, L, gc_rate, gc_tract_length, ret_breakpoints=True):
    """Run a monoploid gene-conversion simulation and report its tree count.

    Returns the number of trees. When ``ret_breakpoints`` is true, a second
    simulation is run with the same arguments and a
    ``(num_trees, num_breakpoints)`` tuple is returned instead. No random
    seed is passed to either run.
    """
    sim_options = {
        "samples": sample_size,
        "sequence_length": L,
        "ploidy": 1,
        "gene_conversion_rate": gc_rate,
        "gene_conversion_tract_length": gc_tract_length,
    }
    num_trees = msprime.sim_ancestry(**sim_options).num_trees
    if not ret_breakpoints:
        return num_trees

    # We use an internal msprime API here because we want to get at the
    # number of breakpoints, not the distinct trees.
    simulator = msprime.ancestry._parse_sim_ancestry(**sim_options)
    simulator.run()
    return num_trees, simulator.num_breakpoints
def test_all_fields(self):
    """Verify a tree sequence in which tables carry JSON metadata.

    A three-population split model is simulated with migrations recorded
    and mutations added. Every table except provenances and edges then gets
    a JSON schema and one metadata entry per row, and the top-level
    metadata is set, before the result is passed to ``self.verify``.
    """
    demography = msprime.Demography()
    for pop_name, pop_size in (("A", 10_000), ("B", 5_000), ("C", 1_000)):
        demography.add_population(name=pop_name, initial_size=pop_size)
    demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C")

    ts = msprime.sim_ancestry(
        samples={"A": 1, "B": 1},
        demography=demography,
        random_seed=42,
        record_migrations=True,
    )
    ts = msprime.sim_mutations(ts, rate=1, random_seed=42)

    tables = ts.dump_tables()
    for name, table in tables.table_name_map.items():
        if name in ["provenances", "edges"]:
            continue
        table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        row_metadata = [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))]
        metadata, metadata_offset = tskit.pack_strings(row_metadata)
        # Rewrite the table with its existing columns plus the new metadata.
        columns = dict(table.asdict())
        columns["metadata"] = metadata
        columns["metadata_offset"] = metadata_offset
        table.set_columns(**columns)

    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"
    self.verify(tables.tree_sequence())
def test_upgrade_provenance(self):
    """Upgrade each old SLiM provenance example and check the new record.

    Each example is appended to a fresh copy of a simulated tree sequence's
    tables and upgraded. The new record (index 2) must be SLiM provenance at
    version "0.4" and keep the original model type and generation.
    """
    ts = msprime.sim_ancestry(10)
    for record_text in old_provenance_examples:
        record = json.loads(record_text)
        record_json = json.dumps(record)
        prov = tskit.Provenance(
            id=0,
            timestamp='2018-08-25T14:59:13',
            record=record_json,
        )
        is_slim, version = pyslim.slim_provenance_version(prov)
        assert is_slim
        # Records with a top-level 'file_version' key are expected to be
        # reported as "0.1"; others report the version stored under 'slim'.
        if 'file_version' in record:
            expected_version = "0.1"
        else:
            expected_version = record['slim']['file_version']
        assert version == expected_version

        tables = ts.dump_tables()
        tables.provenances.add_row(record_json)
        pyslim.upgrade_slim_provenance(tables)  # modifies the tables
        new_ts = tables.tree_sequence()
        assert new_ts.num_provenances == 3

        upgraded = new_ts.provenance(2)
        is_slim, version = pyslim.slim_provenance_version(upgraded)
        assert is_slim
        assert version == "0.4"

        new_record = json.loads(new_ts.provenance(2).record)
        if 'model_type' in record:
            # Old layout: model type and generation sit at the top level.
            assert record['model_type'] == new_record['parameters']['model_type']
            assert record['generation'] == new_record['slim']["generation"]
        else:
            # Newer layout: same nesting as the upgraded record.
            assert (
                record['parameters']['model_type']
                == new_record['parameters']['model_type']
            )
            assert record['slim']['generation'] == new_record['slim']["generation"]
def test_many_populations(self, helper_functions, tmp_path):
    """Add five populations to a simulated tree sequence and restart SLiM.

    Each added population gets one new individual with two nodes. After
    annotation every individual must be flagged alive, and the result is
    handed to ``helper_functions.run_slim_restart``.
    """
    # test we can add more than one population
    base_ts = msprime.sim_ancestry(
        5, population_size=10, sequence_length=100, random_seed=455
    )
    tables = base_ts.dump_tables()
    for k in range(5):
        pop_metadata = pyslim.default_slim_metadata('population')
        pop_metadata['name'] = f"new_pop_num_{k}"
        pop_metadata['description'] = f"the {k}-th added pop"
        tables.populations.add_row(metadata=pop_metadata)
        individual_id = tables.individuals.add_row()
        for _ in range(2):
            tables.nodes.add_row(
                flags=1,
                time=0.0,
                individual=individual_id,
                population=k,
            )

    annotated = pyslim.annotate_defaults(
        tables.tree_sequence(), model_type='WF', slim_generation=1
    )
    for individual in annotated.individuals():
        assert individual.flags == pyslim.INDIVIDUAL_ALIVE

    # The restarted tree sequence is not inspected further here.
    restarted = helper_functions.run_slim_restart(
        annotated,
        "restart_WF.slim",
        tmp_path,
        WF=True,
    )
def great_apes(sample_size, initial_size):
    """Simulate ancestry for four great-ape species and return the result.

    The demography is parsed from a fixed Newick species tree whose branch
    lengths are in millions of years, with a generation time of 28.
    ``sample_size`` individuals are sampled from each of populations 0..3
    and ``initial_size`` is passed to the parser as the population size.
    A one-line summary of sample and tree counts is printed.

    The branch-length unit is passed as ``time_units``, the keyword used by
    the other ``parse_species_tree`` call in this file, rather than as
    ``branch_length_units``.
    NOTE(review): confirm that the installed msprime accepts ``time_units``
    here.
    """
    spec = msprime.species_trees.parse_species_tree(
        "(((human:5.6,chimp:5.6):3.0,gorilla:8.6):9.4,orangutan:18.0)",
        initial_size=initial_size,
        time_units="myr",
        generation_time=28,
    )
    species_ts: tskit.TreeSequence = msprime.sim_ancestry(
        sequence_length=1e6,
        samples={j: sample_size for j in range(4)},
        demography=spec,
        recombination_rate=1e-8,
        random_seed=1,
    )
    print(
        species_ts.num_samples / 1e3,
        "thousand genomes, ",
        round(species_ts.num_trees / 1e3),
        "thousand trees",
    )
    return species_ts
def test_sim_ancestry(self):
    """The first provenance record describes the ``sim_ancestry`` call."""
    ts = msprime.sim_ancestry(5, random_seed=1)
    decoded = self.decode(ts.provenance(0).record)
    parameters = decoded.parameters

    assert decoded.schema_version == "1.0.0"
    assert parameters.command == "sim_ancestry"
    assert parameters.random_seed == 1
def test_sim_ancestry(self):
    """The first provenance record describes the ``sim_ancestry`` call."""
    ts = msprime.sim_ancestry(5, random_seed=1)
    decoded = self.decode(ts.provenance(0).record)

    checks = [
        (decoded.schema_version, "1.0.0"),
        (decoded.parameters.command, "sim_ancestry"),
        (decoded.parameters.random_seed, 1),
    ]
    for observed, expected in checks:
        self.assertEqual(observed, expected)
def simulate_ts(
    sample_size: int,
    length: int = 100,
    mutation_rate: float = 0.05,
    random_seed: int = 42,
) -> tskit.TreeSequence:
    """
    Simulate ancestry with recombination using msprime, add mutations, and
    return the resulting tskit TreeSequence.

    The ancestry is simulated with ploidy=1 to minimise the update from an
    older version; simulating under a range of ploidy values remains to be
    done. The same ``random_seed`` is used for both the ancestry and the
    mutation simulation.
    """
    ancestry_options = {
        "ploidy": 1,
        "recombination_rate": 0.01,
        "sequence_length": length,
        "random_seed": random_seed,
    }
    ancestry_ts = msprime.sim_ancestry(sample_size, **ancestry_options)
    # Make sure we generate some data that's not all from the same tree
    assert ancestry_ts.num_trees > 1
    mutated_ts = msprime.sim_mutations(
        ancestry_ts, rate=mutation_rate, random_seed=random_seed
    )
    return mutated_ts
def test_repr_without_store_segments(self):
    """Check the repr of IBD results that were computed without segments.

    Both with ``store_pairs=True`` and with no arguments, the repr must
    start with the ``<tskit.tables.IdentitySegments`` prefix.
    """
    ts = msprime.sim_ancestry(2, random_seed=2)
    expected_prefix = "<tskit.tables.IdentitySegments"

    pairs_only = ts.ibd_segments(store_pairs=True)
    assert repr(pairs_only).startswith(expected_prefix)

    nothing_stored = ts.ibd_segments()
    assert repr(nothing_stored).startswith(expected_prefix)
def test_repr_store_segments(self):
    """Check the repr of an IBD result and its values when segments are stored."""
    ts = msprime.sim_ancestry(2, random_seed=2)
    segments = ts.ibd_segments(store_segments=True)

    assert repr(segments).startswith("IdentitySegments({")
    for segment_list in segments.values():
        assert repr(segment_list).startswith("IdentitySegmentList([")
def test_ploidy(self):
    """For ploidy k, n individuals give k * n samples in one unit-length tree."""
    n = 10
    for ploidy in (1, 2, 3, 4):
        ts = msprime.sim_ancestry(n, ploidy=ploidy)
        expectations = [
            (ts.num_samples, ploidy * n),
            (ts.num_trees, 1),
            (ts.num_sites, 0),
            (ts.sequence_length, 1),
        ]
        for observed, expected in expectations:
            self.assertEqual(observed, expected)