def test_example(self):
    """
    Attach a struct-codec string schema to the top-level metadata and to
    every metadata-capable table, then run the round-trip verification.
    """

    def pascal_string_schema(key):
        # "50p" is the struct codec's Pascal-string binary format.
        return tskit.MetadataSchema(
            {
                "codec": "struct",
                "type": "object",
                "properties": {key: {"type": "string", "binaryFormat": "50p"}},
            }
        )

    tables = get_example_tables()
    tables.metadata_schema = pascal_string_schema("top-level")
    tables.metadata = {"top-level": "top-level-metadata"}
    table_names = (
        "individuals",
        "nodes",
        "edges",
        "migrations",
        "sites",
        "mutations",
        "populations",
    )
    for name in table_names:
        current = getattr(tables, name)
        # packset_metadata takes raw bytes, so fill it before the schema
        # is installed on the table.
        current.packset_metadata(
            [f"{name}-{i}".encode() for i in range(current.num_rows)]
        )
        current.metadata_schema = pascal_string_schema(name)
    self.verify(tables)
def test_example(self, tables):
    """
    Install struct-codec string schemas on the collection and on each table
    that supports metadata, then verify the resulting collection.
    """

    def pascal_string_schema(key):
        # "50p" is the struct codec's Pascal-string binary format.
        return tskit.MetadataSchema(
            {
                "codec": "struct",
                "type": "object",
                "properties": {key: {"type": "string", "binaryFormat": "50p"}},
            }
        )

    tables.metadata_schema = pascal_string_schema("top-level")
    tables.metadata = {"top-level": "top-level-metadata"}
    for name in tskit.TABLE_NAMES:
        table = getattr(tables, name)
        # Not every table (e.g. provenances) has a metadata column.
        if hasattr(table, "metadata_schema"):
            # Raw bytes must be packed before the schema takes effect.
            table.packset_metadata(
                [f"{name}-{i}".encode() for i in range(table.num_rows)]
            )
            table.metadata_schema = pascal_string_schema(name)
    self.verify(tables)
def test_all_fields(self):
    """
    Build a tree sequence populated in every table (including migrations),
    attach JSON metadata to all metadata-capable tables except edges and
    provenances, and run the round-trip verification.
    """
    demography = msprime.Demography()
    demography.add_population(name="A", initial_size=10_000)
    demography.add_population(name="B", initial_size=5_000)
    demography.add_population(name="C", initial_size=1_000)
    demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C")
    ts = msprime.sim_ancestry(
        samples={"A": 1, "B": 1},
        demography=demography,
        random_seed=42,
        record_migrations=True,  # so the migration table is non-empty
    )
    ts = msprime.sim_mutations(ts, rate=1, random_seed=42)
    tables = ts.dump_tables()
    for name, table in tables.table_name_map.items():
        # provenances has no metadata column; edges left without metadata
        # here (presumably deliberate for this test — confirm with caller).
        if name not in ["provenances", "edges"]:
            table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
            # One JSON object per row, identifying its table and row index.
            metadatas = [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))]
            metadata, metadata_offset = tskit.pack_strings(metadatas)
            # Bulk-replace the metadata columns, keeping all other columns.
            table.set_columns(
                **{
                    **table.asdict(),
                    "metadata": metadata,
                    "metadata_offset": metadata_offset,
                }
            )
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"
    self.verify(tables.tree_sequence())
def full_ts():
    """
    Return a tree sequence that has data in all fields.

    Duplicated from tskit's conftest.py, as other test suites using this
    file will not have that fixture defined.
    """
    # Fix: the original had two consecutive string literals here — only the
    # first was the docstring, the second was a dead expression statement
    # (and contained the typo "duplcated"); they are merged above.
    n = 10
    t = 1
    population_configurations = [
        msprime.PopulationConfiguration(n // 2),
        msprime.PopulationConfiguration(n // 2),
        msprime.PopulationConfiguration(0),
    ]
    # Merge both sampled populations into the third at time t so the
    # migration table gets rows.
    demographic_events = [
        msprime.MassMigration(time=t, source=0, destination=2),
        msprime.MassMigration(time=t, source=1, destination=2),
    ]
    ts = msprime.simulate(
        population_configurations=population_configurations,
        demographic_events=demographic_events,
        random_seed=1,
        mutation_rate=1,
        record_migrations=True,
    )
    tables = ts.dump_tables()
    # TODO replace this with properly linked up individuals using sim_ancestry
    # once 1.0 is released.
    for j in range(n):
        tables.individuals.add_row(flags=j, location=(j, j), parents=(j - 1, j - 1))
    for name, table in tables.name_map.items():
        if name != "provenances":  # provenances has no metadata column
            table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
            # One metadata string per row, identifying table and row index.
            metadatas = [f"n_{name}_{u}" for u in range(len(table))]
            metadata, metadata_offset = tskit.pack_strings(metadatas)
            table.set_columns(
                **{
                    **table.asdict(),
                    "metadata": metadata,
                    "metadata_offset": metadata_offset,
                }
            )
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"
    # Add some more provenance so we have enough rows for the offset deletion test.
    for j in range(10):
        tables.provenances.add_row(timestamp="x" * j, record="y" * j)
    return tables.tree_sequence()
def full_ts():
    """
    A tree sequence with data in all fields - duplicated from tskit's
    conftest.py as other test suites using this file will not have that
    fixture defined.
    """
    demography = msprime.Demography()
    demography.add_population(initial_size=100, name="A")
    demography.add_population(initial_size=100, name="B")
    demography.add_population(initial_size=100, name="C")
    demography.add_population_split(time=10, ancestral="C", derived=["A", "B"])
    ts = msprime.sim_ancestry(
        {"A": 5, "B": 5},
        demography=demography,
        random_seed=1,
        sequence_length=10,
        record_migrations=True,  # so the migration table is non-empty
    )
    assert ts.num_migrations > 0
    assert ts.num_individuals > 0
    ts = msprime.sim_mutations(ts, rate=0.1, random_seed=2)
    assert ts.num_mutations > 0
    tables = ts.dump_tables()
    # Rebuild individuals with explicit locations and (null) parents.
    tables.individuals.clear()
    for ind in ts.individuals():
        tables.individuals.add_row(flags=0, location=[ind.id, ind.id], parents=[-1, -1])
    for name, table in tables.table_name_map.items():
        if name != "provenances":  # provenances has no metadata column
            table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
            # One metadata string per row, identifying table and row index.
            metadatas = [f"n_{name}_{u}" for u in range(len(table))]
            metadata, metadata_offset = tskit.pack_strings(metadatas)
            # Bulk-replace the metadata columns, keeping all other columns.
            table.set_columns(
                **{
                    **table.asdict(),
                    "metadata": metadata,
                    "metadata_offset": metadata_offset,
                }
            )
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = {"A": "Test metadata"}
    # Populate the reference sequence so that field is exercised too.
    tables.reference_sequence.data = "A" * int(tables.sequence_length)
    tables.reference_sequence.url = "https://example.com/sequence"
    tables.reference_sequence.metadata_schema = tskit.MetadataSchema.permissive_json()
    tables.reference_sequence.metadata = {"A": "Test metadata"}
    # Add some more provenance so we have enough rows for the offset deletion test.
    for j in range(10):
        tables.provenances.add_row(timestamp="x" * j, record="y" * j)
    return tables.tree_sequence()
def ts_fixture():
    """
    A tree sequence with data in all fields
    """
    demography = msprime.Demography()
    demography.add_population(name="A", initial_size=10_000)
    demography.add_population(name="B", initial_size=5_000)
    demography.add_population(name="C", initial_size=1_000)
    demography.add_population(name="D", initial_size=500)
    demography.add_population(name="E", initial_size=100)
    demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C")
    ts = msprime.sim_ancestry(
        samples={"A": 10, "B": 10},
        demography=demography,
        sequence_length=5,
        random_seed=42,
        record_migrations=True,  # so the migration table is non-empty
        record_provenance=True,
    )
    ts = msprime.sim_mutations(ts, rate=0.001, random_seed=42)
    tables = ts.dump_tables()
    # Add locations to individuals
    individuals_copy = tables.individuals.copy()
    tables.individuals.clear()
    for i, individual in enumerate(individuals_copy):
        tables.individuals.append(
            individual.replace(location=[i, i + 1], parents=[i - 1, i - 1])
        )
    for name, table in tables.name_map.items():
        if name != "provenances":  # provenances has no metadata column
            table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
            # One JSON object per row, identifying its table and row index.
            metadatas = [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))]
            metadata, metadata_offset = tskit.pack_strings(metadatas)
            # Bulk-replace the metadata columns, keeping all other columns.
            table.set_columns(
                **{
                    **table.asdict(),
                    "metadata": metadata,
                    "metadata_offset": metadata_offset,
                }
            )
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"
    # Add some more rows to provenance to have enough for testing.
    for _ in range(3):
        tables.provenances.add_row(record="A")
    return tables.tree_sequence()
def test_set_tree_sequence_metadata_keeps(self):
    """
    set_tree_sequence_metadata must merge SLiM keys into pre-existing
    top-level metadata/schema rather than overwriting them.
    """
    # make sure doesn't overwrite other stuff
    dummy_schema = tskit.MetadataSchema({
        'codec': 'json',
        'type': 'object',
        'properties': {
            'abc': {
                'type': 'string'
            }
        }
    })
    dummy_metadata = {'abc': 'foo'}
    for ts in self.get_slim_examples():
        tables = ts.tables
        tables.metadata_schema = dummy_schema
        tables.metadata = dummy_metadata
        pyslim.set_tree_sequence_metadata(tables, "nonWF", 0)
        schema = tables.metadata_schema.schema
        # The pre-existing key must survive in both schema and metadata.
        for k in dummy_metadata:
            self.assertTrue(k in schema['properties'])
            self.assertTrue(k in tables.metadata)
            self.assertEqual(tables.metadata[k], dummy_metadata[k])
        # And the SLiM keys must have been added alongside it.
        self.validate_slim_metadata(tables)
        self.assertEqual(tables.metadata['SLiM']['model_type'], "nonWF")
        self.assertEqual(tables.metadata['SLiM']['generation'], 0)
        # One example is sufficient for this check.
        break
def test_recover_metadata(self, recipe):
    """
    Wipe the top-level metadata and schema, then check that load_tables
    restores the metadata (recovered from provenance).
    """
    # msprime <=0.7.5 discards metadata, but we can recover it from provenance
    original = recipe["ts"]
    stripped = original.dump_tables()
    stripped.metadata_schema = tskit.MetadataSchema(None)
    stripped.metadata = b''
    recovered = pyslim.load_tables(stripped)
    assert recovered.metadata == original.metadata
def test_bad_metadata(self):
    """
    A tree sequence whose top-level metadata is an empty JSON object
    (missing the SLiM keys) must be rejected by SlimTreeSequence.
    """
    tables = self.clean_example().copy()
    # Replace the metadata with a bare JSON object, dropping the SLiM keys.
    tables.metadata_schema = tskit.MetadataSchema({"type": "object", "codec": "json"})
    tables.metadata = {}
    broken = tables.tree_sequence()
    with pytest.raises(ValueError):
        _ = pyslim.SlimTreeSequence(broken)
def test_set_tree_sequence_metadata_errors(self):
    """
    set_tree_sequence_metadata must refuse to run when the tables carry
    raw (schema-less) top-level metadata it would otherwise clobber.
    """
    for ts in self.get_slim_examples():
        t = ts.tables
        # Dropping the schema turns the existing metadata into raw bytes.
        t.metadata_schema = tskit.MetadataSchema(None)
        # There must be some pre-existing metadata to trigger the error.
        self.assertGreater(len(t.metadata), 0)
        with self.assertRaises(ValueError):
            pyslim.set_tree_sequence_metadata(t, "nonWF", 0)
        # One example is sufficient.
        break
def test_recover_metadata(self):
    """
    Wipe the top-level metadata and schema, then check that load_tables
    restores the metadata (recovered from provenance).
    """
    # msprime <=0.7.5 discards metadata, but we can recover it from provenance
    for ts in self.get_slim_examples():
        stripped = ts.tables
        stripped.metadata_schema = tskit.MetadataSchema(None)
        stripped.metadata = b''
        recovered = pyslim.load_tables(stripped)
        self.assertEqual(recovered.metadata, ts.metadata)
def verify_0_3_3(self, ts):
    """
    Check a tree sequence round-tripped through file version 0.3.3: every
    table is populated, metadata-capable tables carry the JSON schema and
    per-row metadata strings, and the collection is indexed.
    """
    json_schema = tskit.MetadataSchema({"codec": "json"})
    for name in tskit.TABLE_NAMES:
        current = getattr(ts.tables, name)
        assert current.num_rows > 0
        # Some tables (e.g. provenances) have no metadata column.
        if hasattr(current, "metadata_schema"):
            assert current.metadata_schema == json_schema
            assert current[2].metadata == f"n_{name}_2"
    assert ts.tables.has_index()
def verify_mutation_decoding(self, t):
    """
    Check that the deprecated decode_mutation, applied to raw metadata
    bytes, reproduces the schema-decoded mutation metadata of table `t`.
    """
    raw_table = t.copy()
    # A null schema makes .metadata come back as raw bytes.
    raw_table.metadata_schema = tskit.MetadataSchema(None)
    for decoded_row, raw_row in zip(t, raw_table):
        expected = decoded_row.metadata
        with self.assertWarns(DeprecationWarning):
            legacy = pyslim.decode_mutation(raw_row.metadata)
        self.assertEqual(expected, {"mutation_list": [m.asdict() for m in legacy]})
def ts_fixture():
    """
    A tree sequence with data in all fields
    """
    n = 10
    t = 1
    population_configurations = [
        msprime.PopulationConfiguration(n // 2),
        msprime.PopulationConfiguration(n // 2),
        msprime.PopulationConfiguration(0),
    ]
    # Merge both sampled populations into the third at time t so the
    # migration table gets rows.
    demographic_events = [
        msprime.MassMigration(time=t, source=0, destination=2),
        msprime.MassMigration(time=t, source=1, destination=2),
    ]
    ts = msprime.simulate(
        population_configurations=population_configurations,
        demographic_events=demographic_events,
        random_seed=1,
        mutation_rate=1,
        record_migrations=True,
    )
    tables = ts.dump_tables()
    # TODO replace this with properly linked up individuals using sim_ancestry
    # once 1.0 is released.
    for j in range(n):
        tables.individuals.add_row(flags=j, location=(j, j), parents=(j - 1, j - 1))
    for name, table in tables.name_map.items():
        if name != "provenances":  # provenances has no metadata column
            table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
            # One metadata string per row, identifying table and row index.
            metadatas = [f"n_{name}_{u}" for u in range(len(table))]
            metadata, metadata_offset = tskit.pack_strings(metadatas)
            # Bulk-replace the metadata columns, keeping all other columns.
            table.set_columns(
                **{
                    **table.asdict(),
                    "metadata": metadata,
                    "metadata_offset": metadata_offset,
                })
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"
    return tables.tree_sequence()
def verify_decoding(self, t, decoder):
    """
    Check that the legacy `decoder`, applied to raw metadata bytes,
    reproduces the schema-decoded metadata of table `t` row by row.
    """
    raw_table = t.copy()
    # A null schema makes .metadata come back as raw bytes.
    raw_table.metadata_schema = tskit.MetadataSchema(None)
    for decoded_row, raw_row in zip(t, raw_table):
        expected = decoded_row.metadata
        with self.assertWarns(FutureWarning):
            legacy = decoder(raw_row.metadata)
        if expected is None:
            self.assertTrue(legacy is None)
        else:
            self.assertEqual(expected, legacy.asdict())
def test_small_msprime_top_level_metadata(self):
    """
    Attach JSON top-level metadata to a small simulated tree sequence and
    run the round-trip verification.
    """
    ts = msprime.simulate(10, recombination_rate=2, mutation_rate=2, random_seed=2)
    # The simulation must be non-trivial for the test to be meaningful.
    self.assertGreater(ts.num_sites, 2)
    self.assertGreater(ts.num_trees, 2)
    tables = ts.dump_tables()
    tables.metadata_schema = tskit.MetadataSchema(
        {
            "codec": "json",
            "properties": {"my_int": {"type": "integer"}},
        }
    )
    tables.metadata = {"my_int": 1234}
    self.verify(tables.tree_sequence())
def verify_0_3_3(self, ts):
    """
    Check a tree sequence round-tripped through file version 0.3.3: the
    listed tables are populated, carry the JSON schema and per-row
    metadata strings, and the collection is indexed.
    """
    json_schema = tskit.MetadataSchema({"codec": "json"})
    table_names = (
        "populations",
        "individuals",
        "nodes",
        "edges",
        "sites",
        "migrations",
        "mutations",
    )
    for name in table_names:
        current = getattr(ts.tables, name)
        assert current.num_rows > 0
        assert current.metadata_schema == json_schema
        assert current[2].metadata == f"n_{name}_2"
    assert ts.tables.has_index()
def test_tskit(selenium):
    """Smoke-test the tskit package: build, dump/load, and draw."""
    import tskit

    # basic test: a sample node under a parent, covering [0, 2)
    tables = tskit.TableCollection(2)
    assert tables.sequence_length == 2
    tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE)
    tables.nodes.add_row(time=1)
    tables.edges.add_row(left=0, right=1, parent=1, child=0)
    tables.edges.add_row(left=1, right=2, parent=1, child=0)
    treeseq = tables.tree_sequence()
    assert treeseq.num_nodes == 2
    # save and load round-trip
    treeseq.dump("/tmp/tskit.trees")
    reloaded = tskit.load("/tmp/tskit.trees")
    treeseq.tables.assert_equals(reloaded.tables)
    # test dependency related functions
    treeseq.draw_svg(size=(200, 200))
    tskit.MetadataSchema({"codec": "json"})
def add_default_schemas(ts):
    """
    Returns a copy of the specified tree sequence with permissive JSON
    schemas on the tables that are used for round-tripping data in tsinfer.
    """
    tables = ts.dump_tables()
    schema = tskit.MetadataSchema(tsinfer.permissive_json_schema())
    # Make sure we're not overwriting existing metadata. This will probably
    # fail when msprime 1.0 comes along, but we can fix it then.
    assert len(tables.metadata) == 0
    tables.metadata_schema = schema
    tables.metadata = {}
    # For each table: install the schema, check no metadata exists, then
    # fill every row with an empty JSON object (b"{}").
    tables.populations.metadata_schema = schema
    assert len(tables.populations.metadata) == 0
    tables.populations.packset_metadata([b"{}"] * ts.num_populations)
    tables.individuals.metadata_schema = schema
    assert len(tables.individuals.metadata) == 0
    tables.individuals.packset_metadata([b"{}"] * ts.num_individuals)
    tables.sites.metadata_schema = schema
    assert len(tables.sites.metadata) == 0
    tables.sites.packset_metadata([b"{}"] * ts.num_sites)
    return tables.tree_sequence()
def set_tree_sequence_metadata(tables, model_type, generation,
                               spatial_dimensionality='', spatial_periodicity='',
                               separate_sexes=False, nucleotide_based=False,
                               stage='late', file_version=None):
    """
    Install the SLiM top-level metadata (under the "SLiM" key) and schema on
    `tables`, preserving any other top-level keys already present.

    :param tables: A tskit TableCollection, modified in place.
    :param model_type: SLiM model type string (e.g. "nonWF").
    :param generation: SLiM generation counter.
    :param spatial_dimensionality: SLiM spatial dimensionality string.
    :param spatial_periodicity: SLiM spatial periodicity string.
    :param separate_sexes: Whether the SLiM model has separate sexes.
    :param nucleotide_based: Whether the SLiM model is nucleotide-based.
    :param stage: SLiM stage at which the tables were recorded.
    :param file_version: SLiM file version; defaults to slim_file_version.
    :raises ValueError: If the tables carry raw (schema-less) metadata,
        which we cannot safely merge into.
    """
    if file_version is None:
        file_version = slim_file_version
    # bytes metadata means there is no schema: only safe if it's empty.
    if isinstance(tables.metadata, bytes):
        if len(tables.metadata) > 0:
            raise ValueError(
                "Tree sequence has top-level metadata but no schema: this is a problem "
                "since pyslim is trying to add to the metadata.")
        schema_dict = slim_metadata_schemas['tree_sequence'].schema
        metadata_dict = {}
    else:
        # we need to keep other keys in the metadata (and schema) if there are any
        schema_dict = tables.metadata_schema.schema
        metadata_dict = tables.metadata
    assert (schema_dict['codec'] == 'json')
    assert (schema_dict['type'] == 'object')
    # Graft the SLiM property definition into the (possibly pre-existing) schema.
    schema_dict['properties']['SLiM'] = slim_metadata_schemas[
        'tree_sequence'].schema['properties']['SLiM']
    # The schema must be installed before assigning metadata that uses it.
    tables.metadata_schema = tskit.MetadataSchema(schema_dict)
    metadata_dict['SLiM'] = {
        "model_type": model_type,
        "generation": generation,
        "file_version": file_version,
        "spatial_dimensionality": spatial_dimensionality,
        "spatial_periodicity": spatial_periodicity,
        "separate_sexes": separate_sexes,
        "nucleotide_based": nucleotide_based,
        "stage": stage,
    }
    tables.metadata = metadata_dict
    _set_metadata_schemas(tables)
def test_set_tree_sequence_metadata_keeps(self, recipe):
    """
    set_tree_sequence_metadata must merge SLiM keys into pre-existing
    top-level metadata/schema rather than overwriting them, whether or
    not the pre-existing schema declares any properties.
    """
    # make sure doesn't overwrite other stuff
    for x in [{}, {'properties': {'abc': {'type': 'string'}}}]:
        schema_dict = {
            'codec': 'json',
            'type': 'object',
        }
        schema_dict.update(x)
        dummy_schema = tskit.MetadataSchema(schema_dict)
        dummy_metadata = {'abc': 'foo'}
        tables = recipe["ts"].dump_tables()
        tables.metadata_schema = dummy_schema
        tables.metadata = dummy_metadata
        pyslim.set_tree_sequence_metadata(tables, "nonWF", 0)
        schema = tables.metadata_schema.schema
        for k in dummy_metadata:
            # Only expect the key in the schema if we declared properties.
            if len(x) > 0:
                assert k in schema['properties']
            assert k in tables.metadata
            assert tables.metadata[k] == dummy_metadata[k]
        # The SLiM keys must have been added alongside the dummy key.
        self.validate_slim_metadata(tables)
        assert tables.metadata['SLiM']['model_type'] == "nonWF"
        assert tables.metadata['SLiM']['generation'] == 0
def assertTableCollectionsEqual(self, t1, t2,
                                skip_provenance=False, check_metadata_schema=True,
                                reordered_individuals=False):
    """
    Assert that two table collections (or tree sequences) are equal after
    simplifying both to their sample nodes, ordered by SLiM id.

    :param t1: A tskit TableCollection or TreeSequence.
    :param t2: A tskit TableCollection or TreeSequence.
    :param skip_provenance: If True, ignore provenance entirely; if -1,
        expect t2 to have exactly one extra provenance row (truncated
        away before comparison).
    :param check_metadata_schema: If True, require identical schemas on
        all tables; if False, compare metadata as raw values with the
        schemas zeroed out.
    :param reordered_individuals: If True, match individuals by their
        `pedigree_id` metadata rather than table order, then exclude the
        individual tables from the final row-by-row comparison.
    """
    if isinstance(t1, tskit.TreeSequence):
        t1 = t1.dump_tables()
    if isinstance(t2, tskit.TreeSequence):
        t2 = t2.dump_tables()
    # Collect (slim_id, node_index) for the sample nodes of each collection
    # and sort by slim_id so both are simplified in a comparable order.
    t1_samples = [(n.metadata['slim_id'], j) for j, n in enumerate(t1.nodes)
                  if (n.flags & tskit.NODE_IS_SAMPLE)]
    t1_samples.sort()
    t2_samples = [(n.metadata['slim_id'], j) for j, n in enumerate(t2.nodes)
                  if (n.flags & tskit.NODE_IS_SAMPLE)]
    t2_samples.sort()
    t1.simplify([j for (_, j) in t1_samples], record_provenance=False)
    t2.simplify([j for (_, j) in t2_samples], record_provenance=False)
    if skip_provenance is True:
        t1.provenances.clear()
        t2.provenances.clear()
    if skip_provenance == -1:
        # t2 is expected to have exactly one extra provenance row.
        assert t1.provenances.num_rows + 1 == t2.provenances.num_rows
        t2.provenances.truncate(t1.provenances.num_rows)
        assert t1.provenances.num_rows == t2.provenances.num_rows
    if check_metadata_schema:
        # this is redundant now, but will help diagnose if things go wrong
        assert t1.metadata_schema.schema == t2.metadata_schema.schema
        assert t1.populations.metadata_schema.schema == t2.populations.metadata_schema.schema
        assert t1.individuals.metadata_schema.schema == t2.individuals.metadata_schema.schema
        assert t1.nodes.metadata_schema.schema == t2.nodes.metadata_schema.schema
        assert t1.edges.metadata_schema.schema == t2.edges.metadata_schema.schema
        assert t1.sites.metadata_schema.schema == t2.sites.metadata_schema.schema
        assert t1.mutations.metadata_schema.schema == t2.mutations.metadata_schema.schema
        assert t1.migrations.metadata_schema.schema == t2.migrations.metadata_schema.schema
    if not check_metadata_schema:
        # need to pull out metadata to compare as dicts before zeroing the schema
        m1 = t1.metadata
        m2 = t2.metadata
        ms = tskit.MetadataSchema(None)
        for t in (t1, t2):
            t.metadata_schema = ms
            t.populations.metadata_schema = ms
            t.individuals.metadata_schema = ms
            t.nodes.metadata_schema = ms
            t.edges.metadata_schema = ms
            t.sites.metadata_schema = ms
            t.mutations.metadata_schema = ms
            t.migrations.metadata_schema = ms
        t1.metadata = b''
        t2.metadata = b''
        assert m1 == m2
    if reordered_individuals:
        # Match individuals across the two collections by pedigree_id,
        # printing mismatches before asserting for easier diagnosis.
        ind1 = {i.metadata['pedigree_id']: j for j, i in enumerate(t1.individuals)}
        ind2 = {i.metadata['pedigree_id']: j for j, i in enumerate(t2.individuals)}
        for pid in ind1:
            if not pid in ind2:
                print("not in t2:", ind1[pid])
            assert pid in ind2
            if t1.individuals[ind1[pid]] != t2.individuals[ind2[pid]]:
                print("t1:", t1.individuals[ind1[pid]])
                print("t2:", t2.individuals[ind2[pid]])
            assert t1.individuals[ind1[pid]] == t2.individuals[ind2[pid]]
        for pid in ind2:
            if not pid in ind1:
                print("not in t1:", ind2[pid])
            assert pid in ind1
        # Already compared by pedigree_id, so exclude from the table diff.
        t1.individuals.clear()
        t2.individuals.clear()
    # go through one-by-one so we know which fails
    self.assertTablesEqual(t1.populations, t2.populations, "populations")
    self.assertTablesEqual(t1.individuals, t2.individuals, "individuals")
    self.assertTablesEqual(t1.nodes, t2.nodes, "nodes")
    self.assertTablesEqual(t1.edges, t2.edges, "edges")
    self.assertTablesEqual(t1.sites, t2.sites, "sites")
    self.assertTablesEqual(t1.mutations, t2.mutations, "mutations")
    self.assertTablesEqual(t1.migrations, t2.migrations, "migrations")
    self.assertTablesEqual(t1.provenances, t2.provenances, "provenances")
    self.assertMetadataEqual(t1, t2)
    assert t1.sequence_length == t2.sequence_length
    # Print mismatching reference sequences before asserting, for diagnosis.
    if t1.reference_sequence.data != t2.reference_sequence.data:
        print(t1.reference_sequence.data, " != ", t2.reference_sequence.data)
    assert t1.reference_sequence.data == t2.reference_sequence.data
def get_example_tables():
    """
    Return a tree sequence that has data in all fields.
    """
    pop_configs = [msprime.PopulationConfiguration(5) for _ in range(2)]
    migration_matrix = [[0, 1], [1, 0]]
    ts = msprime.simulate(
        population_configurations=pop_configs,
        migration_matrix=migration_matrix,
        mutation_rate=1,
        record_migrations=True,  # so the migration table is non-empty
        random_seed=1,
    )
    tables = ts.dump_tables()
    for j in range(ts.num_samples):
        tables.individuals.add_row(flags=j, location=np.arange(j), metadata=b"x" * j)
    # Rebuild each table so every row carries raw-bytes metadata whose
    # length varies with the row id.
    tables.nodes.clear()
    for node in ts.nodes():
        tables.nodes.add_row(
            flags=node.flags,
            time=node.time,
            population=node.population,
            # Link the first num_samples nodes to the individuals added above.
            individual=node.id if node.id < ts.num_samples else -1,
            metadata=b"y" * node.id,
        )
    tables.edges.clear()
    for edge in ts.edges():
        tables.edges.add_row(
            left=edge.left,
            right=edge.right,
            child=edge.child,
            parent=edge.parent,
            metadata=b"y" * edge.id,
        )
    tables.sites.clear()
    for site in ts.sites():
        tables.sites.add_row(
            position=site.position,
            ancestral_state="A" * site.id,
            metadata=b"q" * site.id,
        )
    tables.mutations.clear()
    for mutation in ts.mutations():
        mut_id = tables.mutations.add_row(
            site=mutation.site,
            node=mutation.node,
            time=0,
            parent=-1,
            derived_state="C" * mutation.id,
            metadata=b"x" * mutation.id,
        )
        # Add another mutation on the same branch.
        tables.mutations.add_row(
            site=mutation.site,
            node=mutation.node,
            time=0,
            parent=mut_id,  # child of the mutation just added
            derived_state="G" * mutation.id,
            metadata=b"y" * mutation.id,
        )
    tables.migrations.clear()
    for migration in ts.migrations():
        tables.migrations.add_row(
            left=migration.left,
            right=migration.right,
            node=migration.node,
            source=migration.source,
            dest=migration.dest,
            time=migration.time,
            metadata=b"y" * migration.id,
        )
    # Extra populations and provenances with varying-length payloads.
    for j in range(10):
        tables.populations.add_row(metadata=b"p" * j)
        tables.provenances.add_row(timestamp="x" * j, record="y" * j)
    tables.metadata_schema = tskit.MetadataSchema({
        "codec": "struct",
        "type": "object",
        "properties": {
            "top-level": {
                "type": "array",
                "items": {
                    "type": "integer",
                    "binaryFormat": "B"
                },
                "noLengthEncodingExhaustBuffer": True,
            }
        },
    })
    tables.metadata = {"top-level": [1, 2, 3, 4]}
    # Give each metadata-capable table a struct-codec string schema keyed
    # by the table's own name ("50p" is a Pascal string format).
    for table in [
        "individuals",
        "nodes",
        "edges",
        "migrations",
        "sites",
        "mutations",
        "populations",
    ]:
        t = getattr(tables, table)
        t.metadata_schema = tskit.MetadataSchema({
            "codec": "struct",
            "type": "object",
            "properties": {
                table: {
                    "type": "string",
                    "binaryFormat": "50p"
                }
            },
        })
    return tables
class TestEquals:
    """Tests for equality comparison of reference sequences."""

    def test_equal_self(self, ts_fixture):
        # A reference sequence compares equal to itself by every method.
        ts_fixture.reference_sequence.assert_equals(ts_fixture.reference_sequence)
        assert ts_fixture.reference_sequence == ts_fixture.reference_sequence
        assert not ts_fixture.reference_sequence != ts_fixture.reference_sequence
        assert ts_fixture.reference_sequence.equals(ts_fixture.reference_sequence)

    def test_equal_empty(self):
        # An empty (default) reference sequence also compares equal to itself.
        tables = tskit.TableCollection(1)
        tables.reference_sequence.assert_equals(tables.reference_sequence)
        assert tables.reference_sequence == tables.reference_sequence
        assert tables.reference_sequence.equals(tables.reference_sequence)

    @pytest.mark.parametrize("attr", ["url", "data"])
    def test_unequal_attr_missing(self, ts_fixture, attr):
        # Deleting an attribute from the dict round-trip must break equality,
        # and the assertion message must name the differing attribute.
        t1 = ts_fixture.tables
        d = t1.asdict()
        del d["reference_sequence"][attr]
        t2 = tskit.TableCollection.fromdict(d)
        with pytest.raises(AssertionError, match=attr):
            t1.reference_sequence.assert_equals(t2.reference_sequence)
        assert t1.reference_sequence != t2.reference_sequence
        assert not t1.reference_sequence.equals(t2.reference_sequence)
        # Comparison must be symmetric.
        with pytest.raises(AssertionError, match=attr):
            t2.reference_sequence.assert_equals(t1.reference_sequence)
        assert t2.reference_sequence != t1.reference_sequence
        assert not t2.reference_sequence.equals(t1.reference_sequence)

    @pytest.mark.parametrize(
        ("attr", "val"),
        [
            ("url", "foo"),
            ("data", "bar"),
            ("metadata", {"json": "runs the world"}),
            ("metadata_schema", tskit.MetadataSchema(None)),
        ],
    )
    def test_different_not_equal(self, ts_fixture, attr, val):
        # Changing any single attribute must break equality, symmetrically.
        t1 = ts_fixture.dump_tables()
        t2 = t1.copy()
        setattr(t1.reference_sequence, attr, val)
        with pytest.raises(AssertionError):
            t1.reference_sequence.assert_equals(t2.reference_sequence)
        assert t1.reference_sequence != t2.reference_sequence
        assert not t1.reference_sequence.equals(t2.reference_sequence)
        with pytest.raises(AssertionError):
            t2.reference_sequence.assert_equals(t1.reference_sequence)
        assert t2.reference_sequence != t1.reference_sequence
        assert not t2.reference_sequence.equals(t1.reference_sequence)

    @pytest.mark.parametrize(
        ("attr", "val"),
        [
            ("metadata", {"json": "runs the world"}),
            ("metadata_schema", tskit.MetadataSchema(None)),
        ],
    )
    def test_different_but_ignore(self, ts_fixture, attr, val):
        # Metadata differences break equality by default, but are forgiven
        # when ignore_metadata=True is passed.
        t1 = ts_fixture.dump_tables()
        t2 = t1.copy()
        setattr(t1.reference_sequence, attr, val)
        with pytest.raises(AssertionError):
            t1.reference_sequence.assert_equals(t2.reference_sequence)
        assert t1.reference_sequence != t2.reference_sequence
        assert not t1.reference_sequence.equals(t2.reference_sequence)
        with pytest.raises(AssertionError):
            t2.reference_sequence.assert_equals(t1.reference_sequence)
        assert t2.reference_sequence != t1.reference_sequence
        assert not t2.reference_sequence.equals(t1.reference_sequence)
        t2.reference_sequence.assert_equals(t1.reference_sequence, ignore_metadata=True)
        assert t2.reference_sequence.equals(t1.reference_sequence, ignore_metadata=True)
def simulate_stdpopsim(
    species,
    model,
    contig,
    num_samples,
    mutation_file=None,
    seed=123,
    skip_existing=False,
    num_procs=1,
):
    """
    Simulate a tree sequence with stdpopsim, optionally inserting mutations
    at the variable sites of a tsinfer sample-data file, trim it to a 10Mb
    interval, record KC-distance statistics in the top-level metadata, and
    dump it to ``<tree_fn>.trees``.

    :param species: stdpopsim species identifier.
    :param model: stdpopsim demographic model identifier.
    :param contig: Contig (chromosome) identifier.
    :param num_samples: Total sample count; must be a multiple of the
        model's number of sampling populations.
    :param mutation_file: Optional tsinfer sample-data file whose site
        positions are used to place mutations (simulation then runs with
        mutation rate 0).
    :param seed: Base random seed, also used to derive replicate seeds.
    :param skip_existing: If True, return immediately when the output
        .trees file already exists.
    :param num_procs: Worker processes for the KC-distance replicates.
    :return: Tuple of (base_fn, tree_fn) filename prefixes.
    """
    base_fn = f"{model}_{contig}_n{num_samples}"
    tree_fn = f"{base_fn}_seed{seed}"
    logger.info(
        f"Using {species}:{contig} from stdpopsim using the {model} model")
    if skip_existing and os.path.exists(tree_fn + ".trees"):
        logger.info(
            f"Simulation file {tree_fn}.trees already exists, returning that.")
        return base_fn, tree_fn
    sample_data = None
    species = stdpopsim.get_species(species)
    model = species.get_demographic_model(model)
    num_pops = model.num_sampling_populations
    if num_samples < num_pops or num_samples % num_pops != 0:
        raise ValueError(
            f"num_samples must be an integer multiple of {num_pops} "
            f"(or 2 x {num_pops} if diploid sequencing error is injected)")
    pop_n = num_samples // num_pops
    logger.info(
        f"Simulating {num_pops}x{pop_n} samples, seed {seed}, file prefix '{tree_fn}'."
    )
    contig = species.get_contig(contig)
    l = contig.recombination_map.get_sequence_length()
    if mutation_file is not None:
        logger.debug(f"Loading {mutation_file}")
        sample_data = tsinfer.load(mutation_file)
        if sample_data.sequence_length != l:
            raise ValueError(
                f"Mismatching sequence_length between simulation and {mutation_file}"
            )
        # Reduce mutation rate to 0, as we will insert mutations later
        contig = stdpopsim.Contig(
            mutation_rate=0,
            recombination_map=contig.recombination_map,
            genetic_map=contig.genetic_map,
        )
    r_map = contig.recombination_map
    assert len(r_map.get_rates()) == 2  # Ensure a single rate over chr
    samples = model.get_samples(*([pop_n] * num_pops))
    engine = stdpopsim.get_engine('msprime')
    ts = engine.simulate(model, contig, samples, seed=seed)
    tables = ts.dump_tables()
    if sample_data is not None:
        pos = sample_data.sites_position[:]
        logger.info(
            f"Inserting {len(pos)} mutations at variable sites from {mutation_file}"
        )
        for tree in ts.trees():
            # Sites from the sample-data file falling in this tree's interval.
            positions = pos[np.logical_and(pos >= tree.interval[0],
                                           pos < tree.interval[1])]
            if len(positions) == 0:
                continue
            # Pair each position with a uniform draw over total branch length,
            # sorted so positions are consumed as we accumulate branch length.
            muts = list(
                zip(
                    np.random.uniform(0, tree.total_branch_length,
                                      size=len(positions)),
                    positions))
            muts.sort()
            tot = 0
            # place a mutation on a random branch, proportional to branch length
            try:
                for n in tree.nodes():
                    tot += tree.branch_length(n)
                    while muts[0][0] < tot:
                        _, position = muts.pop(0)
                        s = tables.sites.add_row(position=position,
                                                 ancestral_state="0")
                        tables.mutations.add_row(node=n, site=s,
                                                 derived_state="1")
            except IndexError:
                # No more mutations - go to next tree
                continue
        tables.sort()
        logger.debug(
            f"Inserted mutations at density {ts.num_mutations/ts.sequence_length}"
        )
    interval = [int(l * 2 / 20),
                int(l * 2 / 20) + 1e7]  # 10Mb near the start, not centromeric
    tables.keep_intervals([interval])
    tables.trim()
    logger.debug(
        f"Cut down tree seq to {interval} ({tables.sites.num_rows} sites) for speed"
    )
    # Add info to the top-level metadata
    user_data = {}
    logger.info(
        "Calculating the kc distance of the simulation against a flat tree")
    star_tree = tskit.Tree.generate_star(ts.num_samples,
                                         span=tables.sequence_length,
                                         record_provenance=False)
    user_data['kc_max'] = tables.tree_sequence().kc_distance(
        star_tree.tree_sequence)
    kc_array = []
    max_reps = 100
    ts = tables.tree_sequence()
    logger.info(
        f"Calculating KC distance of the sim against at most {max_reps} * {ts.num_trees}"
        f" random trees using {num_procs} parallel threads. This could take a while."
    )
    seeds = range(seed, seed + max_reps)
    with multiprocessing.Pool(num_procs) as pool:
        for i, kc in enumerate(
                pool.imap_unordered(rnd_kc, zip(itertools.repeat(ts), seeds))):
            kc_array.append(kc)
            if i > 10:
                se_mean = np.std(kc_array, ddof=1) / np.sqrt(i)
                # break if SEM < 1/100th of mean KC. This can take along time
                if se_mean / np.average(kc_array) < 0.01:
                    logger.info(
                        f"Stopped after {i} replicates as kc_max_split deemed accurate."
                    )
                    break
    user_data['kc_max_split'] = np.average(kc_array)
    # Ensure the top-level metadata uses the JSON codec before merging in
    # user_data; refuse to clobber existing non-JSON metadata.
    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError("Metadata already exists, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": user_data, **tables.metadata}
    tables.tree_sequence().dump(tree_fn + ".trees")
    return base_fn, tree_fn
num_replicates=1, ) status("Converting tables...") # record individual id metadata tables = next(sim).dump_tables() individual_metadata_schema = tskit.MetadataSchema({ "codec": "json", "type": "object", "properties": { # Name of the individual in the pedigree file "individual_name": { "type": "integer" }, "is_sample": { "type": "boolean" }, }, "required": ["individual_name", "is_sample"], }) meta_individuals = tskit.IndividualTable() meta_individuals.metadata_schema = individual_metadata_schema for i, ind in enumerate(tables.individuals): ind_name = int(ped.individual[i]) if i < ped.num_individuals else -1 is_sample = bool(ped.is_sample[i]) if i < ped.num_individuals else False meta_individuals.add_row(metadata={ "individual_name": ind_name,
def parse_fam(fam_file):
    """
    Parse PLINK .fam file and convert to tskit IndividualTable.

    Assumes fam file contains five columns: FID, IID, PAT, MAT, SEX.

    :param fam_file: PLINK .fam file object
    :return: A tskit.IndividualTable with one row per individual, carrying
        JSON metadata (plink_fid, plink_iid, sex) and parent references
        resolved within each family (missing parents become -1).
    :raises ValueError: If a "FID IID" pair occurs twice, or SEX is not
        one of 0 (unknown), 1 (male), 2 (female).
    """
    individuals = np.loadtxt(
        fname=fam_file,
        dtype=str,
        ndmin=2,  # read file as 2-D table
        usecols=(0, 1, 2, 3, 4),  # only keep FID, IID, PAT, MAT, SEX columns
    )  # requires same number of columns in each row, i.e. not ragged

    id_map = {}  # dict for translating PLINK ID to tskit IndividualTable ID
    for tskit_id, (plink_fid, plink_iid, _pat, _mat, _sex) in enumerate(individuals):
        # include space between strings to ensure uniqueness
        plink_id = f"{plink_fid} {plink_iid}"
        if plink_id in id_map:
            # Fix: was a plain string literal, so {plink_id} never interpolated.
            raise ValueError(f"Duplicate PLINK ID: {plink_id}")
        id_map[plink_id] = tskit_id
    id_map["0"] = -1  # -1 is used in tskit to denote "missing"

    tc = tskit.TableCollection(1)
    tb = tc.individuals
    tb.metadata_schema = tskit.MetadataSchema({
        "codec": "json",
        "type": "object",
        "properties": {
            "plink_fid": {
                "type": "string"
            },
            "plink_iid": {
                "type": "string"
            },
            "sex": {
                "type": "integer"
            },
        },
        "required": ["plink_fid", "plink_iid", "sex"],
        "additionalProperties": True,
    })
    for plink_fid, plink_iid, pat, mat, sex in individuals:
        sex = int(sex)
        if sex not in range(3):
            raise ValueError(
                "Sex must be one of the following: 0 (unknown), 1 (male), 2 (female)"
            )
        metadata_dict = {
            "plink_fid": plink_fid,
            "plink_iid": plink_iid,
            "sex": sex
        }
        # Parent IDs are namespaced by family; "0" stays as the missing marker.
        pat_id = f"{plink_fid} {pat}" if pat != "0" else pat
        mat_id = f"{plink_fid} {mat}" if mat != "0" else mat
        tb.add_row(
            parents=[
                id_map[pat_id],
                id_map[mat_id],
            ],
            metadata=metadata_dict,
        )
    tc.sort()
    return tb
0.0, "bounds_z1": 100.0, "migration_records": [{ "source_subpop": 1, "migration_rate": 0.9 }, { "source_subpop": 2, "migration_rate": 0.1 }] }] }, } slim_metadata_schemas = { k: tskit.MetadataSchema(_raw_slim_metadata_schemas[k]) for k in _raw_slim_metadata_schemas } default_slim_metadata = { "tree_sequence": { "SLiM": { "model_type": "nonWF", "generation": 1, "file_version": slim_file_version, "spatial_dimensionality": "", "spatial_periodicity": "", "separate_sexes": False, "nucleotide_based": False, "stage": "late" }
def from_newick(string, min_edge_length=0):
    """
    Returns a tree sequence representation of the specified newick string.

    The tree sequence will contain a single tree, as specified by the newick.
    All leaf nodes will be marked as samples (``tskit.NODE_IS_SAMPLE``). Newick
    names and comments will be written to the node metadata.

    :param string string: Newick string
    :param float min_edge_length: Replace any edge length shorter than this
        value by this value. Unlike newick, tskit doesn't support zero or
        negative edge lengths, so setting this argument to a small value is
        necessary when importing trees with zero or negative lengths.
    """
    trees = newick.loads(string)
    if len(trees) > 1:
        raise ValueError("Only one tree can be imported from a newick string")
    if len(trees) == 0:
        raise ValueError("Newick string was empty")
    tree = trees[0]
    tables = tskit.TableCollection(1)
    nodes = tables.nodes
    nodes.metadata_schema = tskit.MetadataSchema(
        {
            "codec": "json",
            "type": "object",
            "properties": {
                "name": {
                    "type": ["string"],
                    "description": "Name from newick file",
                },
                "comment": {
                    "type": ["string"],
                    "description": "Comment from newick file",
                },
            },
        }
    )
    # Maps newick node objects to their tskit node ids.
    id_map = {}

    def get_or_add_node(newick_node, time):
        # Create a tskit node for this newick node on first sight; leaves
        # (no descendants) are flagged as samples.
        if newick_node not in id_map:
            flags = tskit.NODE_IS_SAMPLE if len(newick_node.descendants) == 0 else 0
            metadata = {}
            if newick_node.name:
                metadata["name"] = newick_node.name
            if newick_node.comment:
                metadata["comment"] = newick_node.comment
            id_map[newick_node] = tables.nodes.add_row(
                flags=flags, time=time, metadata=metadata
            )
        return id_map[newick_node]

    root = next(tree.walk())
    # The root is assigned time 0; children get progressively more negative
    # times, fixed up after the traversal below.
    get_or_add_node(root, 0)
    for newick_node in tree.walk():
        node_id = id_map[newick_node]
        for child in newick_node.descendants:
            length = max(child.length, min_edge_length)
            if length <= 0:
                raise ValueError(
                    "tskit tree sequences cannot contain edges with lengths"
                    " <= 0. Set min_edge_length to force lengths to a"
                    " minimum size"
                )
            child_node_id = get_or_add_node(child, nodes[node_id].time - length)
            tables.edges.add_row(0, 1, node_id, child_node_id)
    # Rewrite node times to fit the tskit convention of zero at the youngest leaf
    nodes = tables.nodes.copy()
    youngest = min(tables.nodes.time)
    tables.nodes.clear()
    for node in nodes:
        tables.nodes.append(node.replace(time=node.time - youngest + root.length))
    tables.sort()
    return tables.tree_sequence()
next_id = 0 for ind in both.individuals(): md = ind.metadata md['pedigree_id'] = next_id j = tables.individuals.add_row(flags=ind.flags, location=ind.location, parents=ind.parents, metadata=md) ind_map[j] = md['pedigree_id'] next_id += 1 tables.nodes.clear() # hack because of https://github.com/tskit-dev/tskit/issues/1256 # (which is fixed in github main) schema = tables.nodes.metadata_schema tables.nodes.metadata_schema = tskit.MetadataSchema(None) for n in both.nodes(): md = n.metadata if md is not None: assert n.individual != tskit.NULL ind = both.individual(n.individual) offset = md['slim_id'] - 2 * ind.metadata['pedigree_id'] md['slim_id'] = 2 * ind_map[ind.id] + offset tables.nodes.add_row(time=n.time, population=n.population, individual=n.individual, flags=n.flags, metadata=schema.validate_and_encode_row(md)) tables.nodes.metadata_schema = schema both = tables.tree_sequence()