def test_py_engine_multi(self):
    """Round-trip several stores through a single BytesIO using the Python engine."""
    source = {"a": np.arange(10), "b": np.zeros(100)}
    repeats = 10
    buff = io.BytesIO()
    for _ in range(repeats):
        kas.dump(source, buff, engine=kas.PY_ENGINE)
    # Rewind and read back each store in sequence.
    buff.seek(0)
    for _ in range(repeats):
        loaded = kas.load(buff, read_all=True, engine=kas.PY_ENGINE)
        self.verify_dicts_equal(source, loaded)
def test_format_name_error(self):
    """Corrupting the format/name field must raise FileFormatError on load."""
    ts = msprime.simulate(10)
    bad_names = ["tskit.tree", "tskit.treesAndOther", "", "x" * 100]
    for name in bad_names:
        ts.dump(self.temp_file)
        with kastore.load(self.temp_file) as store:
            contents = dict(store)
        contents["format/name"] = np.array(bytearray(name.encode()), dtype=np.int8)
        kastore.dump(contents, self.temp_file)
        with pytest.raises(exceptions.FileFormatError):
            tskit.load(self.temp_file)
def test_new_version_load_error(self):
    """Files written with a newer major version must be rejected."""
    ts = msprime.simulate(10, random_seed=1)
    future_versions = [(CURRENT_FILE_MAJOR + j, 0) for j in range(1, 5)]
    for version in future_versions:
        ts.dump(self.temp_file)
        with kastore.load(self.temp_file, use_mmap=False) as store:
            contents = dict(store)
        contents["format/version"] = np.array(version, dtype=np.uint32)
        kastore.dump(contents, self.temp_file)
        with self.assertRaises(msprime.VersionTooNewError):
            msprime.load(self.temp_file)
def test_old_version_load_error(self):
    """Files written with an outdated major version must be rejected."""
    ts = msprime.simulate(10, random_seed=1)
    stale_versions = [(0, 1), (0, 8), (2, 0), (CURRENT_FILE_MAJOR - 1, 0)]
    for version in stale_versions:
        ts.dump(self.temp_file)
        with kastore.load(self.temp_file) as store:
            contents = dict(store)
        contents["format/version"] = np.array(version, dtype=np.uint32)
        kastore.dump(contents, self.temp_file)
        with pytest.raises(tskit.VersionTooOldError):
            tskit.load(self.temp_file)
def test_load_and_dump_fd_single_rw(self):
    """Dump a store to a raw file descriptor and load it back both ways."""
    expected = {"a": np.arange(10)}
    with open(self.temp_file, "r+b") as f:
        fd = f.fileno()
        kas.dump(expected, fd, engine=self.engine)
        for read_all in (True, False):
            # Rewind the descriptor before each load attempt.
            os.lseek(fd, 0, os.SEEK_SET)
            store = kas.load(fd, read_all=read_all, engine=self.engine)
            loaded = dict(store.items())
            self.verify_dicts_equal(expected, loaded)
def verify_fields(self, ts):
    """Removing any single key from the store must make msprime.load fail."""
    ts.dump(self.temp_file)
    with kastore.load(self.temp_file, use_mmap=False) as store:
        all_data = dict(store)
    for key in all_data.keys():
        trimmed = dict(all_data)
        del trimmed[key]
        kastore.dump(trimmed, self.temp_file)
        with self.assertRaises(exceptions.FileFormatError):
            msprime.load(self.temp_file)
def verify(self, data):
    """Check that info() metadata agrees with each loaded array."""
    kas.dump(data, self.temp_file)
    for read_all in (True, False):
        loaded = kas.load(self.temp_file, read_all=read_all)
        for key, array in loaded.items():
            info = loaded.info(key)
            # The info object should have a non-trivial string form.
            self.assertGreater(len(str(info)), 0)
            self.assertEqual(array.nbytes, info.size)
            self.assertEqual(array.shape, info.shape)
            self.assertEqual(array.dtype, np.dtype(info.dtype))
def handle(self):
    """Echo each kastore payload back over the socket until EOF."""
    while True:
        try:
            payload = kas.load(
                self.request.fileno(), engine=self.engine, read_all=True
            )
        except EOFError:
            break
        kas.dump(dict(payload), self.request.fileno(), engine=self.engine)
    # We only read one list, so shutdown the server straight away
    self.server.shutdown()
def test_missing_metadata_schema(self, ts_fixture, tmp_path):
    """Dropping the schema key yields an empty metadata schema on reload."""
    original = ts_fixture
    trees_path = tmp_path / "tmp.trees"
    original.dump(trees_path)
    with kastore.load(trees_path) as store:
        contents = dict(store)
    del contents["reference_sequence/metadata_schema"]
    kastore.dump(contents, trees_path)
    reloaded = tskit.load(trees_path)
    assert reloaded.has_reference_sequence
    assert repr(reloaded.reference_sequence.metadata_schema) == ""
def test_missing_attr(self, ts_fixture, tmp_path, attr):
    """A deleted reference_sequence attribute reads back as the empty string."""
    original = ts_fixture
    trees_path = tmp_path / "tmp.trees"
    original.dump(trees_path)
    with kastore.load(trees_path) as store:
        contents = dict(store)
    del contents[f"reference_sequence/{attr}"]
    kastore.dump(contents, trees_path)
    reloaded = tskit.load(trees_path)
    assert reloaded.has_reference_sequence
    assert getattr(reloaded.reference_sequence, attr) == ""
def test_load_fileobj_single(self):
    """Loading from an open file object consumes exactly the whole store."""
    expected = {"a": np.arange(10)}
    kas.dump(expected, self.temp_file, engine=self.engine)
    total_size = os.stat(self.temp_file).st_size
    for read_all in (True, False):
        with open(self.temp_file, "rb") as f:
            store = kas.load(f, read_all=read_all, engine=self.engine)
            loaded = dict(store.items())
            # Capture the position before leaving the with block.
            position = f.tell()
            self.verify_dicts_equal(expected, loaded)
            self.assertEqual(position, total_size)
def test_empty_mutation_time(self):
    """A file with mutations/time removed still loads with equal tables."""
    ts_source = migration_example()
    ts_source.dump(self.temp_file)
    ts_reloaded = tskit.load(self.temp_file)
    assert ts_source.tables == ts_reloaded.tables
    assert len(ts_source.tables.mutations.metadata) == 0
    with kastore.load(self.temp_file) as store:
        contents = dict(store)
    del contents["mutations/time"]
    kastore.dump(contents, self.temp_file)
    ts_stripped = tskit.load(self.temp_file)
    assert ts_source.tables == ts_stripped.tables
def test_load_from_pathlib_Path(self):
    """kas.load accepts a pathlib.Path and leaves the file unmodified."""
    expected = {"a": np.arange(10)}
    kas.dump(expected, str(self.temp_file), engine=self.engine)
    size_before = self.temp_file.stat().st_size
    for read_all in (True, False):
        store = kas.load(self.temp_file, read_all=read_all, engine=self.engine)
        loaded = dict(store.items())
        size_after = self.temp_file.stat().st_size
        self.verify_dicts_equal(expected, loaded)
        self.assertEqual(size_before, size_after)
def test_truncated_file_data(self):
    """Chopping the final byte off the data section must be detected."""
    for num_items in range(2, 5):
        self.write_file(num_items)
        with open(self.temp_file, "rb") as f:
            contents = bytearray(f.read())
        # Rewrite the file minus its last byte.
        with open(self.temp_file, "wb") as f:
            f.write(contents[:-1])
        with self.assertRaises(kas.FileFormatError):
            # Must call dict to ensure all the keys are loaded.
            dict(
                kas.load(self.temp_file, engine=self.engine, read_all=self.read_all)
            )
def verify_missing_fields(self, ts):
    """Removing any required key must make tskit.load raise an error."""
    ts.dump(self.temp_file)
    with kastore.load(self.temp_file) as store:
        all_data = dict(store)
    for key in all_data.keys():
        # We skip these keys as they are optional
        if "metadata_schema" in key or key == "metadata":
            continue
        trimmed = dict(all_data)
        del trimmed[key]
        kastore.dump(trimmed, self.temp_file)
        with self.assertRaises(
            (exceptions.FileFormatError, exceptions.LibraryError)
        ):
            tskit.load(self.temp_file)
def verify_keys(self, ts):
    """Check that a dumped tree sequence contains exactly the expected keys."""
    expected_keys = [
        "edges/child",
        "edges/left",
        "edges/parent",
        "edges/right",
        "format/name",
        "format/version",
        "indexes/edge_insertion_order",
        "indexes/edge_removal_order",
        "individuals/flags",
        "individuals/location",
        "individuals/location_offset",
        "individuals/metadata",
        "individuals/metadata_offset",
        "migrations/dest",
        "migrations/left",
        "migrations/node",
        "migrations/right",
        "migrations/source",
        "migrations/time",
        "mutations/derived_state",
        "mutations/derived_state_offset",
        "mutations/metadata",
        "mutations/metadata_offset",
        "mutations/node",
        "mutations/parent",
        "mutations/site",
        "nodes/flags",
        "nodes/individual",
        "nodes/metadata",
        "nodes/metadata_offset",
        "nodes/population",
        "nodes/time",
        "populations/metadata",
        "populations/metadata_offset",
        "provenances/record",
        "provenances/record_offset",
        "provenances/timestamp",
        "provenances/timestamp_offset",
        "sequence_length",
        "sites/ancestral_state",
        "sites/ancestral_state_offset",
        "sites/metadata",
        "sites/metadata_offset",
        "sites/position",
        "uuid",
    ]
    ts.dump(self.temp_file)
    store = kastore.load(self.temp_file)
    self.assertEqual(sorted(store.keys()), expected_keys)
def verify(self, data):
    """Round-trip the data through both engines in both read modes."""
    for engine in (kas.C_ENGINE, kas.PY_ENGINE):
        for read_all in (True, False):
            kas.dump(data, self.temp_file, engine=engine)
            loaded = kas.load(self.temp_file, read_all=read_all, engine=engine)
            self.assertEqual(sorted(loaded.keys()), sorted(data.keys()))
            for key, expected_array in data.items():
                # Numpy's testing assert_equal will deal correctly with NaNs.
                np.testing.assert_equal(expected_array, loaded[key])
            # Make sure the file is closed before opening it again.
            del loaded
def test_empty_migration_metadata(self):
    """Files lacking migration metadata columns still load equivalently."""
    ts_source = migration_example()
    ts_source.dump(self.temp_file)
    ts_reloaded = tskit.load(self.temp_file)
    self.assertEqual(ts_source.tables, ts_reloaded.tables)
    self.assertEqual(len(ts_source.tables.migrations.metadata), 0)
    with kastore.load(self.temp_file) as store:
        contents = dict(store)
    del contents["migrations/metadata"]
    del contents["migrations/metadata_offset"]
    kastore.dump(contents, self.temp_file)
    ts_stripped = tskit.load(self.temp_file)
    self.assertEqual(ts_source.tables, ts_stripped.tables)
def test_empty_edge_metadata(self):
    """Files lacking edge metadata columns still load equivalently."""
    ts_source = migration_example()
    ts_source.dump(self.temp_file)
    ts_reloaded = tskit.load(self.temp_file)
    assert ts_source.tables == ts_reloaded.tables
    assert len(ts_source.tables.edges.metadata) == 0
    with kastore.load(self.temp_file) as store:
        contents = dict(store)
    del contents["edges/metadata"]
    del contents["edges/metadata_offset"]
    kastore.dump(contents, self.temp_file)
    ts_stripped = tskit.load(self.temp_file)
    assert ts_source.tables == ts_stripped.tables
def verify_fields(self, ts):
    """
    Check that removing any non-optional column from the store makes
    tskit.load fail with a FileFormatError or LibraryError.
    """
    ts.dump(self.temp_file)
    with kastore.load(self.temp_file) as store:
        all_data = dict(store)
    for key in all_data.keys():
        # We skip this key as it is optional
        if "metadata_schema" not in key:
            # Removed a leftover debug print(key) that polluted test output.
            data = dict(all_data)
            del data[key]
            kastore.dump(data, self.temp_file)
            with self.assertRaises(
                (exceptions.FileFormatError, exceptions.LibraryError)
            ):
                tskit.load(self.temp_file)
def test_empty_mutation_time(self):
    """Removing mutations/time loads back with an all-unknown time column."""
    ts_source = migration_example()
    ts_source.dump(self.temp_file)
    ts_reloaded = tskit.load(self.temp_file)
    assert ts_source.tables == ts_reloaded.tables
    assert len(ts_source.tables.mutations.metadata) == 0
    with kastore.load(self.temp_file) as store:
        contents = dict(store)
    del contents["mutations/time"]
    kastore.dump(contents, self.temp_file)
    ts_stripped = tskit.load(self.temp_file)
    # Null out the time column
    expected_tables = ts_source.dump_tables()
    expected_tables.mutations.time = np.full_like(
        expected_tables.mutations.time, tskit.UNKNOWN_TIME
    )
    expected_tables.assert_equals(ts_stripped.tables)
def load(cls, path):
    '''
    Load a :class:`SlimTreeSequence` from a .trees file on disk.

    :param string path: The path to a .trees file.
    :rtype SlimTreeSequence:
    '''
    ts = tskit.load(path)
    # extract the reference sequence from the kastore
    # (renamed the local from `kas` to avoid shadowing the common
    # kastore module alias)
    store = kastore.load(path)
    if 'reference_sequence/data' in store:
        int_rs = store['reference_sequence/data']
        # ndarray.tostring() is deprecated (removed in NumPy 2.0);
        # tobytes() is the exact drop-in replacement.
        reference_sequence = int_rs.tobytes().decode('ascii')
    else:
        reference_sequence = None
    return cls(ts, reference_sequence)
def load_from_stream(q_err, q_out, file_in, engine, read_all):
    """
    Read successive kastore stores from `file_in` and put each store's
    data onto `q_out`. Any uncaught exception (with its traceback text)
    is placed onto the `q_err` queue.
    """
    try:
        with open(file_in, "rb") as stream:
            while True:
                try:
                    store = kas.load(stream, read_all=read_all, engine=engine)
                except EOFError:
                    # No more stores in the stream.
                    break
                q_out.put(dict(store.items()))
    except Exception as exc:
        q_err.put((exc, traceback.format_exc()))
def dump(self, path, **kwargs):
    '''
    Dumps the tree sequence to the path specified. This is mostly just a
    wrapper for tskit.TreeSequence.dump(), but also writes out the
    reference sequence.

    :param str path: The file path to write the TreeSequence to.
    :param kwargs: Additional keyword args to pass to tskit.TreeSequence.dump
    '''
    super(SlimTreeSequence, self).dump(path, **kwargs)
    if self.reference_sequence is None:
        return
    # to convert to a kastore store we need to reload from a file,
    # and for it to be mutable we need to make it a dict
    store = dict(kastore.load(path))
    encoded = self.reference_sequence.encode()
    store['reference_sequence/data'] = np.frombuffer(encoded, dtype=np.int8)
    kastore.dump(store, path)
def test_empty_individual_parents(self):
    """Dropping parent columns loads back as empty parent lists per row."""
    ts_source = migration_example()
    ts_source.dump(self.temp_file)
    ts_reloaded = tskit.load(self.temp_file)
    assert ts_source.tables == ts_reloaded.tables
    assert len(ts_source.tables.individuals.parents) > 0
    with kastore.load(self.temp_file) as store:
        contents = dict(store)
    del contents["individuals/parents"]
    del contents["individuals/parents_offset"]
    kastore.dump(contents, self.temp_file)
    ts_stripped = tskit.load(self.temp_file)
    expected_tables = ts_source.dump_tables()
    num_rows = expected_tables.individuals.num_rows
    expected_tables.individuals.packset_parents([[]] * num_rows)
    expected_tables.assert_equals(ts_stripped.tables)
def test_load_and_dump_fd_multi_rw(self):
    """Write many stores to one file descriptor, then read them back in order."""
    datasets = [
        {
            "i" + str(i): i + np.arange(10**5, dtype=int),
            "f" + str(i): i + np.arange(10**5, dtype=float),
        }
        for i in range(20)
    ]
    with open(self.temp_file, "r+b") as f:
        fd = f.fileno()
        for data in datasets:
            kas.dump(data, fd, engine=self.engine)
        for read_all in (True, False):
            # Rewind and replay the whole sequence of stores.
            os.lseek(fd, 0, os.SEEK_SET)
            for expected in datasets:
                store = kas.load(fd, read_all=read_all, engine=self.engine)
                loaded = dict(store.items())
                self.verify_dicts_equal(expected, loaded)
def load_parameters_from_file(filename):
    """
    Load parameters from the provenance record of `filename`.

    Using tskit can be slow when loading provenance records for a lot of
    files. We deliberately bypass the tskit API here, so that we don't
    unnecessarily load and parse all of the treesequence tables.

    Returns the "parameters" entry of the first provenance record whose
    software name matches this package, or None if no record matches.
    """
    ka = kastore.load(filename)
    record_offset = ka["provenances/record_offset"]
    # Hoist the column lookup out of the loop.
    records = ka["provenances/record"]
    start = record_offset[0]
    for end in record_offset[1:]:
        # ndarray.tostring() is deprecated (removed in NumPy 2.0);
        # tobytes() is the drop-in replacement. json.loads accepts bytes.
        record = records[start:end].tobytes()
        d = json.loads(record)
        if d["software"]["name"] == genomatnn.__name__:
            return d.get("parameters")
        start = end
    return None
def test_no_reference_sequence(self, ts_fixture, tmp_path):
    """Deleting every reference_sequence key yields a ts without one."""
    original = ts_fixture
    trees_path = tmp_path / "tmp.trees"
    original.dump(trees_path)
    with kastore.load(trees_path) as store:
        contents = dict(store)
    del contents["reference_sequence/metadata_schema"]
    del contents["reference_sequence/metadata"]
    del contents["reference_sequence/data"]
    del contents["reference_sequence/url"]
    # Nothing reference_sequence-related should remain.
    assert not any(k.startswith("reference_sequence") for k in contents.keys())
    kastore.dump(contents, trees_path)
    reloaded = tskit.load(trees_path)
    assert not reloaded.has_reference_sequence()
    tables = reloaded.dump_tables()
    tables.reference_sequence = original.reference_sequence
    tables.assert_equals(original.tables)
def verify_missing_fields(self, ts):
    """Each required key, when removed, must make tskit.load raise."""
    optional_keys = {"metadata", "time_units", "mutations/time"}
    ts.dump(self.temp_file)
    with kastore.load(self.temp_file) as store:
        all_data = dict(store)
    for key in all_data.keys():
        # We skip these keys as they are optional
        if "metadata_schema" in key or key in optional_keys:
            continue
        trimmed = dict(all_data)
        del trimmed[key]
        kastore.dump(trimmed, self.temp_file)
        with pytest.raises(
            (exceptions.FileFormatError, exceptions.LibraryError)
        ):
            tskit.load(self.temp_file)
def test_load_and_dump_fileobj_multi(self):
    """Stores dumped back-to-back into one file load at the right offsets."""
    datasets = [
        {
            "i" + str(i): i + np.arange(10**5, dtype=int),
            "f" + str(i): i + np.arange(10**5, dtype=float),
        }
        for i in range(10)
    ]
    offsets = []
    with open(self.temp_file, "wb") as f:
        for data in datasets:
            kas.dump(data, f, engine=self.engine)
            offsets.append(f.tell())
    for read_all in (True, False):
        with open(self.temp_file, "rb") as f:
            for expected, expected_offset in zip(datasets, offsets):
                store = kas.load(f, read_all=read_all, engine=self.engine)
                loaded = dict(store.items())
                position = f.tell()
                self.verify_dicts_equal(expected, loaded)
                self.assertEqual(expected_offset, position)