Example #1
0
 def test_py_engine_multi(self):
     """Round-trip several stores through one buffer with the pure-Python engine."""
     arrays = {"a": np.arange(10), "b": np.zeros(100)}
     reps = 10
     buf = io.BytesIO()
     # Write the same store back-to-back several times...
     for _ in range(reps):
         kas.dump(arrays, buf, engine=kas.PY_ENGINE)
     buf.seek(0)
     # ...then read each one back and compare against the source dict.
     for _ in range(reps):
         loaded = kas.load(buf, read_all=True, engine=kas.PY_ENGINE)
         self.verify_dicts_equal(arrays, loaded)
Example #2
0
 def test_format_name_error(self):
     """Corrupting the format/name field must make tskit.load fail."""
     ts = msprime.simulate(10)
     bad_names = ["tskit.tree", "tskit.treesAndOther", "", "x" * 100]
     for bad_name in bad_names:
         ts.dump(self.temp_file)
         with kastore.load(self.temp_file) as store:
             contents = dict(store)
         # Overwrite the magic name with a bogus byte array.
         contents["format/name"] = np.array(
             bytearray(bad_name.encode()), dtype=np.int8
         )
         kastore.dump(contents, self.temp_file)
         with pytest.raises(exceptions.FileFormatError):
             tskit.load(self.temp_file)
Example #3
0
 def test_new_version_load_error(self):
     """Files stamped with a future major version must refuse to load."""
     ts = msprime.simulate(10, random_seed=1)
     for offset in range(1, 5):
         future_version = (CURRENT_FILE_MAJOR + offset, 0)
         ts.dump(self.temp_file)
         with kastore.load(self.temp_file, use_mmap=False) as store:
             contents = dict(store)
         contents["format/version"] = np.array(future_version, dtype=np.uint32)
         kastore.dump(contents, self.temp_file)
         with self.assertRaises(msprime.VersionTooNewError):
             msprime.load(self.temp_file)
Example #4
0
 def test_old_version_load_error(self):
     """Files stamped with an outdated major version must refuse to load."""
     ts = msprime.simulate(10, random_seed=1)
     old_versions = [(0, 1), (0, 8), (2, 0), (CURRENT_FILE_MAJOR - 1, 0)]
     for version in old_versions:
         ts.dump(self.temp_file)
         with kastore.load(self.temp_file) as store:
             contents = dict(store)
         contents["format/version"] = np.array(version, dtype=np.uint32)
         kastore.dump(contents, self.temp_file)
         with pytest.raises(tskit.VersionTooOldError):
             tskit.load(self.temp_file)
Example #5
0
 def test_load_and_dump_fd_single_rw(self):
     """Dump then load a single store through a raw file descriptor."""
     original = {"a": np.arange(10)}
     with open(self.temp_file, "r+b") as f:
         fd = f.fileno()
         kas.dump(original, fd, engine=self.engine)
         for read_all in (True, False):
             # Rewind the descriptor before each load attempt.
             os.lseek(fd, 0, os.SEEK_SET)
             loaded = dict(
                 kas.load(fd, read_all=read_all, engine=self.engine).items()
             )
             self.verify_dicts_equal(original, loaded)
Example #6
0
 def verify_fields(self, ts):
     """Check that deleting any single key makes the file unloadable."""
     ts.dump(self.temp_file)
     with kastore.load(self.temp_file, use_mmap=False) as store:
         all_data = dict(store)
     for missing in all_data:
         # Rebuild the store with exactly one key removed.
         subset = {k: v for k, v in all_data.items() if k != missing}
         kastore.dump(subset, self.temp_file)
         with self.assertRaises(exceptions.FileFormatError):
             msprime.load(self.temp_file)
Example #7
0
 def verify(self, data):
     """Check that per-key info records match the loaded arrays."""
     kas.dump(data, self.temp_file)
     for read_all in (True, False):
         loaded = kas.load(self.temp_file, read_all=read_all)
         for key, array in loaded.items():
             info = loaded.info(key)
             # The string form should be non-trivial.
             self.assertGreater(len(str(info)), 0)
             self.assertEqual(info.size, array.nbytes)
             self.assertEqual(info.shape, array.shape)
             self.assertEqual(np.dtype(info.dtype), array.dtype)
Example #8
0
 def handle(self):
     """Echo kastore stores read from the connected socket back to the client.

     Repeatedly loads one complete store from the request's file descriptor
     and immediately dumps its contents back over the same descriptor, until
     the peer closes the stream (signalled by EOFError from kas.load).
     """
     while True:
         try:
             data = kas.load(self.request.fileno(),
                             engine=self.engine,
                             read_all=True)
         except EOFError:
             # Peer closed the connection: no more stores to read.
             break
         kas.dump(dict(data), self.request.fileno(), engine=self.engine)
     # We only read one list, so shutdown the server straight away
     self.server.shutdown()
Example #9
0
 def test_missing_metadata_schema(self, ts_fixture, tmp_path):
     """A missing reference-sequence metadata schema reloads as empty."""
     source_ts = ts_fixture
     trees_path = tmp_path / "tmp.trees"
     source_ts.dump(trees_path)
     with kastore.load(trees_path) as store:
         contents = dict(store)
     contents.pop("reference_sequence/metadata_schema")
     kastore.dump(contents, trees_path)
     reloaded = tskit.load(trees_path)
     assert reloaded.has_reference_sequence
     assert repr(reloaded.reference_sequence.metadata_schema) == ""
Example #10
0
 def test_missing_attr(self, ts_fixture, tmp_path, attr):
     """A missing reference-sequence attribute reloads as the empty string."""
     source_ts = ts_fixture
     trees_path = tmp_path / "tmp.trees"
     source_ts.dump(trees_path)
     with kastore.load(trees_path) as store:
         contents = dict(store)
     contents.pop(f"reference_sequence/{attr}")
     kastore.dump(contents, trees_path)
     reloaded = tskit.load(trees_path)
     assert reloaded.has_reference_sequence
     assert getattr(reloaded.reference_sequence, attr) == ""
Example #11
0
 def test_load_fileobj_single(self):
     """Loading from a file object consumes exactly the whole file."""
     original = {"a": np.arange(10)}
     kas.dump(original, self.temp_file, engine=self.engine)
     expected_size = os.stat(self.temp_file).st_size
     for read_all in (True, False):
         with open(self.temp_file, "rb") as f:
             loaded = dict(
                 kas.load(f, read_all=read_all, engine=self.engine).items()
             )
             position = f.tell()
         self.verify_dicts_equal(original, loaded)
         # The stream must be positioned exactly at end-of-file.
         self.assertEqual(position, expected_size)
Example #12
0
 def test_empty_mutation_time(self):
     """Dropping mutations/time entirely still round-trips the tables."""
     ts1 = migration_example()
     ts1.dump(self.temp_file)
     ts2 = tskit.load(self.temp_file)
     assert ts1.tables == ts2.tables
     assert len(ts1.tables.mutations.metadata) == 0
     with kastore.load(self.temp_file) as store:
         contents = dict(store)
     contents.pop("mutations/time")
     kastore.dump(contents, self.temp_file)
     ts3 = tskit.load(self.temp_file)
     assert ts1.tables == ts3.tables
Example #13
0
 def test_load_from_pathlib_Path(self):
     """kas.load accepts a pathlib.Path and leaves the file untouched."""
     original = {"a": np.arange(10)}
     kas.dump(original, str(self.temp_file), engine=self.engine)
     size_before = self.temp_file.stat().st_size
     for read_all in (True, False):
         loaded = dict(
             kas.load(
                 self.temp_file, read_all=read_all, engine=self.engine
             ).items()
         )
         size_after = self.temp_file.stat().st_size
         self.verify_dicts_equal(original, loaded)
         # Loading must not modify the file on disk.
         self.assertEqual(size_before, size_after)
Example #14
0
 def test_truncated_file_data(self):
     """A file with its final byte removed must raise FileFormatError."""
     for num_items in range(2, 5):
         self.write_file(num_items)
         with open(self.temp_file, "rb") as f:
             contents = f.read()
         # Rewrite the file with the last byte chopped off.
         with open(self.temp_file, "wb") as f:
             f.write(contents[:-1])
         with self.assertRaises(kas.FileFormatError):
             # Must call dict to ensure all the keys are loaded.
             dict(
                 kas.load(
                     self.temp_file,
                     engine=self.engine,
                     read_all=self.read_all,
                 )
             )
Example #15
0
 def verify_missing_fields(self, ts):
     """Check that deleting any required key makes the file unloadable."""
     ts.dump(self.temp_file)
     with kastore.load(self.temp_file) as store:
         all_data = dict(store)
     for missing in all_data:
         # Metadata and metadata-schema keys are optional, so skip them.
         if "metadata_schema" in missing or missing == "metadata":
             continue
         subset = {k: v for k, v in all_data.items() if k != missing}
         kastore.dump(subset, self.temp_file)
         with self.assertRaises(
             (exceptions.FileFormatError, exceptions.LibraryError)
         ):
             tskit.load(self.temp_file)
Example #16
0
 def verify_keys(self, ts):
     """Check that a dumped tree sequence contains exactly the expected keys.

     :param ts: the tree sequence to dump to ``self.temp_file`` and inspect.
     """
     keys = [
         "edges/child",
         "edges/left",
         "edges/parent",
         "edges/right",
         "format/name",
         "format/version",
         "indexes/edge_insertion_order",
         "indexes/edge_removal_order",
         "individuals/flags",
         "individuals/location",
         "individuals/location_offset",
         "individuals/metadata",
         "individuals/metadata_offset",
         "migrations/dest",
         "migrations/left",
         "migrations/node",
         "migrations/right",
         "migrations/source",
         "migrations/time",
         "mutations/derived_state",
         "mutations/derived_state_offset",
         "mutations/metadata",
         "mutations/metadata_offset",
         "mutations/node",
         "mutations/parent",
         "mutations/site",
         "nodes/flags",
         "nodes/individual",
         "nodes/metadata",
         "nodes/metadata_offset",
         "nodes/population",
         "nodes/time",
         "populations/metadata",
         "populations/metadata_offset",
         "provenances/record",
         "provenances/record_offset",
         "provenances/timestamp",
         "provenances/timestamp_offset",
         "sequence_length",
         "sites/ancestral_state",
         "sites/ancestral_state_offset",
         "sites/metadata",
         "sites/metadata_offset",
         "sites/position",
         "uuid",
     ]
     ts.dump(self.temp_file)
     # Use a context manager so the store's file handle is closed
     # deterministically instead of being leaked.
     with kastore.load(self.temp_file) as store:
         self.assertEqual(sorted(store.keys()), keys)
Example #17
0
 def verify(self, data):
     """Round-trip data through both engines and both read modes."""
     for engine in (kas.C_ENGINE, kas.PY_ENGINE):
         for read_all in (True, False):
             kas.dump(data, self.temp_file, engine=engine)
             loaded = kas.load(
                 self.temp_file, read_all=read_all, engine=engine
             )
             self.assertEqual(sorted(loaded.keys()), sorted(data.keys()))
             for key, expected in data.items():
                 # Numpy's testing assert_equal will deal correctly with NaNs.
                 np.testing.assert_equal(expected, loaded[key])
             # Make sure the file is closed before opening it again.
             del loaded
Example #18
0
    def test_empty_migration_metadata(self):
        """Migration metadata columns may be absent from the file entirely."""
        ts1 = migration_example()
        ts1.dump(self.temp_file)
        ts2 = tskit.load(self.temp_file)
        self.assertEqual(ts1.tables, ts2.tables)
        self.assertEqual(len(ts1.tables.migrations.metadata), 0)

        with kastore.load(self.temp_file) as store:
            contents = dict(store)
        # Drop both the data and offset columns together.
        for key in ("migrations/metadata", "migrations/metadata_offset"):
            del contents[key]
        kastore.dump(contents, self.temp_file)
        ts3 = tskit.load(self.temp_file)
        self.assertEqual(ts1.tables, ts3.tables)
Example #19
0
    def test_empty_edge_metadata(self):
        """Edge metadata columns may be absent from the file entirely."""
        ts1 = migration_example()
        ts1.dump(self.temp_file)
        ts2 = tskit.load(self.temp_file)
        assert ts1.tables == ts2.tables
        assert len(ts1.tables.edges.metadata) == 0

        with kastore.load(self.temp_file) as store:
            contents = dict(store)
        # Drop both the data and offset columns together.
        for key in ("edges/metadata", "edges/metadata_offset"):
            del contents[key]
        kastore.dump(contents, self.temp_file)
        ts3 = tskit.load(self.temp_file)
        assert ts1.tables == ts3.tables
Example #20
0
 def verify_fields(self, ts):
     """Check that deleting any required key makes the file unloadable.

     Metadata-schema keys are optional, so removing them is not expected
     to raise and they are skipped.

     :param ts: the tree sequence to dump to ``self.temp_file`` and mutate.
     """
     ts.dump(self.temp_file)
     with kastore.load(self.temp_file) as store:
         all_data = dict(store)
     for key in all_data.keys():
         # We skip this key as it is optional
         if "metadata_schema" not in key:
             data = dict(all_data)
             del data[key]
             # NOTE: a leftover debugging print(key) was removed here.
             kastore.dump(data, self.temp_file)
             with self.assertRaises(
                 (exceptions.FileFormatError, exceptions.LibraryError)):
                 tskit.load(self.temp_file)
Example #21
0
 def test_empty_mutation_time(self):
     """Removing mutations/time yields unknown mutation times on reload."""
     ts1 = migration_example()
     ts1.dump(self.temp_file)
     ts2 = tskit.load(self.temp_file)
     assert ts1.tables == ts2.tables
     assert len(ts1.tables.mutations.metadata) == 0
     with kastore.load(self.temp_file) as store:
         contents = dict(store)
     contents.pop("mutations/time")
     kastore.dump(contents, self.temp_file)
     ts3 = tskit.load(self.temp_file)
     # Null out the time column
     expected = ts1.dump_tables()
     expected.mutations.time = np.full_like(
         expected.mutations.time, tskit.UNKNOWN_TIME
     )
     expected.assert_equals(ts3.tables)
Example #22
0
    def load(cls, path):
        '''
        Load a :class:`SlimTreeSequence` from a .trees file on disk.

        :param string path: The path to a .trees file.
        :rtype SlimTreeSequence:
        '''
        ts = tskit.load(path)
        # extract the reference sequence from the kastore; close the store
        # promptly so the underlying file handle is not leaked
        with kastore.load(path) as kas:
            if 'reference_sequence/data' in kas:
                int_rs = kas['reference_sequence/data']
                # tobytes(): ndarray.tostring() was deprecated in numpy 1.19
                # and removed in numpy 2.0.
                reference_sequence = int_rs.tobytes().decode('ascii')
            else:
                reference_sequence = None
        return cls(ts, reference_sequence)
Example #23
0
def load_from_stream(q_err, q_out, file_in, engine, read_all):
    """
    Repeatedly kas.load() stores from `file_in` and put their data onto
    the `q_out` queue. Uncaught exceptions are placed onto the `q_err`
    queue as (exception, formatted-traceback) pairs.
    """
    try:
        with open(file_in, "rb") as stream:
            while True:
                try:
                    store = kas.load(stream, read_all=read_all, engine=engine)
                except EOFError:
                    # End of stream: no more stores to read.
                    break
                q_out.put(dict(store.items()))
    except Exception as exc:
        q_err.put((exc, traceback.format_exc()))
Example #24
0
    def dump(self, path, **kwargs):
        '''
        Dumps the tree sequence to the path specified. This is mostly just a wrapper for
        tskit.TreeSequence.dump(), but also writes out the reference sequence.

        :param str path: The file path to write the TreeSequence to.
        :param kwargs: Additional keyword args to pass to tskit.TreeSequence.dump
        '''
        super(SlimTreeSequence, self).dump(path, **kwargs)
        if self.reference_sequence is None:
            return
        # to convert to a kastore store we need to reload from a file,
        # and for it to be mutable we need to make it a dict
        store = dict(kastore.load(path))
        encoded = self.reference_sequence.encode()
        store['reference_sequence/data'] = np.frombuffer(encoded, dtype=np.int8)
        kastore.dump(store, path)
Example #25
0
 def test_empty_individual_parents(self):
     """Missing individuals/parents columns reload as empty parent lists."""
     ts1 = migration_example()
     ts1.dump(self.temp_file)
     ts2 = tskit.load(self.temp_file)
     assert ts1.tables == ts2.tables
     assert len(ts1.tables.individuals.parents) > 0
     with kastore.load(self.temp_file) as store:
         contents = dict(store)
     # Drop both the data and offset columns together.
     for key in ("individuals/parents", "individuals/parents_offset"):
         del contents[key]
     kastore.dump(contents, self.temp_file)
     ts3 = tskit.load(self.temp_file)
     expected = ts1.dump_tables()
     empty_parents = [[] for _ in range(expected.individuals.num_rows)]
     expected.individuals.packset_parents(empty_parents)
     expected.assert_equals(ts3.tables)
Example #26
0
 def test_load_and_dump_fd_multi_rw(self):
     """Several stores written to one fd can all be read back in order."""
     datalist = [
         {
             "i" + str(i): i + np.arange(10**5, dtype=int),
             "f" + str(i): i + np.arange(10**5, dtype=float),
         }
         for i in range(20)
     ]
     with open(self.temp_file, "r+b") as f:
         fd = f.fileno()
         for data in datalist:
             kas.dump(data, fd, engine=self.engine)
         for read_all in (True, False):
             # Rewind and read every store back in write order.
             os.lseek(fd, 0, os.SEEK_SET)
             for expected in datalist:
                 loaded = kas.load(fd, read_all=read_all, engine=self.engine)
                 self.verify_dicts_equal(expected, dict(loaded.items()))
Example #27
0
def load_parameters_from_file(filename):
    """
    Load parameters from the provenance record of `filename`.

    Using tskit can be slow when loading provenance records for a lot of files.
    We deliberately bypass the tskit API here, so that we don't unnecessarily
    load and parse all of the treesequence tables.

    :param filename: Path to a kastore-format .trees file.
    :return: The "parameters" entry of the first provenance record written
        by genomatnn, or None if no such record is found.
    """
    # Close the store deterministically rather than leaking the handle.
    with kastore.load(filename) as ka:
        record_offset = ka["provenances/record_offset"]
        records = ka["provenances/record"]
        i = record_offset[0]
        for j in record_offset[1:]:
            # tobytes(): ndarray.tostring() was deprecated in numpy 1.19
            # and removed in numpy 2.0.
            record = records[i:j].tobytes()
            d = json.loads(record)
            if d["software"]["name"] == genomatnn.__name__:
                return d.get("parameters")
            i = j
    return None
Example #28
0
 def test_no_reference_sequence(self, ts_fixture, tmp_path):
     """Deleting every reference_sequence key removes the reference sequence."""
     ts1 = ts_fixture
     trees_path = tmp_path / "tmp.trees"
     ts1.dump(trees_path)
     with kastore.load(trees_path) as store:
         contents = dict(store)
     for field in ("metadata_schema", "metadata", "data", "url"):
         del contents[f"reference_sequence/{field}"]
     # Sanity check: no reference_sequence keys remain.
     for key in contents.keys():
         assert not key.startswith("reference_sequence")
     kastore.dump(contents, trees_path)
     ts2 = tskit.load(trees_path)
     assert not ts2.has_reference_sequence()
     tables = ts2.dump_tables()
     # Restoring the reference sequence makes the tables equal again.
     tables.reference_sequence = ts1.reference_sequence
     tables.assert_equals(ts1.tables)
Example #29
0
 def verify_missing_fields(self, ts):
     """Check that deleting any required key makes the file unloadable."""
     ts.dump(self.temp_file)
     with kastore.load(self.temp_file) as store:
         all_data = dict(store)
     # These keys are optional, so removing them is not an error.
     optional = ("metadata", "time_units", "mutations/time")
     for missing in all_data:
         if "metadata_schema" in missing or missing in optional:
             continue
         subset = {k: v for k, v in all_data.items() if k != missing}
         kastore.dump(subset, self.temp_file)
         with pytest.raises(
             (exceptions.FileFormatError, exceptions.LibraryError)
         ):
             tskit.load(self.temp_file)
Example #30
0
 def test_load_and_dump_fileobj_multi(self):
     """Multiple stores in one file load back at the recorded offsets."""
     datalist = [
         {
             "i" + str(i): i + np.arange(10**5, dtype=int),
             "f" + str(i): i + np.arange(10**5, dtype=float),
         }
         for i in range(10)
     ]
     offsets = []
     with open(self.temp_file, "wb") as f:
         for data in datalist:
             kas.dump(data, f, engine=self.engine)
             offsets.append(f.tell())
     for read_all in (True, False):
         with open(self.temp_file, "rb") as f:
             for expected, offset in zip(datalist, offsets):
                 loaded = kas.load(f, read_all=read_all, engine=self.engine)
                 self.verify_dicts_equal(expected, dict(loaded.items()))
                 # Each read must stop exactly at the next store boundary.
                 self.assertEqual(f.tell(), offset)