Example #1
0
 def test_nonexistent_file(self) -> None:
     """Expect ``load`` to raise ``FileNotFoundError`` for a missing file.

     The only series, ``source``, is bound to the path
     ``some_nonexistent_file``, which is assumed not to exist in the
     directory the tests run from.
     """
     load_kwargs = dict(
         name="name",
         series=["source"],
         data=[(["some_nonexistent_file"], UtfPlainTextReader)],
         batching=DEFAULT_BATCHING_SCHEME,
         buffer_size=5)

     # Only the call itself is expected to raise, so it alone sits in
     # the assertion context.
     with self.assertRaises(FileNotFoundError):
         load(**load_kwargs)
Example #2
0
    def test_glob(self):
        """Check that glob patterns given as data sources pick the right files.

        Four one-line files are written into a temporary directory. The
        dataset is then loaded from the patterns ``abc?`` and ``xyz*``, and
        the test expects the ``data`` series to contain the contents of
        ``abc1``, ``abc2`` and ``xyz`` (in that order) but not ``abcxx``.
        """
        names = sorted(["abc1", "abc2", "abcxx", "xyz"])
        file_contents = dict(zip(names, ["a", "b", "c", "d"]))

        with tempfile.TemporaryDirectory() as tmp_dir:
            # One line of text per fixture file.
            for fname, text in file_contents.items():
                target = os.path.join(tmp_dir, fname)
                with open(target, "w") as handle:
                    handle.write(text + "\n")

            patterns = [os.path.join(tmp_dir, pattern)
                        for pattern in ("abc?", "xyz*")]
            dataset = load(name="dataset", series=["data"], data=[patterns])

            loaded = list(dataset.get_series("data"))
            self.assertEqual(loaded, [["a"], ["b"], ["d"]])
Example #3
0
    def test_lazy_dataset(self):
        """Verify that the dataset pulls items from its reader on demand.

        A generator-based reader records, in an enclosing variable, the index
        of the item it most recently yielded. While the ``source_prep``
        series is consumed, that index must equal the number of items already
        consumed, and it must be 9 once the series is exhausted.
        """
        last_yielded = 0  # index of the item most recently produced

        def counting_reader(files: List[str]) -> Iterable[List[str]]:
            nonlocal last_yielded
            del files
            # The loop variable is the enclosing counter on purpose.
            for last_yielded in range(10):  # pylint: disable=unused-variable
                yield ["foo"]

        # NOTE(review): "source_prep" is presumably derived from "source"
        # through the identity lambda -- confirm against ``load``.
        dataset = load(
            name="data",
            series=["source", "source_prep"],
            data=[([], counting_reader), (lambda x: x, "source")],
            buffer_size=5)

        consumed = 0
        for _ in dataset.get_series("source_prep"):
            # The reader must not have run ahead of the consumer.
            self.assertEqual(last_yielded, consumed)
            consumed += 1

        self.assertEqual(last_yielded, 9)
Example #4
0
 def test_nonexistent_file(self):
     """Loading from a path that does not exist must raise FileNotFoundError.

     ``some_nonexistent_file`` is assumed to be absent from the directory
     the tests run from.
     """
     missing_files = ["some_nonexistent_file"]
     source_spec = (missing_files, UtfPlainTextReader)

     with self.assertRaises(FileNotFoundError):
         load(name="name",
              series=["source"],
              data=[source_spec],
              buffer_size=5)