def test_nonexistent_file(self) -> None:
    """Loading from a missing file must raise FileNotFoundError.

    Uses the explicit default batching scheme; the error should surface
    before any batching happens.
    """
    missing_spec = [(["some_nonexistent_file"], UtfPlainTextReader)]
    with self.assertRaises(FileNotFoundError):
        load(name="name",
             series=["source"],
             data=missing_spec,
             batching=DEFAULT_BATCHING_SCHEME,
             buffer_size=5)
def test_glob(self):
    """Wildcard patterns in data paths expand to the matching files.

    Writes four one-line files into a temp directory and checks that
    "abc?" matches abc1/abc2 (but not abcxx) and "xyz*" matches xyz,
    so the series yields the contents of exactly those three files.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        pairs = zip(sorted(["abc1", "abc2", "abcxx", "xyz"]),
                    ["a", "b", "c", "d"])
        for fname, text in pairs:
            with open(os.path.join(tmp_dir, fname), "w") as handle:
                handle.write(text + "\n")

        dataset = load(
            name="dataset",
            series=["data"],
            data=[[os.path.join(tmp_dir, "abc?"),
                   os.path.join(tmp_dir, "xyz*")]])

        self.assertEqual(list(dataset.get_series("data")),
                         [["a"], ["b"], ["d"]])
def test_lazy_dataset(self):
    """A preprocessed series must pull items from its reader lazily.

    The reader advances the shared counter ``i`` each time it yields,
    so if the dataset consumed the reader eagerly, ``i`` would already
    be 9 before the consumer loop starts.
    """
    i = 0  # iteration counter, written by the reader via nonlocal

    def reader(files: List[str]) -> Iterable[List[str]]:
        del files
        nonlocal i
        for i in range(10):  # pylint: disable=unused-variable
            yield ["foo"]

    dataset = load(
        name="data",
        series=["source", "source_prep"],
        data=[([], reader), (lambda x: x, "source")],
        buffer_size=5)

    prep_series = dataset.get_series("source_prep")

    # Lazy iteration: the reader's counter keeps pace with the consumer.
    for consumed, _ in enumerate(prep_series):
        self.assertEqual(i, consumed)
    self.assertEqual(i, 9)
def test_nonexistent_file_implicit_batching(self) -> None:
    """Loading from a missing file must raise FileNotFoundError.

    Variant of the test above that omits the ``batching`` argument,
    exercising the loader's implicit/default batching path.

    NOTE(review): this method was previously also named
    ``test_nonexistent_file``, which shadowed the earlier method of the
    same name (the explicit-batching variant) so that it was never
    collected or run. Renamed so both tests execute.
    """
    with self.assertRaises(FileNotFoundError):
        load(name="name",
             series=["source"],
             data=[(["some_nonexistent_file"], UtfPlainTextReader)],
             buffer_size=5)