def test_decompress_gzip(self):
    """Check that ``utils._decompress`` unpacks a gzip file to the path without ``.gz``.

    A small text payload is gzip-compressed into a temporary directory, the
    archive is handed to ``utils._decompress``, and the test then verifies that
    the suffix-less file exists and holds the original payload.
    """

    def write_gzip_fixture(directory, text="this is the content"):
        # Returns (compressed path, expected decompressed path, payload).
        target = os.path.join(directory, "file")
        archive = f"{target}.gz"
        with gzip.open(archive, "wb") as stream:
            stream.write(text.encode())
        return archive, target, text

    with get_tmp_dir() as scratch_dir:
        archive, target, expected = write_gzip_fixture(scratch_dir)

        utils._decompress(archive)

        self.assertTrue(os.path.exists(target))
        with open(target, "r") as stream:
            actual = stream.read()
        self.assertEqual(actual, expected)
def test_decompress(self, extension, tmpdir):
    """Check that ``utils._decompress`` unpacks a file compressed with ``extension``.

    The opener used to create the fixture is looked up in
    ``_COMPRESSED_FILE_OPENERS`` by ``extension``. After decompression, the
    file without the extension must exist and contain the original payload.
    """
    expected = "this is the content"

    # Build the compressed fixture: "<tmpdir>/file<extension>".
    target = os.path.join(tmpdir, "file")
    archive = f"{target}{extension}"
    open_compressed = _COMPRESSED_FILE_OPENERS[extension]
    with open_compressed(archive, "wb") as stream:
        stream.write(expected.encode())

    utils._decompress(archive)

    assert os.path.exists(target)
    with open(target, "r") as stream:
        actual = stream.read()
    assert actual == expected
def _decompress(file: pathlib.Path) -> None:
    """Decompress ``file`` by forwarding it, as a string path, to ``_decompress``.

    The call passes ``remove_finished=True`` and discards the result.

    NOTE(review): the callee has the same name as this definition. It only
    resolves to a different function if this definition lives in a class body
    (e.g. as a static method) while a module-level ``_decompress`` helper is
    in scope -- confirm against the enclosing file.
    """
    path_as_str = str(file)
    _decompress(path_as_str, remove_finished=True)
def test_decompress_no_compression(self):
    """Expect ``utils._decompress`` to raise ``RuntimeError`` for ``"foo.tar"``."""
    # Callable form of assertRaises: invokes utils._decompress("foo.tar") for us.
    self.assertRaises(RuntimeError, utils._decompress, "foo.tar")
def test_decompress_no_compression(self):
    """Expect ``utils._decompress`` to raise ``RuntimeError`` for ``"foo.tar"``."""
    archive_without_compression = "foo.tar"
    with pytest.raises(RuntimeError):
        utils._decompress(archive_without_compression)
def _decompress(file: pathlib.Path) -> pathlib.Path:
    """Decompress ``file`` via ``_decompress`` and return the result as a ``pathlib.Path``.

    ``file`` is converted to a string, forwarded with ``remove_finished=True``,
    and whatever the callee returns is wrapped in ``pathlib.Path``.

    NOTE(review): the callee has the same name as this definition. It only
    resolves to a different function if this definition lives in a class body
    (e.g. as a static method) while a module-level ``_decompress`` helper is
    in scope -- confirm against the enclosing file.
    """
    path_as_str = str(file)
    decompressed = _decompress(path_as_str, remove_finished=True)
    return pathlib.Path(decompressed)
class TestOnlineResource:
    """Tests for ``OnlineResource.load()`` using a stub subclass with a pluggable download.

    The ``_make_*`` helpers create fixtures on disk below a given root
    directory; the ``test_*`` methods then check which datapipe type ``load()``
    returns and which ``(path, buffer)`` pairs it yields.
    """

    class DummyResource(OnlineResource):
        """``OnlineResource`` whose ``_download()`` delegates to an optional callable."""

        def __init__(self, download_fn=None, **kwargs):
            # `download_fn(resource, root)` is invoked by `_download()`; all other
            # keyword arguments are forwarded to `OnlineResource` untouched.
            super().__init__(**kwargs)
            self._download_fn = download_fn

        def _download(self, root):
            # A test that did not supply `download_fn` does not expect a download,
            # so surface this as a usage error of the test helper.
            if self._download_fn is None:
                raise pytest.UsageError(
                    "`_download()` was called, but `DummyResource(...)` was constructed without `download_fn`."
                )

            return self._download_fn(self, root)

    def _make_file(self, root, *, content, name="file.txt"):
        """Write the text ``content`` to ``root / name`` and return that path."""
        file = root / name
        with open(file, "w") as fh:
            fh.write(content)

        return file

    def _make_folder(self, root, *, name="folder"):
        """Create ``root / name`` with a ``subfolder`` and three sentinel files.

        Two files are placed directly in the folder and one in ``subfolder``.

        Returns:
            A ``(folder, files)`` tuple where ``files`` maps ``str(path)`` of
            every created file to its text content.
        """
        folder = root / name
        subfolder = folder / "subfolder"
        subfolder.mkdir(parents=True)

        files = {}
        # Use a dedicated loop name rather than rebinding the `root` parameter.
        for idx, parent in enumerate([folder, folder, subfolder]):
            content = f"sentinel{idx}"
            file = self._make_file(parent, name=f"file{idx}.txt", content=content)
            files[str(file)] = content

        return folder, files

    def _make_tar(self, root, *, name="archive.tar", remove=True):
        """Create a folder fixture and pack it with ``make_tar``.

        The folder is named after the part of ``name`` before the first dot.

        Returns:
            An ``(archive, files)`` tuple. The keys of ``files`` are re-rooted
            from ``root / ...`` to ``archive / ...``.

        NOTE(review): ``remove`` is forwarded to ``make_tar``; presumably it
        controls whether the source folder is deleted after archiving -- confirm.
        """
        folder, files = self._make_folder(root, name=name.split(".")[0])
        archive = make_tar(root, name, folder, remove=remove)
        files = {str(archive / pathlib.Path(file).relative_to(root)): content for file, content in files.items()}
        return archive, files

    def test_load_file(self, tmp_path):
        """A single existing file is loaded as a ``FileOpener`` yielding exactly that file."""
        content = "sentinel"
        file = self._make_file(tmp_path, content=content)

        resource = self.DummyResource(file_name=file.name)

        dp = resource.load(tmp_path)
        assert isinstance(dp, FileOpener)

        data = list(dp)
        assert len(data) == 1

        path, buffer = data[0]
        assert path == str(file)
        assert buffer.read().decode() == content

    def test_load_folder(self, tmp_path):
        """An existing folder is loaded as a ``FileOpener`` yielding every contained file."""
        folder, files = self._make_folder(tmp_path)

        resource = self.DummyResource(file_name=folder.name)

        dp = resource.load(tmp_path)
        assert isinstance(dp, FileOpener)
        assert {path: buffer.read().decode() for path, buffer in dp} == files

    def test_load_archive(self, tmp_path):
        """An existing tar archive is loaded as a ``TarArchiveLoader`` yielding its members."""
        archive, files = self._make_tar(tmp_path)

        resource = self.DummyResource(file_name=archive.name)

        dp = resource.load(tmp_path)
        assert isinstance(dp, TarArchiveLoader)
        assert {path: buffer.read().decode() for path, buffer in dp} == files

    def test_priority_decompressed_gt_raw(self, tmp_path):
        """With ``file.txt`` and ``file.txt.gz`` both present, ``file.txt`` is the one loaded."""
        # We don't need to actually compress here. Adding the suffix is sufficient
        self._make_file(tmp_path, content="raw_sentinel", name="file.txt.gz")
        file = self._make_file(tmp_path, content="decompressed_sentinel", name="file.txt")

        resource = self.DummyResource(file_name=file.name)

        dp = resource.load(tmp_path)
        path, buffer = next(iter(dp))

        assert path == str(file)
        assert buffer.read().decode() == "decompressed_sentinel"

    def test_priority_extracted_gt_decompressed(self, tmp_path):
        """With the archive created via ``remove=False``, ``load()`` returns a ``FileOpener``."""
        archive, _ = self._make_tar(tmp_path, remove=False)

        resource = self.DummyResource(file_name=archive.name)

        dp = resource.load(tmp_path)
        # If the archive had been selected, this would be a `TarArchiveLoader`
        assert isinstance(dp, FileOpener)

    def test_download(self, tmp_path):
        """``load()`` triggers the download when the requested file is not on disk."""
        download_fn_was_called = False

        def download_fn(resource, root):
            nonlocal download_fn_was_called
            download_fn_was_called = True

            return self._make_file(root, content="_", name=resource.file_name)

        resource = self.DummyResource(
            file_name="file.txt",
            download_fn=download_fn,
        )

        resource.load(tmp_path)

        assert download_fn_was_called, "`download_fn()` was never called"

    # This tests the `"decompress"` literal as well as a custom callable
    @pytest.mark.parametrize(
        "preprocess",
        [
            "decompress",
            lambda path: _decompress(str(path), remove_finished=True),
        ],
    )
    def test_preprocess_decompress(self, tmp_path, preprocess):
        """A downloaded ``.gz`` file is preprocessed into the file without the ``.gz`` suffix."""
        file_name = "file.txt.gz"
        content = "sentinel"

        def download_fn(resource, root):
            file = root / resource.file_name
            with gzip.open(file, "wb") as fh:
                fh.write(content.encode())
            return file

        resource = self.DummyResource(file_name=file_name, preprocess=preprocess, download_fn=download_fn)

        dp = resource.load(tmp_path)
        data = list(dp)
        assert len(data) == 1

        path, buffer = data[0]
        # Strip only the trailing ".gz" of the file name. A plain string replace
        # would also rewrite any ".gz" that happens to occur in `tmp_path` itself.
        assert path == str((tmp_path / file_name).with_suffix(""))
        assert buffer.read().decode() == content

    def test_preprocess_extract(self, tmp_path):
        """A downloaded tar archive is preprocessed into a folder loaded via ``FileOpener``."""
        files = None

        def download_fn(resource, root):
            nonlocal files
            archive, files = self._make_tar(root, name=resource.file_name)
            return archive

        resource = self.DummyResource(file_name="folder.tar", preprocess="extract", download_fn=download_fn)

        dp = resource.load(tmp_path)
        assert files is not None, "`download_fn()` was never called"
        assert isinstance(dp, FileOpener)

        actual = {path: buffer.read().decode() for path, buffer in dp}
        # Expected paths are the archive-rooted ones with the archive name
        # ("folder.tar") swapped for the name before the first dot ("folder").
        expected = {
            path.replace(resource.file_name, resource.file_name.split(".")[0]): content
            for path, content in files.items()
        }
        assert actual == expected

    def test_preprocess_only_after_download(self, tmp_path):
        """``preprocess`` must not run when the requested file already exists on disk."""
        file = self._make_file(tmp_path, content="_")

        def preprocess(path):
            raise AssertionError("`preprocess` was called although the file was already present.")

        resource = self.DummyResource(
            file_name=file.name,
            preprocess=preprocess,
        )

        resource.load(tmp_path)