def test_xopen_local(text_path):
    """Check that ``xopen`` yields the same lines as the builtin ``open`` for ``text_path``."""
    from datasets.utils.streaming_download_manager import xopen

    with xopen(text_path, encoding="utf-8") as streamed:
        with open(text_path, encoding="utf-8") as reference:
            actual_lines = list(streamed)
            expected_lines = list(reference)
            assert actual_lines == expected_lines
def test_streaming_gg_drive_zipped():
    """Download-and-extract ``TEST_GG_DRIVE_ZIPPED_URL`` and verify its single entry.

    The extracted location must glob to exactly one entry whose base name is
    ``TEST_GG_DRIVE_FILENAME`` and whose content is ``TEST_GG_DRIVE_CONTENT``.
    """
    manager = StreamingDownloadManager()
    extracted_root = manager.download_and_extract(TEST_GG_DRIVE_ZIPPED_URL)
    pattern = xjoin(extracted_root, "*")
    members = list(xglob(pattern))
    assert len(members) == 1
    only_member = members[0]
    assert xbasename(only_member) == TEST_GG_DRIVE_FILENAME
    with xopen(only_member) as stream:
        content = stream.read()
        assert content == TEST_GG_DRIVE_CONTENT
def test_streaming_dl_manager_download(text_path):
    """``download`` must return ``text_path`` unchanged, readable through ``xopen``."""
    manager = StreamingDownloadManager()
    downloaded = manager.download(text_path)
    assert downloaded == text_path
    with xopen(downloaded, encoding="utf-8") as actual_file:
        with open(text_path, encoding="utf-8") as expected_file:
            actual_text = actual_file.read()
            expected_text = expected_file.read()
            assert actual_text == expected_text
def test_xopen_local(text_path):
    """Check ``xopen`` (str path) and ``xpathopen`` (``Path``) against the builtin ``open``.

    Both openers are called in text mode ``"r"`` with UTF-8 and must yield
    the same list of lines as ``open(text_path)``.
    """
    openers_and_targets = (
        (xopen, text_path),
        (xpathopen, Path(text_path)),
    )
    for opener, target in openers_and_targets:
        with opener(target, "r", encoding="utf-8") as streamed:
            with open(text_path, encoding="utf-8") as reference:
                actual_lines = list(streamed)
                expected_lines = list(reference)
                assert actual_lines == expected_lines
def test_streaming_dl_manager_download_and_extract_with_extraction(text_gz_path, text_path):
    """``download_and_extract`` on ``text_gz_path`` must return a chained ``gzip://`` URL.

    The expected URL is ``gzip://<basename without last extension>::<text_gz_path>``,
    and reading it via ``xopen`` must give the content of ``text_path``.
    """
    manager = StreamingDownloadManager()
    extracted_url = manager.download_and_extract(text_gz_path)

    archive_name = os.path.basename(text_gz_path)
    # Drop everything from the last dot onwards (the compression extension).
    last_dot = archive_name.rindex(".")
    inner_name = archive_name[:last_dot]
    expected_url = f"gzip://{inner_name}::{text_gz_path}"
    assert extracted_url == expected_url

    opened = xopen(extracted_url, encoding="utf-8")
    with opened as actual_file:
        with open(text_path, encoding="utf-8") as expected_file:
            actual_text = actual_file.read()
            expected_text = expected_file.read()
            assert actual_text == expected_text
def test_streaming_dl_manager_download(text_path):
    """``download`` must return ``text_path`` unchanged, readable through ``xopen``."""
    from datasets.utils.streaming_download_manager import StreamingDownloadManager, xopen

    manager = StreamingDownloadManager()
    downloaded = manager.download(text_path)
    assert downloaded == text_path
    with xopen(downloaded, encoding="utf-8") as actual_file:
        with open(text_path, encoding="utf-8") as expected_file:
            actual_text = actual_file.read()
            expected_text = expected_file.read()
            assert actual_text == expected_text
def test_streaming_dl_manager_download_and_extract_with_extraction(
    text_gz_path, text_path
):
    """``download_and_extract`` on ``text_gz_path`` must return a chained ``gzip://`` URL.

    The expected URL is ``gzip://<basename without ".gz">::<text_gz_path>``,
    and reading it via ``xopen`` must give the content of ``text_path``.
    """
    from datasets.utils.streaming_download_manager import StreamingDownloadManager, xopen

    dl_manager = StreamingDownloadManager()
    archive_name = os.path.basename(text_gz_path)
    # Remove the ".gz" suffix explicitly. ``str.rstrip(".gz")`` would strip any
    # trailing run of the characters ".", "g" and "z" (e.g. "log.gz" -> "lo"),
    # not the suffix itself.
    suffix = ".gz"
    if archive_name.endswith(suffix):
        filename = archive_name[: -len(suffix)]
    else:
        filename = archive_name
    out = dl_manager.download_and_extract(text_gz_path)
    assert out == f"gzip://{filename}::{text_gz_path}"
    with xopen(out, encoding="utf-8") as f, open(text_path, encoding="utf-8") as expected_file:
        assert f.read() == expected_file.read()
def test_streaming_dl_manager_extract_all_supported_single_file_compression_types(
    compression_fs_class, gz_file, xz_file, zstd_file, bz2_file, lz4_file, text_file
):
    """``extract`` must return a chained URL for the protocol of ``compression_fs_class``.

    The compressed fixture is selected by ``compression_fs_class.protocol``.
    The expected URL is ``<protocol>://<basename without last extension>::<input path>``,
    and reading it via ``xopen`` must give the content of ``text_file``.
    """
    compressed_by_protocol = dict(
        gzip=gz_file,
        xz=xz_file,
        zstd=zstd_file,
        bz2=bz2_file,
        lz4=lz4_file,
    )
    protocol = compression_fs_class.protocol
    input_path = str(compressed_by_protocol[protocol])

    manager = StreamingDownloadManager()
    extracted_url = manager.extract(input_path)

    archive_name = os.path.basename(input_path)
    # Drop everything from the last dot onwards (the compression extension).
    last_dot = archive_name.rindex(".")
    inner_name = archive_name[:last_dot]
    expected_url = f"{protocol}://{inner_name}::{input_path}"
    assert extracted_url == expected_url

    opened = xopen(extracted_url, encoding="utf-8")
    with opened as actual_file:
        with open(text_file, encoding="utf-8") as expected_file:
            actual_text = actual_file.read()
            expected_text = expected_file.read()
            assert actual_text == expected_text
def test_streaming_gg_drive_gzipped():
    """Download-and-extract ``TEST_GG_DRIVE_GZIPPED_URL`` and compare its content.

    The content read through ``xopen`` must equal ``TEST_GG_DRIVE_CONTENT``.
    """
    manager = StreamingDownloadManager()
    extracted_url = manager.download_and_extract(TEST_GG_DRIVE_GZIPPED_URL)
    with xopen(extracted_url) as stream:
        content = stream.read()
        assert content == TEST_GG_DRIVE_CONTENT
def test_streaming_gg_drive():
    """Open ``TEST_GG_DRIVE_URL`` with ``xopen`` and expect ``TEST_GG_DRIVE_CONTENT``."""
    with xopen(TEST_GG_DRIVE_URL) as stream:
        content = stream.read()
        assert content == TEST_GG_DRIVE_CONTENT
def test_xopen_remote():
    """Read ``TEST_URL`` line by line with ``xopen`` and with ``xpathopen``.

    Both must yield the lines of ``TEST_URL_CONTENT`` with line endings kept.
    """
    with xopen(TEST_URL, "r", encoding="utf-8") as stream:
        lines_from_str = list(stream)
        expected_lines = TEST_URL_CONTENT.splitlines(keepends=True)
        assert lines_from_str == expected_lines

    url_as_path = Path(TEST_URL)
    with xpathopen(url_as_path, "r", encoding="utf-8") as stream:
        lines_from_path = list(stream)
        expected_lines = TEST_URL_CONTENT.splitlines(keepends=True)
        assert lines_from_path == expected_lines
def test_xopen_remote():
    """Read ``TEST_URL`` line by line with ``xopen``.

    The lines must equal those of ``TEST_URL_CONTENT`` with line endings kept.
    """
    from datasets.utils.streaming_download_manager import xopen

    with xopen(TEST_URL, encoding="utf-8") as stream:
        actual_lines = list(stream)
        expected_lines = TEST_URL_CONTENT.splitlines(keepends=True)
        assert actual_lines == expected_lines