Esempio n. 1
0
def test_iter_archive_path(tar_jsonl_path):
    """Iterate an archive given by its path and check every yielded member.

    Each ``(path, file)`` pair produced by ``iter_archive`` is passed to
    ``_test_jsonl``; exactly two members are expected.
    """
    dl_manager = StreamingDownloadManager()
    # Start at 0 so an archive that yields nothing fails the assertion below
    # instead of raising UnboundLocalError on the loop variable.
    num_jsonl = 0
    for num_jsonl, (path, file) in enumerate(dl_manager.iter_archive(str(tar_jsonl_path)), start=1):
        _test_jsonl(path, file)
    assert num_jsonl == 2
def test_streaming_dl_manager_download(text_path):
    """Check that ``download`` returns the input path and that it reads back unchanged.

    The returned path is opened with ``xopen`` and its content is compared
    with the content of ``text_path`` opened with the builtin ``open``.
    """
    manager = StreamingDownloadManager()
    downloaded = manager.download(text_path)
    assert downloaded == text_path
    with xopen(downloaded, encoding="utf-8") as streamed:
        with open(text_path, encoding="utf-8") as reference:
            streamed_text = streamed.read()
            reference_text = reference.read()
            assert streamed_text == reference_text
Esempio n. 3
0
def test_streaming_dl_manager_download_and_extract_with_extraction(text_gz_path, text_path):
    """Check the chained URL returned by ``download_and_extract`` and its content.

    The expected URL is ``gzip://<basename without last extension>::<input path>``.
    The URL is then opened with ``xopen`` and compared with ``text_path``.
    """
    manager = StreamingDownloadManager()
    extracted_url = manager.download_and_extract(text_gz_path)

    # Drop everything from the last dot onwards to get the inner file name.
    archive_name = os.path.basename(text_gz_path)
    inner_name = archive_name[: archive_name.rindex(".")]
    assert extracted_url == f"gzip://{inner_name}::{text_gz_path}"

    with xopen(extracted_url, encoding="utf-8") as streamed, open(text_path, encoding="utf-8") as reference:
        streamed_text = streamed.read()
        reference_text = reference.read()
        assert streamed_text == reference_text
def test_streaming_dl_manager_download(text_path):
    """Check that ``download`` returns the input path and that it reads back unchanged.

    ``StreamingDownloadManager`` and ``xopen`` are imported inside the test
    body rather than at module level.
    """
    from datasets.utils.streaming_download_manager import StreamingDownloadManager, xopen

    manager = StreamingDownloadManager()
    downloaded = manager.download(text_path)
    assert downloaded == text_path
    with xopen(downloaded, encoding="utf-8") as streamed:
        with open(text_path, encoding="utf-8") as reference:
            streamed_text = streamed.read()
            reference_text = reference.read()
            assert streamed_text == reference_text
Esempio n. 5
0
def test_iter_archive_file(tar_nested_jsonl_path):
    """Iterate an archive by path, then iterate each yielded file as an archive.

    The outer ``iter_archive`` call receives the path as a string; every file
    object it yields is passed back to ``iter_archive``. Each inner member is
    checked with ``_test_jsonl``. One outer member and two inner members are
    expected.
    """
    dl_manager = StreamingDownloadManager()
    # Start both counters at 0 so an empty outer or inner archive fails the
    # assertions below instead of raising UnboundLocalError.
    num_tar, num_jsonl = 0, 0
    for num_tar, (path, file) in enumerate(dl_manager.iter_archive(str(tar_nested_jsonl_path)), start=1):
        for num_jsonl, (subpath, subfile) in enumerate(dl_manager.iter_archive(file), start=1):
            _test_jsonl(subpath, subfile)
    assert num_tar == 1
    assert num_jsonl == 2
def test_streaming_dl_manager_download_and_extract_with_extraction(
        text_gz_path, text_path):
    """Check the chained URL returned by ``download_and_extract`` and its content.

    The expected URL is ``gzip://<basename without last extension>::<input path>``.
    The URL is then opened with ``xopen`` and compared with ``text_path``.
    """
    from datasets.utils.streaming_download_manager import StreamingDownloadManager, xopen

    dl_manager = StreamingDownloadManager()
    # Slice off the last extension. ``str.rstrip(".gz")`` would strip any
    # trailing '.', 'g' or 'z' characters (e.g. "log.gz" -> "lo"), not the suffix.
    filename = os.path.basename(text_gz_path)
    filename = filename[: filename.rindex(".")]
    out = dl_manager.download_and_extract(text_gz_path)
    assert out == f"gzip://{filename}::{text_gz_path}"
    with xopen(out, encoding="utf-8") as f, open(text_path, encoding="utf-8") as expected_file:
        assert f.read() == expected_file.read()
def test_iter_archive_path(tar_jsonl_path):
    """Iterate the same ``iter_archive`` result twice and check its members.

    Each pass hands every ``(path, file)`` pair to ``_test_jsonl`` and expects
    exactly two members.
    """
    manager = StreamingDownloadManager()
    members = manager.iter_archive(str(tar_jsonl_path))
    # Two passes over the same iterable: the second one makes sure it is
    # reset correctly after being consumed once.
    for _ in range(2):
        seen = 0
        for seen, (member_path, member_file) in enumerate(members, start=1):
            _test_jsonl(member_path, member_file)
        assert seen == 2
Esempio n. 8
0
def test_streaming_dl_manager_extract_all_supported_single_file_compression_types(
    compression_fs_class, gz_file, xz_file, zstd_file, bz2_file, lz4_file, text_file
):
    """Check ``extract`` for the compression selected by ``compression_fs_class``.

    The input file is picked by ``compression_fs_class.protocol``. The expected
    URL is ``<protocol>://<basename without last extension>::<input path>``,
    and its content must match ``text_file``.
    """
    protocol = compression_fs_class.protocol
    files_by_protocol = {
        "gzip": gz_file,
        "xz": xz_file,
        "zstd": zstd_file,
        "bz2": bz2_file,
        "lz4": lz4_file,
    }
    compressed_path = str(files_by_protocol[protocol])

    extracted_url = StreamingDownloadManager().extract(compressed_path)

    # Drop everything from the last dot onwards to get the inner file name.
    compressed_name = os.path.basename(compressed_path)
    inner_name = compressed_name[: compressed_name.rindex(".")]
    assert extracted_url == f"{protocol}://{inner_name}::{compressed_path}"

    with xopen(extracted_url, encoding="utf-8") as streamed, open(text_file, encoding="utf-8") as reference:
        streamed_text = streamed.read()
        reference_text = reference.read()
        assert streamed_text == reference_text
def test_streaming_gg_drive_zipped():
    """Extract ``TEST_GG_DRIVE_ZIPPED_URL`` and check its single file.

    Globbing ``*`` under the extracted URL must yield exactly one entry whose
    basename is ``TEST_GG_DRIVE_FILENAME`` and whose content is
    ``TEST_GG_DRIVE_CONTENT``.
    """
    extracted_url = StreamingDownloadManager().download_and_extract(TEST_GG_DRIVE_ZIPPED_URL)
    pattern = xjoin(extracted_url, "*")
    matches = list(xglob(pattern))
    assert len(matches) == 1
    only_match = matches[0]
    assert xbasename(only_match) == TEST_GG_DRIVE_FILENAME
    with xopen(only_match) as streamed:
        content = streamed.read()
        assert content == TEST_GG_DRIVE_CONTENT
def test_iter_archive_file(tar_nested_jsonl_path):
    """Iterate a nested archive twice through the same outer iterable.

    The outer ``iter_archive`` call receives the path as a string; every file
    object it yields is passed back to ``iter_archive``, and each inner member
    is checked with ``_test_jsonl``. Each pass expects one outer member and
    two inner members.
    """
    manager = StreamingDownloadManager()
    outer_members = manager.iter_archive(str(tar_nested_jsonl_path))
    # Two passes over the same outer iterable: the second one makes sure it is
    # reset correctly after being consumed once.
    for _ in range(2):
        outer_count = 0
        inner_count = 0
        for outer_count, (outer_path, outer_file) in enumerate(outer_members, start=1):
            inner_members = manager.iter_archive(outer_file)
            for inner_count, (inner_path, inner_file) in enumerate(inner_members, start=1):
                _test_jsonl(inner_path, inner_file)
        assert outer_count == 1
        assert inner_count == 2
def test_streaming_gg_drive_gzipped():
    """Extract ``TEST_GG_DRIVE_GZIPPED_URL`` and check its content.

    The extracted URL is opened with ``xopen`` and must read back as
    ``TEST_GG_DRIVE_CONTENT``.
    """
    manager = StreamingDownloadManager()
    extracted_url = manager.download_and_extract(TEST_GG_DRIVE_GZIPPED_URL)
    with xopen(extracted_url) as streamed:
        content = streamed.read()
        assert content == TEST_GG_DRIVE_CONTENT
def test_streaming_dl_manager_download_and_extract_with_join(
        input_path, filename, expected_path):
    """Check that joining ``filename`` onto the extracted path gives ``expected_path``.

    ``input_path`` goes through ``download_and_extract`` and the result is
    combined with ``filename`` using ``xjoin``.
    """
    manager = StreamingDownloadManager()
    joined = xjoin(manager.download_and_extract(input_path), filename)
    assert joined == expected_path
def test_streaming_dl_manager_download_and_extract_no_extraction(urlpath):
    """Check that ``download_and_extract`` returns ``urlpath`` unchanged."""
    manager = StreamingDownloadManager()
    result = manager.download_and_extract(urlpath)
    assert result == urlpath
def test_streaming_dl_manager_download_dummy_path(urlpath):
    """Check that ``download`` returns ``urlpath`` unchanged."""
    manager = StreamingDownloadManager()
    result = manager.download(urlpath)
    assert result == urlpath
def test_streaming_dl_manager_download_dummy_path(urlpath):
    """Check that ``download`` returns ``urlpath`` unchanged.

    ``StreamingDownloadManager`` is imported inside the test body rather than
    at module level.
    """
    from datasets.utils.streaming_download_manager import StreamingDownloadManager

    manager = StreamingDownloadManager()
    result = manager.download(urlpath)
    assert result == urlpath
def test_streaming_dl_manager_extract(text_gz_path):
    """Check the chained URL returned by ``extract`` for a gzip input.

    The expected URL is ``gzip://<basename without last extension>::<input path>``.
    """
    from datasets.utils.streaming_download_manager import StreamingDownloadManager

    dl_manager = StreamingDownloadManager()
    # Slice off the last extension. ``str.rstrip(".gz")`` would strip any
    # trailing '.', 'g' or 'z' characters (e.g. "log.gz" -> "lo"), not the suffix.
    path = os.path.basename(text_gz_path)
    path = path[: path.rindex(".")]
    assert dl_manager.extract(text_gz_path) == f"gzip://{path}::{text_gz_path}"
def test_streaming_dl_manager_download_and_extract_no_extraction(urlpath):
    """Check that ``download_and_extract`` returns ``urlpath`` unchanged.

    ``StreamingDownloadManager`` is imported inside the test body rather than
    at module level.
    """
    from datasets.utils.streaming_download_manager import StreamingDownloadManager

    manager = StreamingDownloadManager()
    result = manager.download_and_extract(urlpath)
    assert result == urlpath