Beispiel #1
0
def test_load_dataset_local(dataset_loading_script_dir, data_dir,
                            keep_in_memory, caplog):
    """Exercise ``load_dataset`` against a local dataset loading script.

    Three scenarios are checked in order:

    1. Loading from ``dataset_loading_script_dir`` yields a two-entry
       ``DatasetDict`` of ``Dataset`` objects; the Arrow memory check that
       wraps the call is selected by ``keep_in_memory``.
    2. Under every ``OfflineSimulationMode``, loading by script name still
       returns two entries and logs that the cached module version is used.
    3. Loading a dataset name that does not exist raises
       ``FileNotFoundError`` whose message contains both a remote URL to
       the expected script and the absolute local path that was tried.
    """
    # Pick the Arrow memory check that matches the requested loading mode.
    if keep_in_memory:
        memory_check = assert_arrow_memory_increases()
    else:
        memory_check = assert_arrow_memory_doesnt_increase()

    with memory_check:
        dataset = load_dataset(
            dataset_loading_script_dir,
            data_dir=data_dir,
            keep_in_memory=keep_in_memory,
        )

    assert isinstance(dataset, DatasetDict)
    for split in dataset.values():
        assert isinstance(split, Dataset)
    assert len(dataset) == 2
    first_train_row = next(iter(dataset["train"]))
    assert isinstance(first_train_row, dict)

    for mode in OfflineSimulationMode:
        with offline(mode):
            # Start from an empty log so the assertion below only sees
            # records emitted by this iteration's load.
            caplog.clear()
            # Load dataset from cache
            dataset = datasets.load_dataset(
                DATASET_LOADING_SCRIPT_NAME,
                data_dir=data_dir,
            )
            assert len(dataset) == 2
            assert "Using the latest cached version of the module" in caplog.text

    missing_name = SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST
    with pytest.raises(FileNotFoundError) as exc_info:
        datasets.load_dataset(missing_name)
    error_text = str(exc_info.value)

    # The message must mention a remote URL ending in "<name>/<name>.py".
    script_suffix = re.escape(missing_name + '/' + missing_name + '.py')
    url_match = re.search(fr"http\S*{script_suffix}\b", error_text)
    assert url_match is not None
    assert is_remote_url(url_match.group())

    # ...and the local absolute path that was looked up as well.
    assert os.path.abspath(missing_name) in error_text
Beispiel #2
0
 def check_if_url_is_valid(url):
     """Reject remote URLs that contain a backslash.

     Args:
         url: The URL or path string to check.

     Raises:
         ValueError: If ``is_remote_url(url)`` is true and ``url`` contains
             a backslash character.

     Returns:
         None. Strings that are not remote URLs, or remote URLs without a
         backslash, pass silently.
     """
     if is_remote_url(url) and "\\" in url:
         # The closing quote after the URL was missing in the original
         # message, which left the quoted URL unterminated.
         raise ValueError(
             f"Bad remote url '{url}' since it contains a backslash"
         )