Example #1
import pytest

from datasets import DatasetDict, Features, Value

# The memory-tracking context managers live in the test utilities of the `datasets`
# repo (tests/utils.py); the relative import assumes this file sits under tests/io/.
from ..utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases
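

# The original suite supplies a `text_path` fixture via its conftest.py. The stand-in
# below is an assumption for self-contained runs: it writes a four-line text file,
# since every test expects 4 rows in a single "text" column.
@pytest.fixture
def text_path(tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "dataset.txt"
    path.write_text("\n".join(["foo", "bar", "foobar", "baz"]) + "\n")
    return str(path)
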
# The parametrize decorators below are added so the test collects on its own;
# the value sets are illustrative assumptions, not the original suite's.
@pytest.mark.parametrize("split", [None, "train", "test"])
@pytest.mark.parametrize("features", [None, {"text": "string"}])
@pytest.mark.parametrize("keep_in_memory", [False, True])
def test_datasetdict_from_text(split, features, keep_in_memory, text_path, tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()})
        if features
        else None
    )
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_text(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
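

# `_check_text_datasetdict` is called below but not defined in this excerpt. This sketch
# mirrors the assertions of `test_datasetdict_from_text` above; it is an assumption about
# the missing helper, not a verbatim copy of it.
def _check_text_datasetdict(dataset_dict, expected_features, splits=("train",)):
    assert isinstance(dataset_dict, DatasetDict)
    for split in splits:
        dataset = dataset_dict[split]
        assert dataset.num_rows == 4
        assert dataset.num_columns == 1
        assert dataset.column_names == ["text"]
        for feature, expected_dtype in expected_features.items():
            assert dataset.features[feature].dtype == expected_dtype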


@pytest.mark.parametrize("keep_in_memory", [False, True])
def test_datasetdict_from_text_keep_in_memory(keep_in_memory, text_path, tmp_path):
    cache_dir = tmp_path / "cache"
    expected_features = {"text": "string"}
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_text(
            {"train": text_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        )
    _check_text_datasetdict(dataset, expected_features)


# Illustrative split values; None exercises the {"train", "test"} fallback branch.
@pytest.mark.parametrize("split", [None, "train", "test"])
def test_datasetdict_from_text_split(split, text_path, tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"
    expected_features = {"text": "string"}
    dataset = DatasetDict.from_text(path, cache_dir=cache_dir)
    _check_text_datasetdict(dataset, expected_features, splits=list(path.keys()))
    assert all(dataset[split].split == split for split in path.keys())


# Illustrative feature mappings; None falls back to the default {"text": "string"} schema.
@pytest.mark.parametrize("features", [None, {"text": "string"}])
def test_datasetdict_from_text_features(features, text_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()})
        if features is not None
        else None
    )
    dataset = DatasetDict.from_text({"train": text_path},
                                    features=features,
                                    cache_dir=cache_dir)
    _check_text_datasetdict(dataset, expected_features)