Esempio n. 1
0
    def _split_generators(self, dl_manager):
        """Download the data file of every split and describe the splits.

        Args:
            dl_manager: Download manager; its ``download`` method is called
                once per entry of ``_URLs`` used below.

        Returns:
            A list of four ``datasets.SplitGenerator`` objects, in the order
            train, test_random, test_fixed, validation. The ``gen_kwargs`` of
            each one (``filepath`` and ``split``) are passed on to
            ``_generate_examples``.
        """
        # Fetch the files up front, one download call per URL key.
        downloaded = {
            key: dl_manager.download(_URLs[key])
            for key in ("train", "dev", "test_fixed", "test_random")
        }

        # Pairs of (split name, URL key). The URL key is also the value
        # handed to _generate_examples as ``split``; note that the
        # validation split is stored under the "dev" key.
        layout = (
            (datasets.Split.TRAIN, "train"),
            (NamedSplit("test_random"), "test_random"),
            (NamedSplit("test_fixed"), "test_fixed"),
            (datasets.Split.VALIDATION, "dev"),
        )

        return [
            datasets.SplitGenerator(
                name=split_name,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": downloaded[key], "split": key},
            )
            for split_name, key in layout
        ]
        "col_2": "int64",
        "col_3": "float64"
    }
    expected_features = features.copy(
    ) if features else default_expected_features
    features = (Features({
        feature: Value(dtype)
        for feature, dtype in features.items()
    }) if features is not None else None)
    dataset = DatasetDict.from_csv({"train": csv_path},
                                   features=features,
                                   cache_dir=cache_dir)
    _check_csv_datasetdict(dataset, expected_features)


@pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
def test_datasetdict_from_csv_split(split, csv_path, tmp_path):
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64"
    }
    dataset = DatasetDict.from_csv(path, cache_dir=cache_dir)
    _check_csv_datasetdict(dataset,
                           expected_features,