Example #1
def test_dataset_feature_with_none(feature):
    data = {"col": [None]}
    features = Features({"col": feature})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"col"}
    assert item["col"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"col"}
    assert isinstance(batch["col"], list) and all(item is None
                                                  for item in batch["col"])
    column = dset["col"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests

    data = {"col": [[None]]}
    features = Features({"col": Sequence(feature)})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"col"}
    assert all(i is None for i in item["col"])

    data = {"nested": [{"col": None}]}
    features = Features({"nested": {"col": feature}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"col"}
    assert item["nested"]["col"] is None
Example #2
def test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path):
    cache_dir = str(tmp_path)
    dummy_builder = DummyBuilder(cache_dir=cache_dir, name="dummy")
    os.makedirs(dummy_builder.cache_dir)

    dummy_builder.info.splits = SplitDict()
    dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
    dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

    for info_split in dummy_builder.info.splits:
        with ArrowWriter(
            path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{info_split}.arrow"),
            features=Features({"text": Value("string")}),
        ) as writer:
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():
        dataset = dummy_builder.as_dataset(split=split, in_memory=in_memory)
    assert isinstance(dataset, expected_dataset_class)
    if isinstance(dataset, DatasetDict):
        assert list(dataset.keys()) == ["train", "test"]
        datasets = dataset.values()
        expected_splits = ["train", "test"]
    elif isinstance(dataset, Dataset):
        datasets = [dataset]
        expected_splits = [split]
    for dataset, expected_split in zip(datasets, expected_splits):
        assert dataset.split == expected_split
        assert len(dataset) == expected_dataset_length
        assert dataset.features == Features({"text": Value("string")})
        assert dataset.column_names == ["text"]
Example #3
def test_iterable_dataset_cast(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 10})
    features = Features({"id": Value("int64"), "label": Value("int64")})
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    new_features = Features({"id": Value("int64"), "label": Value("bool")})
    casted_dataset = dataset.cast(new_features)
    assert list(casted_dataset) == [new_features.encode_example(ex) for _, ex in ex_iterable]
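`cast` works the same way on a map-style `Dataset`; a minimal sketch, assuming Arrow supports the same int64-to-bool cast used above:

from datasets import Dataset, Features, Value

ds = Dataset.from_dict({"label": [0, 1]}, features=Features({"label": Value("int64")}))
ds = ds.cast(Features({"label": Value("bool")}))  # returns a new, re-typed dataset
assert ds["label"] == [False, True]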
Example #4
def test_datasetdict_from_text(split, features, keep_in_memory, text_path,
                               tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    )
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_text(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
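A self-contained sketch of `DatasetDict.from_text` without the test fixtures, assuming a throwaway text file:

import os
import tempfile

from datasets import DatasetDict, Features, Value

with tempfile.TemporaryDirectory() as tmp_dir:
    text_path = os.path.join(tmp_dir, "data.txt")
    with open(text_path, "w") as f:
        f.write("hello\nworld\n")
    # each line of the file becomes one row of the "text" column
    dset = DatasetDict.from_text(
        {"train": text_path}, features=Features({"text": Value("string")}), cache_dir=tmp_dir
    )
    assert dset["train"]["text"] == ["hello", "world"]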
Example #5
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path,
                              tmp_path):
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # The CSV format does not preserve col_1's string dtype: by default the column is inferred as "int64" instead of "string"
    default_expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64"
    }
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    )
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_csv(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
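The dtype override performed by the `features` parameter can be seen in isolation; a hedged sketch, assuming a tiny CSV written on the fly:

import os
import tempfile

from datasets import Dataset, Features, Value

with tempfile.TemporaryDirectory() as tmp_dir:
    csv_path = os.path.join(tmp_dir, "data.csv")
    with open(csv_path, "w") as f:
        f.write("col_1\n1\n2\n")
    # without features, col_1 would be inferred as int64; forcing "string" keeps the raw text
    dset = Dataset.from_csv(csv_path, features=Features({"col_1": Value("string")}), cache_dir=tmp_dir)
    assert dset.features["col_1"].dtype == "string"
    assert dset["col_1"] == ["1", "2"]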
Example #6
 def test_flatten(self):
     dset_split = Dataset.from_dict(
         {
             "a": [{
                 "b": {
                     "c": ["text"]
                 }
             }] * 10,
             "foo": [1] * 10
         },
         features=Features({
             "a": {
                 "b": Sequence({"c": Value("string")})
             },
             "foo": Value("int64")
         }),
     )
     dset = DatasetDict({"train": dset_split, "test": dset_split})
     dset = dset.flatten()
     self.assertDictEqual(dset.column_names, {
         "train": ["a.b.c", "foo"],
         "test": ["a.b.c", "foo"]
     })
     self.assertListEqual(sorted(dset["train"].features.keys()),
                          ["a.b.c", "foo"])
     self.assertDictEqual(
         dset["train"].features,
         Features({
             "a.b.c": Sequence(Value("string")),
             "foo": Value("int64")
         }))
     del dset
Example #7
 def test_from_dict(self):
     input_schema = Features({"text": Value("string")})
     label_schema = Features({"summary": Value("string")})
     template_dict = {"text_column": "input_text", "summary_column": "input_summary"}
     task = Summarization.from_dict(template_dict)
     self.assertEqual("summarization", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
Example #8
 def test_from_dict(self):
     input_schema = Features({"text": Value("string")})
     # No `labels` are passed to `from_dict` here, so the expected schema holds the bare
     # `ClassLabel` type rather than an instantiated feature
     label_schema = Features({"labels": ClassLabel})
     template_dict = {"text_column": "input_text", "label_column": "input_labels"}
     task = TextClassification.from_dict(template_dict)
     self.assertEqual("text-classification", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
Example #9
 def test_cache_dir_for_features(self):
     with tempfile.TemporaryDirectory() as tmp_dir:
         f1 = Features({"id": Value("int8")})
         f2 = Features({"id": Value("int32")})
         dummy_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f1)
         other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f1)
         self.assertEqual(dummy_builder.cache_dir, other_builder.cache_dir)
         other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f2)
         self.assertNotEqual(dummy_builder.cache_dir, other_builder.cache_dir)
Example #10
def test_audio_feature_type_to_arrow():
    features = Features({"audio": Audio()})
    assert features.arrow_schema == pa.schema({"audio": Audio().pa_type})
    features = Features({"struct_containing_an_audio": {"audio": Audio()}})
    assert features.arrow_schema == pa.schema(
        {"struct_containing_an_audio": pa.struct({"audio": Audio().pa_type})})
    features = Features({"sequence_of_audios": Sequence(Audio())})
    assert features.arrow_schema == pa.schema(
        {"sequence_of_audios": pa.list_(Audio().pa_type)})
Example #11
 def test_align_labels_with_mapping(self):
     train_features = Features({
         "input_text":
         Value("string"),
         "input_labels":
         ClassLabel(num_classes=3,
                    names=["entailment", "neutral", "contradiction"]),
     })
     test_features = Features({
         "input_text":
         Value("string"),
         "input_labels":
         ClassLabel(num_classes=3,
                    names=["entailment", "contradiction", "neutral"]),
     })
     train_data = {
         "input_text": ["a", "a", "b", "b", "c", "c"],
         "input_labels": [0, 0, 1, 1, 2, 2]
     }
     test_data = {
         "input_text": ["a", "a", "c", "c", "b", "b"],
         "input_labels": [0, 0, 1, 1, 2, 2]
     }
     label2id = {"CONTRADICTION": 0, "ENTAILMENT": 2, "NEUTRAL": 1}
     id2label = {v: k for k, v in label2id.items()}
     train_expected_labels = [2, 2, 1, 1, 0, 0]
     test_expected_labels = [2, 2, 0, 0, 1, 1]
     train_expected_label_names = [
         id2label[idx] for idx in train_expected_labels
     ]
     test_expected_label_names = [
         id2label[idx] for idx in test_expected_labels
     ]
     dsets = DatasetDict({
         "train":
         Dataset.from_dict(train_data, features=train_features),
         "test":
         Dataset.from_dict(test_data, features=test_features),
     })
     dsets = dsets.align_labels_with_mapping(label2id, "input_labels")
     self.assertListEqual(train_expected_labels,
                          dsets["train"]["input_labels"])
     self.assertListEqual(test_expected_labels,
                          dsets["test"]["input_labels"])
     train_aligned_label_names = [
         dsets["train"].features["input_labels"].int2str(idx)
         for idx in dsets["train"]["input_labels"]
     ]
     test_aligned_label_names = [
         dsets["test"].features["input_labels"].int2str(idx)
         for idx in dsets["test"]["input_labels"]
     ]
     self.assertListEqual(train_expected_label_names,
                          train_aligned_label_names)
     self.assertListEqual(test_expected_label_names,
                          test_aligned_label_names)
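A minimal sketch of `align_labels_with_mapping` on a single `Dataset`, relying on the same case-insensitive name matching exercised above:

from datasets import ClassLabel, Dataset, Features

ds = Dataset.from_dict(
    {"label": [0, 1]},
    features=Features({"label": ClassLabel(names=["neg", "pos"])}),
)
# swap the ids: "pos" becomes 0 and "neg" becomes 1
ds = ds.align_labels_with_mapping({"pos": 0, "neg": 1}, "label")
assert ds["label"] == [1, 0]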
Example #12
def test_encode_batch_with_example_with_empty_first_elem():
    features = Features({
        "x": Sequence(Sequence(ClassLabel(names=["a", "b"]))),
    })
    encoded_batch = features.encode_batch(
        {"x": [
            [["a"], ["b"]],
            [[], ["b"]],
        ]})
    assert encoded_batch == {"x": [[[0], [1]], [[], [1]]]}
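`encode_example` is the single-example counterpart of `encode_batch`; a short sketch:

from datasets import ClassLabel, Features, Sequence

features = Features({"x": Sequence(ClassLabel(names=["a", "b"]))})
assert features.encode_example({"x": ["a", "b", "a"]}) == {"x": [0, 1, 0]}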
Example #13
 def test_from_dict(self):
     input_schema = Features({"image": Image()})
     label_schema = Features({"labels": ClassLabel})
     template_dict = {
         "image_column": "input_image",
         "label_column": "input_label",
     }
     task = ImageClassification.from_dict(template_dict)
     self.assertEqual("image-classification", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
Example #14
 def test_from_dict(self):
     input_schema = Features({"audio_file_path": Value("string")})
     label_schema = Features({"transcription": Value("string")})
     template_dict = {
         "audio_file_path_column": "input_audio_file_path",
         "transcription_column": "input_transcription",
     }
     task = AutomaticSpeechRecognition.from_dict(template_dict)
     self.assertEqual("automatic-speech-recognition", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
Example #15
def test_dataset_concatenate_audio_features(shared_datadir):
    # dset1 stores the audio as a file path and dset2 stores it as raw bytes,
    # to check that the two storage formats can be concatenated
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data1 = {"audio": [audio_path]}
    dset1 = Dataset.from_dict(data1, features=Features({"audio": Audio()}))
    data2 = {"audio": [{"bytes": open(audio_path, "rb").read()}]}
    dset2 = Dataset.from_dict(data2, features=Features({"audio": Audio()}))
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert concatenated_dataset[0]["audio"]["array"].shape == dset1[0]["audio"]["array"].shape
    assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0]["audio"]["array"].shape
Example #16
 def test_from_dict(self):
     input_schema = Features({"image_file_path": Value("string")})
     label_schema = Features(
         {"labels": ClassLabel(names=tuple(self.labels))})
     template_dict = {
         "image_file_path_column": "input_image_file_path",
         "label_column": "input_label",
         "labels": self.labels,
     }
     task = ImageClassification.from_dict(template_dict)
     self.assertEqual("image-classification", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
Example #17
 def test_flatten_with_sequence(self):
     features = Features(
         {"foo": Sequence({"bar": {
             "my_value": Value("int32")
         }})})
     _features = features.copy()
     flattened_features = features.flatten()
     assert flattened_features == {
         "foo.bar": [{
             "my_value": Value("int32")
         }]
     }
     assert features == _features, "calling flatten shouldn't alter the current features"
Example #18
    def test_as_dataset(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
            os.makedirs(dummy_builder.cache_dir)

            dummy_builder.info.splits = SplitDict()
            dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
            dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

            for split in dummy_builder.info.splits:
                writer = ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"dummy_builder-{split}.arrow"),
                    features=Features({"text": Value("string")}),
                )
                writer.write_batch({"text": ["foo"] * 10})
                writer.finalize()

            dsets = dummy_builder.as_dataset()
            self.assertIsInstance(dsets, DatasetDict)
            self.assertListEqual(list(dsets.keys()), ["train", "test"])
            self.assertEqual(len(dsets["train"]), 10)
            self.assertEqual(len(dsets["test"]), 10)
            self.assertDictEqual(dsets["train"].features,
                                 Features({"text": Value("string")}))
            self.assertDictEqual(dsets["test"].features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dsets["train"].column_names, ["text"])
            self.assertListEqual(dsets["test"].column_names, ["text"])
            del dsets

            dset = dummy_builder.as_dataset("train")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train")
            self.assertEqual(len(dset), 10)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            del dset

            dset = dummy_builder.as_dataset("train+test[:30%]")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train+test[:30%]")
            self.assertEqual(len(dset), 13)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            del dset
Example #19
def test_load_dataset_builder_for_absolute_script_dir(
        dataset_loading_script_dir, data_dir):
    builder = datasets.load_dataset_builder(dataset_loading_script_dir,
                                            data_dir=data_dir)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == DATASET_LOADING_SCRIPT_NAME
    assert builder.info.features == Features({"text": Value("string")})
Example #20
def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path]}
    features = Features({"audio": Audio(decode=False)})
    dset = Dataset.from_dict(data, features=features)
    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"audio"}
        assert item["audio"] == {"path": audio_path, "bytes": None}
        batch = dset[:1]
        assert batch.keys() == {"audio"}
        assert len(batch["audio"]) == 1
        assert batch["audio"][0] == {"path": audio_path, "bytes": None}
        column = dset["audio"]
        assert len(column) == 1
        assert column[0] == {"path": audio_path, "bytes": None}

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["audio"]
        assert item["audio"][0] == {"path": audio_path, "bytes": None}
        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["audio"]
        assert batch["audio"][0] == {"path": audio_path, "bytes": None}
        column = dset["audio"]
        assert len(column) == 1
        assert column[0] == {"path": audio_path, "bytes": None}
Example #21
def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):
    audio_filename = "test_audio_44100.mp3"
    data = {"audio": []}
    for file_path, file_obj in iter_archive(tar_mp3_path):
        data["audio"].append({"path": file_path, "bytes": file_obj.read()})
        break
    features = Features({"audio": Audio()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    assert item["audio"]["path"] == audio_filename
    assert item["audio"]["array"].shape == (109440, )
    assert item["audio"]["sampling_rate"] == 44100
    batch = dset[:1]
    assert batch.keys() == {"audio"}
    assert len(batch["audio"]) == 1
    assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
    assert batch["audio"][0]["path"] == audio_filename
    assert batch["audio"][0]["array"].shape == (109440, )
    assert batch["audio"][0]["sampling_rate"] == 44100
    column = dset["audio"]
    assert len(column) == 1
    assert column[0].keys() == {"path", "array", "sampling_rate"}
    assert column[0]["path"] == audio_filename
    assert column[0]["array"].shape == (109440, )
    assert column[0]["sampling_rate"] == 44100
Example #22
def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.mp3")
    data = {"audio": [audio_path]}
    features = Features({"audio": Audio()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item["audio"]["sampling_rate"] == 44100
    dset = dset.cast_column("audio", Audio(sampling_rate=16000))
    item = dset[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    assert item["audio"]["path"] == audio_path
    assert item["audio"]["array"].shape == (39707,)
    assert item["audio"]["sampling_rate"] == 16000
    batch = dset[:1]
    assert batch.keys() == {"audio"}
    assert len(batch["audio"]) == 1
    assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
    assert batch["audio"][0]["path"] == audio_path
    assert batch["audio"][0]["array"].shape == (39707,)
    assert batch["audio"][0]["sampling_rate"] == 16000
    column = dset["audio"]
    assert len(column) == 1
    assert column[0].keys() == {"path", "array", "sampling_rate"}
    assert column[0]["path"] == audio_path
    assert column[0]["array"].shape == (39707,)
    assert column[0]["sampling_rate"] == 16000
Example #23
def dataset():
    n = 10
    features = Features({
        "tokens":
        Sequence(Value("string")),
        "labels":
        Sequence(ClassLabel(names=["negative", "positive"])),
        "answers":
        Sequence({
            "text": Value("string"),
            "answer_start": Value("int32"),
        }),
        "id":
        Value("int64"),
    })
    dataset = Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            "answers": [{
                "answer_start": [97],
                "text": ["1976"]
            }] * n,
            "id": list(range(n)),
        },
        features=features,
    )
    return dataset
Example #24
def test_dataset_with_audio_feature_map_is_decoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path], "text": ["Hello"]}
    features = Features({"audio": Audio(), "text": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    def process_audio_sampling_rate_by_example(example):
        example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"]
        return example

    decoded_dset = dset.map(process_audio_sampling_rate_by_example)
    for item in decoded_dset:
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200

    def process_audio_sampling_rate_by_batch(batch):
        double_sampling_rates = []
        for audio in batch["audio"]:
            double_sampling_rates.append(2 * audio["sampling_rate"])
        batch["double_sampling_rate"] = double_sampling_rates
        return batch

    decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200
Example #25
 def test_flatten(self):
     features = Features({
         "foo": {
             "bar1": Value("int32"),
             "bar2": {
                 "foobar": Value("string")
             }
         }
     })
     _features = features.copy()
     flattened_features = features.flatten()
     assert flattened_features == {
         "foo.bar1": Value("int32"),
         "foo.bar2.foobar": Value("string")
     }
     assert features == _features, "calling flatten shouldn't alter the current features"
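`Dataset.flatten` applies the same dotted renaming to actual columns; a minimal sketch:

from datasets import Dataset, Features, Value

ds = Dataset.from_dict(
    {"foo": [{"bar": 1}]},
    features=Features({"foo": {"bar": Value("int32")}}),
)
ds = ds.flatten()
assert ds.column_names == ["foo.bar"]
assert ds["foo.bar"] == [1]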
Example #26
def test_load_dataset_zip_jsonl(data_file, streaming, zip_jsonl_path,
                                zip_jsonl_with_dir_path, jsonl_path):
    data_file_paths = {
        "zip_jsonl_path": zip_jsonl_path,
        "zip_jsonl_with_dir_path": zip_jsonl_with_dir_path,
        "jsonl_path": jsonl_path,
    }
    data_files = str(data_file_paths[data_file])
    expected_size = 8 if data_file.startswith("zip") else 4
    features = Features({
        "col_1": Value("string"),
        "col_2": Value("int32"),
        "col_3": Value("float32")
    })
    ds = load_dataset("json",
                      split="train",
                      data_files=data_files,
                      features=features,
                      streaming=streaming)
    if streaming:
        ds_item_counter = 0
        for ds_item in ds:
            if ds_item_counter == 0:
                assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
            ds_item_counter += 1
        assert ds_item_counter == expected_size
    else:
        assert ds.shape[0] == expected_size
        ds_item = next(iter(ds))
        assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
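A self-contained sketch of the non-streaming path, assuming a throwaway JSON Lines file:

import json
import os
import tempfile

from datasets import load_dataset

with tempfile.TemporaryDirectory() as tmp_dir:
    jsonl_path = os.path.join(tmp_dir, "data.jsonl")
    with open(jsonl_path, "w") as f:
        f.write(json.dumps({"col_1": "0", "col_2": 0}) + "\n")
    ds = load_dataset("json", data_files=jsonl_path, split="train", cache_dir=tmp_dir)
    assert ds[0] == {"col_1": "0", "col_2": 0}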
Example #27
def test_iterable_dataset_map_complex_features(dataset: IterableDataset, generate_examples_fn):
    # https://github.com/huggingface/datasets/issues/3505
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": "positive"})
    features = Features(
        {
            "id": Value("int64"),
            "label": Value("string"),
        }
    )
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    dataset = dataset.cast_column("label", ClassLabel(names=["negative", "positive"]))
    dataset = dataset.map(lambda x: {"id+1": x["id"] + 1, **x})
    assert isinstance(dataset._ex_iterable, MappedExamplesIterable)
    features["label"] = ClassLabel(names=["negative", "positive"])
    assert [{k: v for k, v in ex.items() if k != "id+1"} for ex in dataset] == [
        features.encode_example(ex) for _, ex in ex_iterable
    ]
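The same lazy `map` can be tried without fixtures; a hedged sketch, assuming a `datasets` version that provides `IterableDataset.from_generator`:

from datasets import IterableDataset

def gen():
    for i in range(3):
        yield {"id": i}

ids = IterableDataset.from_generator(gen)
ids = ids.map(lambda x: {"id+1": x["id"] + 1})  # nothing is computed until iteration
assert [ex["id+1"] for ex in ids] == [1, 2, 3]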
Example #28
def test_load_dataset_builder_for_community_dataset_with_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == SAMPLE_DATASET_IDENTIFIER.split("/")[-1]
    assert builder.info.features == Features({"text": Value("string")})
    namespace = SAMPLE_DATASET_IDENTIFIER[: SAMPLE_DATASET_IDENTIFIER.index("/")]
    assert builder._relative_data_dir().startswith(namespace)
    assert SAMPLE_DATASET_IDENTIFIER.replace("/", "___") in builder.__module__
Example #29
def test_load_dataset_streaming_csv(path_extension, streaming, csv_path, bz2_csv_path):
    paths = {"csv": csv_path, "csv.bz2": bz2_csv_path}
    data_files = str(paths[path_extension])
    features = Features({"col_1": Value("string"), "col_2": Value("int32"), "col_3": Value("float32")})
    ds = load_dataset("csv", split="train", data_files=data_files, features=features, streaming=streaming)
    assert isinstance(ds, IterableDataset if streaming else Dataset)
    ds_item = next(iter(ds))
    assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
Example #30
def test_load_dataset_builder_for_relative_script_dir(dataset_loading_script_dir, data_dir):
    with set_current_working_directory_to_temp_dir():
        relative_script_dir = DATASET_LOADING_SCRIPT_NAME
        shutil.copytree(dataset_loading_script_dir, relative_script_dir)
        builder = datasets.load_dataset_builder(relative_script_dir, data_dir=data_dir)
        assert isinstance(builder, DatasetBuilder)
        assert builder.name == DATASET_LOADING_SCRIPT_NAME
        assert builder.info.features == Features({"text": Value("string")})
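`load_dataset_builder` resolves Hub datasets the same way without downloading the data itself; a hedged sketch, assuming network access and using "imdb" purely as an arbitrary public dataset:

import datasets

builder = datasets.load_dataset_builder("imdb")
# the schema is available from the builder before any data files are fetched
print(builder.info.features)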