def test_dataset_feature_with_none(feature): data = {"col": [None]} features = Features({"col": feature}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"col"} assert item["col"] is None batch = dset[:1] assert len(batch) == 1 assert batch.keys() == {"col"} assert isinstance(batch["col"], list) and all(item is None for item in batch["col"]) column = dset["col"] assert len(column) == 1 assert isinstance(column, list) and all(item is None for item in column) # nested tests data = {"col": [[None]]} features = Features({"col": Sequence(feature)}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"col"} assert all(i is None for i in item["col"]) data = {"nested": [{"col": None}]} features = Features({"nested": {"col": feature}}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"nested"} assert item["nested"].keys() == {"col"} assert item["nested"]["col"] is None
def test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path): cache_dir = str(tmp_path) dummy_builder = DummyBuilder(cache_dir=cache_dir, name="dummy") os.makedirs(dummy_builder.cache_dir) dummy_builder.info.splits = SplitDict() dummy_builder.info.splits.add(SplitInfo("train", num_examples=10)) dummy_builder.info.splits.add(SplitInfo("test", num_examples=10)) for info_split in dummy_builder.info.splits: with ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{info_split}.arrow"), features=Features({"text": Value("string")}), ) as writer: writer.write_batch({"text": ["foo"] * 10}) writer.finalize() with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase(): dataset = dummy_builder.as_dataset(split=split, in_memory=in_memory) assert isinstance(dataset, expected_dataset_class) if isinstance(dataset, DatasetDict): assert list(dataset.keys()) == ["train", "test"] datasets = dataset.values() expected_splits = ["train", "test"] elif isinstance(dataset, Dataset): datasets = [dataset] expected_splits = [split] for dataset, expected_split in zip(datasets, expected_splits): assert dataset.split == expected_split assert len(dataset) == expected_dataset_length assert dataset.features == Features({"text": Value("string")}) dataset.column_names == ["text"]
def test_iterable_dataset_cast(generate_examples_fn): ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 10}) features = Features({"id": Value("int64"), "label": Value("int64")}) dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features)) new_features = Features({"id": Value("int64"), "label": Value("bool")}) casted_dataset = dataset.cast(new_features) assert list(casted_dataset) == [new_features.encode_example(ex) for _, ex in ex_iterable]
def test_datasetdict_from_text(split, features, keep_in_memory, text_path, tmp_path): if split: path = {split: text_path} else: split = "train" path = {"train": text_path, "test": text_path} cache_dir = tmp_path / "cache" default_expected_features = {"text": "string"} expected_features = features.copy( ) if features else default_expected_features features = Features( {feature: Value(dtype) for feature, dtype in features.items()}) if features else None with assert_arrow_memory_increases( ) if keep_in_memory else assert_arrow_memory_doesnt_increase(): dataset = DatasetDict.from_text(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory) assert isinstance(dataset, DatasetDict) dataset = dataset[split] assert dataset.num_rows == 4 assert dataset.num_columns == 1 assert dataset.column_names == ["text"] assert dataset.split == split for feature, expected_dtype in expected_features.items(): assert dataset.features[feature].dtype == expected_dtype
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path, tmp_path): if split: path = {split: csv_path} else: split = "train" path = {"train": csv_path, "test": csv_path} cache_dir = tmp_path / "cache" # CSV file loses col_1 string dtype information: default now is "int64" instead of "string" default_expected_features = { "col_1": "int64", "col_2": "int64", "col_3": "float64" } expected_features = features.copy( ) if features else default_expected_features features = Features( {feature: Value(dtype) for feature, dtype in features.items()}) if features else None with assert_arrow_memory_increases( ) if keep_in_memory else assert_arrow_memory_doesnt_increase(): dataset = DatasetDict.from_csv(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory) assert isinstance(dataset, DatasetDict) dataset = dataset[split] assert dataset.num_rows == 4 assert dataset.num_columns == 3 assert dataset.column_names == ["col_1", "col_2", "col_3"] assert dataset.split == split for feature, expected_dtype in expected_features.items(): assert dataset.features[feature].dtype == expected_dtype
def test_flatten(self): dset_split = Dataset.from_dict( { "a": [{ "b": { "c": ["text"] } }] * 10, "foo": [1] * 10 }, features=Features({ "a": { "b": Sequence({"c": Value("string")}) }, "foo": Value("int64") }), ) dset = DatasetDict({"train": dset_split, "test": dset_split}) dset = dset.flatten() self.assertDictEqual(dset.column_names, { "train": ["a.b.c", "foo"], "test": ["a.b.c", "foo"] }) self.assertListEqual(sorted(dset["train"].features.keys()), ["a.b.c", "foo"]) self.assertDictEqual( dset["train"].features, Features({ "a.b.c": Sequence(Value("string")), "foo": Value("int64") })) del dset
def test_from_dict(self): input_schema = Features({"text": Value("string")}) label_schema = Features({"summary": Value("string")}) template_dict = {"text_column": "input_text", "summary_column": "input_summary"} task = Summarization.from_dict(template_dict) self.assertEqual("summarization", task.task) self.assertEqual(input_schema, task.input_schema) self.assertEqual(label_schema, task.label_schema)
def test_from_dict(self): input_schema = Features({"text": Value("string")}) # Labels are cast to tuple during `TextClassification.__post_init__`, so we do the same here label_schema = Features({"labels": ClassLabel}) template_dict = {"text_column": "input_text", "label_column": "input_labels"} task = TextClassification.from_dict(template_dict) self.assertEqual("text-classification", task.task) self.assertEqual(input_schema, task.input_schema) self.assertEqual(label_schema, task.label_schema)
def test_cache_dir_for_features(self): with tempfile.TemporaryDirectory() as tmp_dir: f1 = Features({"id": Value("int8")}) f2 = Features({"id": Value("int32")}) dummy_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f1) other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f1) self.assertEqual(dummy_builder.cache_dir, other_builder.cache_dir) other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f2) self.assertNotEqual(dummy_builder.cache_dir, other_builder.cache_dir)
def test_audio_feature_type_to_arrow(): features = Features({"audio": Audio()}) assert features.arrow_schema == pa.schema({"audio": Audio().pa_type}) features = Features({"struct_containing_an_audio": {"audio": Audio()}}) assert features.arrow_schema == pa.schema( {"struct_containing_an_audio": pa.struct({"audio": Audio().pa_type})}) features = Features({"sequence_of_audios": Sequence(Audio())}) assert features.arrow_schema == pa.schema( {"sequence_of_audios": pa.list_(Audio().pa_type)})
def test_align_labels_with_mapping(self): train_features = Features({ "input_text": Value("string"), "input_labels": ClassLabel(num_classes=3, names=["entailment", "neutral", "contradiction"]), }) test_features = Features({ "input_text": Value("string"), "input_labels": ClassLabel(num_classes=3, names=["entailment", "contradiction", "neutral"]), }) train_data = { "input_text": ["a", "a", "b", "b", "c", "c"], "input_labels": [0, 0, 1, 1, 2, 2] } test_data = { "input_text": ["a", "a", "c", "c", "b", "b"], "input_labels": [0, 0, 1, 1, 2, 2] } label2id = {"CONTRADICTION": 0, "ENTAILMENT": 2, "NEUTRAL": 1} id2label = {v: k for k, v in label2id.items()} train_expected_labels = [2, 2, 1, 1, 0, 0] test_expected_labels = [2, 2, 0, 0, 1, 1] train_expected_label_names = [ id2label[idx] for idx in train_expected_labels ] test_expected_label_names = [ id2label[idx] for idx in test_expected_labels ] dsets = DatasetDict({ "train": Dataset.from_dict(train_data, features=train_features), "test": Dataset.from_dict(test_data, features=test_features), }) dsets = dsets.align_labels_with_mapping(label2id, "input_labels") self.assertListEqual(train_expected_labels, dsets["train"]["input_labels"]) self.assertListEqual(test_expected_labels, dsets["test"]["input_labels"]) train_aligned_label_names = [ dsets["train"].features["input_labels"].int2str(idx) for idx in dsets["train"]["input_labels"] ] test_aligned_label_names = [ dsets["test"].features["input_labels"].int2str(idx) for idx in dsets["test"]["input_labels"] ] self.assertListEqual(train_expected_label_names, train_aligned_label_names) self.assertListEqual(test_expected_label_names, test_aligned_label_names)
def test_encode_batch_with_example_with_empty_first_elem(): features = Features({ "x": Sequence(Sequence(ClassLabel(names=["a", "b"]))), }) encoded_batch = features.encode_batch( {"x": [ [["a"], ["b"]], [[], ["b"]], ]}) assert encoded_batch == {"x": [[[0], [1]], [[], [1]]]}
def test_from_dict(self): input_schema = Features({"image": Image()}) label_schema = Features({"labels": ClassLabel}) template_dict = { "image_column": "input_image", "label_column": "input_label", } task = ImageClassification.from_dict(template_dict) self.assertEqual("image-classification", task.task) self.assertEqual(input_schema, task.input_schema) self.assertEqual(label_schema, task.label_schema)
def test_from_dict(self): input_schema = Features({"audio_file_path": Value("string")}) label_schema = Features({"transcription": Value("string")}) template_dict = { "audio_file_path_column": "input_audio_file_path", "transcription_column": "input_transcription", } task = AutomaticSpeechRecognition.from_dict(template_dict) self.assertEqual("automatic-speech-recognition", task.task) self.assertEqual(input_schema, task.input_schema) self.assertEqual(label_schema, task.label_schema)
def test_dataset_concatenate_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") data1 = {"audio": [audio_path]} dset1 = Dataset.from_dict(data1, features=Features({"audio": Audio()})) data2 = {"audio": [{"bytes": open(audio_path, "rb").read()}]} dset2 = Dataset.from_dict(data2, features=Features({"audio": Audio()})) concatenated_dataset = concatenate_datasets([dset1, dset2]) assert len(concatenated_dataset) == len(dset1) + len(dset2) assert concatenated_dataset[0]["audio"]["array"].shape == dset1[0][ "audio"]["array"].shape assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0][ "audio"]["array"].shape
def test_from_dict(self): input_schema = Features({"image_file_path": Value("string")}) label_schema = Features( {"labels": ClassLabel(names=tuple(self.labels))}) template_dict = { "image_file_path_column": "input_image_file_path", "label_column": "input_label", "labels": self.labels, } task = ImageClassification.from_dict(template_dict) self.assertEqual("image-classification", task.task) self.assertEqual(input_schema, task.input_schema) self.assertEqual(label_schema, task.label_schema)
def test_flatten_with_sequence(self): features = Features( {"foo": Sequence({"bar": { "my_value": Value("int32") }})}) _features = features.copy() flattened_features = features.flatten() assert flattened_features == { "foo.bar": [{ "my_value": Value("int32") }] } assert features == _features, "calling flatten shouldn't alter the current features"
def test_as_dataset(self): with tempfile.TemporaryDirectory() as tmp_dir: dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy") os.makedirs(dummy_builder.cache_dir) dummy_builder.info.splits = SplitDict() dummy_builder.info.splits.add(SplitInfo("train", num_examples=10)) dummy_builder.info.splits.add(SplitInfo("test", num_examples=10)) for split in dummy_builder.info.splits: writer = ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow"), features=Features({"text": Value("string")}), ) writer.write_batch({"text": ["foo"] * 10}) writer.finalize() dsets = dummy_builder.as_dataset() self.assertIsInstance(dsets, DatasetDict) self.assertListEqual(list(dsets.keys()), ["train", "test"]) self.assertEqual(len(dsets["train"]), 10) self.assertEqual(len(dsets["test"]), 10) self.assertDictEqual(dsets["train"].features, Features({"text": Value("string")})) self.assertDictEqual(dsets["test"].features, Features({"text": Value("string")})) self.assertListEqual(dsets["train"].column_names, ["text"]) self.assertListEqual(dsets["test"].column_names, ["text"]) del dsets dset = dummy_builder.as_dataset("train") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train") self.assertEqual(len(dset), 10) self.assertDictEqual(dset.features, Features({"text": Value("string")})) self.assertListEqual(dset.column_names, ["text"]) del dset dset = dummy_builder.as_dataset("train+test[:30%]") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train+test[:30%]") self.assertEqual(len(dset), 13) self.assertDictEqual(dset.features, Features({"text": Value("string")})) self.assertListEqual(dset.column_names, ["text"]) del dset
def test_load_dataset_builder_for_absolute_script_dir( dataset_loading_script_dir, data_dir): builder = datasets.load_dataset_builder(dataset_loading_script_dir, data_dir=data_dir) assert isinstance(builder, DatasetBuilder) assert builder.name == DATASET_LOADING_SCRIPT_NAME assert builder.info.features == Features({"text": Value("string")})
def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio(decode=False)}) dset = Dataset.from_dict(data, features=features) with dset.formatted_as("numpy"): item = dset[0] assert item.keys() == {"audio"} assert item["audio"] == {"path": audio_path, "bytes": None} batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert batch["audio"][0] == {"path": audio_path, "bytes": None} column = dset["audio"] assert len(column) == 1 assert column[0] == {"path": audio_path, "bytes": None} with dset.formatted_as("pandas"): item = dset[0] assert item.shape == (1, 1) assert item.columns == ["audio"] assert item["audio"][0] == {"path": audio_path, "bytes": None} batch = dset[:1] assert batch.shape == (1, 1) assert batch.columns == ["audio"] assert batch["audio"][0] == {"path": audio_path, "bytes": None} column = dset["audio"] assert len(column) == 1 assert column[0] == {"path": audio_path, "bytes": None}
def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): audio_filename = "test_audio_44100.mp3" data = {"audio": []} for file_path, file_obj in iter_archive(tar_mp3_path): data["audio"].append({"path": file_path, "bytes": file_obj.read()}) break features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} assert item["audio"].keys() == {"path", "array", "sampling_rate"} assert item["audio"]["path"] == audio_filename assert item["audio"]["array"].shape == (109440, ) assert item["audio"]["sampling_rate"] == 44100 batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} assert batch["audio"][0]["path"] == audio_filename assert batch["audio"][0]["array"].shape == (109440, ) assert batch["audio"][0]["sampling_rate"] == 44100 column = dset["audio"] assert len(column) == 1 assert column[0].keys() == {"path", "array", "sampling_rate"} assert column[0]["path"] == audio_filename assert column[0]["array"].shape == (109440, ) assert column[0]["sampling_rate"] == 44100
def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.mp3") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item["audio"]["sampling_rate"] == 44100 dset = dset.cast_column("audio", Audio(sampling_rate=16000)) item = dset[0] assert item.keys() == {"audio"} assert item["audio"].keys() == {"path", "array", "sampling_rate"} assert item["audio"]["path"] == audio_path assert item["audio"]["array"].shape == (39707,) assert item["audio"]["sampling_rate"] == 16000 batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} assert batch["audio"][0]["path"] == audio_path assert batch["audio"][0]["array"].shape == (39707,) assert batch["audio"][0]["sampling_rate"] == 16000 column = dset["audio"] assert len(column) == 1 assert column[0].keys() == {"path", "array", "sampling_rate"} assert column[0]["path"] == audio_path assert column[0]["array"].shape == (39707,) assert column[0]["sampling_rate"] == 16000
def dataset(): n = 10 features = Features({ "tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"])), "answers": Sequence({ "text": Value("string"), "answer_start": Value("int32"), }), "id": Value("int64"), }) dataset = Dataset.from_dict( { "tokens": [["foo"] * 5] * n, "labels": [[1] * 5] * n, "answers": [{ "answer_start": [97], "text": ["1976"] }] * 10, "id": list(range(n)), }, features=features, ) return dataset
def test_dataset_with_audio_feature_map_is_decoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path], "text": ["Hello"]} features = Features({"audio": Audio(), "text": Value("string")}) dset = Dataset.from_dict(data, features=features) def process_audio_sampling_rate_by_example(example): example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"] return example decoded_dset = dset.map(process_audio_sampling_rate_by_example) for item in decoded_dset: assert item.keys() == {"audio", "text", "double_sampling_rate"} assert item["double_sampling_rate"] == 88200 def process_audio_sampling_rate_by_batch(batch): double_sampling_rates = [] for audio in batch["audio"]: double_sampling_rates.append(2 * audio["sampling_rate"]) batch["double_sampling_rate"] = double_sampling_rates return batch decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True) for item in decoded_dset: assert item.keys() == {"audio", "text", "double_sampling_rate"} assert item["double_sampling_rate"] == 88200
def test_flatten(self): features = Features({ "foo": { "bar1": Value("int32"), "bar2": { "foobar": Value("string") } } }) _features = features.copy() flattened_features = features.flatten() assert flattened_features == { "foo.bar1": Value("int32"), "foo.bar2.foobar": Value("string") } assert features == _features, "calling flatten shouldn't alter the current features"
def test_load_dataset_zip_jsonl(data_file, streaming, zip_jsonl_path, zip_jsonl_with_dir_path, jsonl_path): data_file_paths = { "zip_jsonl_path": zip_jsonl_path, "zip_jsonl_with_dir_path": zip_jsonl_with_dir_path, "jsonl_path": jsonl_path, } data_files = str(data_file_paths[data_file]) expected_size = 8 if data_file.startswith("zip") else 4 features = Features({ "col_1": Value("string"), "col_2": Value("int32"), "col_3": Value("float32") }) ds = load_dataset("json", split="train", data_files=data_files, features=features, streaming=streaming) if streaming: ds_item_counter = 0 for ds_item in ds: if ds_item_counter == 0: assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0} ds_item_counter += 1 assert ds_item_counter == expected_size else: assert ds.shape[0] == expected_size ds_item = next(iter(ds)) assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
def test_iterable_dataset_map_complex_features(dataset: IterableDataset, generate_examples_fn): # https://github.com/huggingface/datasets/issues/3505 ex_iterable = ExamplesIterable(generate_examples_fn, {"label": "positive"}) features = Features( { "id": Value("int64"), "label": Value("string"), } ) dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features)) dataset = dataset.cast_column("label", ClassLabel(names=["negative", "positive"])) dataset = dataset.map(lambda x: {"id+1": x["id"] + 1, **x}) assert isinstance(dataset._ex_iterable, MappedExamplesIterable) features["label"] = ClassLabel(names=["negative", "positive"]) assert [{k: v for k, v in ex.items() if k != "id+1"} for ex in dataset] == [ features.encode_example(ex) for _, ex in ex_iterable ]
def test_load_dataset_builder_for_community_dataset_with_script(): builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER) assert isinstance(builder, DatasetBuilder) assert builder.name == SAMPLE_DATASET_IDENTIFIER.split("/")[-1] assert builder.info.features == Features({"text": Value("string")}) namespace = SAMPLE_DATASET_IDENTIFIER[: SAMPLE_DATASET_IDENTIFIER.index("/")] assert builder._relative_data_dir().startswith(namespace) assert SAMPLE_DATASET_IDENTIFIER.replace("/", "___") in builder.__module__
def test_load_dataset_streaming_csv(path_extension, streaming, csv_path, bz2_csv_path): paths = {"csv": csv_path, "csv.bz2": bz2_csv_path} data_files = str(paths[path_extension]) features = Features({"col_1": Value("string"), "col_2": Value("int32"), "col_3": Value("float32")}) ds = load_dataset("csv", split="train", data_files=data_files, features=features, streaming=streaming) assert isinstance(ds, IterableDataset if streaming else Dataset) ds_item = next(iter(ds)) assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
def test_load_dataset_builder_for_relative_script_dir(dataset_loading_script_dir, data_dir): with set_current_working_directory_to_temp_dir(): relative_script_dir = DATASET_LOADING_SCRIPT_NAME shutil.copytree(dataset_loading_script_dir, relative_script_dir) builder = datasets.load_dataset_builder(relative_script_dir, data_dir=data_dir) assert isinstance(builder, DatasetBuilder) assert builder.name == DATASET_LOADING_SCRIPT_NAME assert builder.info.features == Features({"text": Value("string")})