def test_features(self):
    n_rows = 10
    n_cols = 3

    def get_features(dtype):
        return Features({str(i): dtype for i in range(n_cols)})

    with tempfile.TemporaryDirectory() as tmp_dir:
        with open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8") as f:
            f.write("\n".join(",".join(str(i) for i in range(n_cols)) for _ in range(n_rows + 1)))
        for dtype in [Value("float64"), Value("int8"), ClassLabel(num_classes=n_cols)]:
            features = get_features(dtype)
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                features=features,
            )
            self.assertEqual(len(ds), n_rows)
            self.assertDictEqual(ds.features, features)
            del ds
def test_load_dataset_zip_jsonl(data_file, streaming, zip_jsonl_path, zip_jsonl_with_dir_path, jsonl_path):
    data_file_paths = {
        "zip_jsonl_path": zip_jsonl_path,
        "zip_jsonl_with_dir_path": zip_jsonl_with_dir_path,
        "jsonl_path": jsonl_path,
    }
    data_files = str(data_file_paths[data_file])
    expected_size = 8 if data_file.startswith("zip") else 4
    features = Features({"col_1": Value("string"), "col_2": Value("int32"), "col_3": Value("float32")})
    ds = load_dataset("json", split="train", data_files=data_files, features=features, streaming=streaming)
    if streaming:
        ds_item_counter = 0
        for ds_item in ds:
            if ds_item_counter == 0:
                assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
            ds_item_counter += 1
        assert ds_item_counter == expected_size
    else:
        assert ds.shape[0] == expected_size
        ds_item = next(iter(ds))
        assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
def test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path):
    cache_dir = str(tmp_path)
    dummy_builder = DummyBuilder(cache_dir=cache_dir, name="dummy")
    os.makedirs(dummy_builder.cache_dir)
    dummy_builder.info.splits = SplitDict()
    dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
    dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))
    for info_split in dummy_builder.info.splits:
        with ArrowWriter(
            path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{info_split}.arrow"),
            features=Features({"text": Value("string")}),
        ) as writer:
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()
    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():
        dataset = dummy_builder.as_dataset(split=split, in_memory=in_memory)
    assert isinstance(dataset, expected_dataset_class)
    if isinstance(dataset, DatasetDict):
        assert list(dataset.keys()) == ["train", "test"]
        datasets = dataset.values()
        expected_splits = ["train", "test"]
    elif isinstance(dataset, Dataset):
        datasets = [dataset]
        expected_splits = [split]
    for dataset, expected_split in zip(datasets, expected_splits):
        assert dataset.split == expected_split
        assert len(dataset) == expected_dataset_length
        assert dataset.features == Features({"text": Value("string")})
        # the original line was a bare comparison with no effect; it needs an assert
        assert dataset.column_names == ["text"]
def test_features_dicts_are_synced(self):
    def assert_features_dicts_are_synced(features: Features):
        assert hasattr(features, "_column_requires_decoding")
        assert features.keys() == features._column_requires_decoding.keys()

    features = Features({"foo": Sequence({"bar": {"my_value": Value("int32")}})})
    assert_features_dicts_are_synced(features)
    features["barfoo"] = Image()
    assert_features_dicts_are_synced(features)
    del features["barfoo"]
    assert_features_dicts_are_synced(features)
    features.update({"foobar": Value("string")})
    assert_features_dicts_are_synced(features)
    features.pop("foobar")
    assert_features_dicts_are_synced(features)
    features.popitem()
    assert_features_dicts_are_synced(features)
    features.setdefault("xyz", Value("bool"))
    assert_features_dicts_are_synced(features)
    features.clear()
    assert_features_dicts_are_synced(features)
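# A hedged sketch (not part of the original suite) of the invariant exercised
# above: every mutation of a Features mapping must keep the private
# `_column_requires_decoding` dict in step, so decodable columns such as
# Image or Audio are still recognized after edits.
def _sketch_features_decoding_sync():
    features = Features({"caption": Value("string")})
    features["picture"] = Image()  # decodable column added after construction
    assert features.keys() == features._column_requires_decoding.keys()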
def __init__(self):
    super(TernaryNaturalLanguageInference, self).__init__(
        num_classes=3,
        input_schema=Schema(
            features=OrderedDict(
                [
                    ("premise", Value(dtype="string")),
                    ("hypothesis", Value(dtype="string")),
                ]
            ),
            grounding_candidates={
                "premise": {"premise", "sentence1"},
                "hypothesis": {"hypothesis", "sentence2"},
            },
        ),
        output_schema=Schema(
            features=OrderedDict(
                [
                    ("label", ClassLabel(names=["entailment", "neutral", "contradiction"])),
                ]
            ),
            grounding_candidates={
                "label": {"label"},
            },
        ),
        identifier=self.__class__.__name__,
    )
def test_flatten(self):
    dset_split = Dataset.from_dict(
        {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
        features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
    )
    dset = DatasetDict({"train": dset_split, "test": dset_split})
    dset = dset.flatten()
    self.assertDictEqual(dset.column_names, {"train": ["a.b.c", "foo"], "test": ["a.b.c", "foo"]})
    self.assertListEqual(sorted(dset["train"].features.keys()), ["a.b.c", "foo"])
    self.assertDictEqual(
        dset["train"].features,
        Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}),
    )
    del dset
def test_iterable_dataset_cast(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 10})
    features = Features({"id": Value("int64"), "label": Value("int64")})
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    new_features = Features({"id": Value("int64"), "label": Value("bool")})
    casted_dataset = dataset.cast(new_features)
    assert list(casted_dataset) == [new_features.encode_example(ex) for _, ex in ex_iterable]
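# Hedged usage sketch: the same cast works on a streaming load_dataset result.
# The file name "data.jsonl" and its columns are hypothetical.
def _sketch_streaming_cast():
    ds = load_dataset("json", data_files="data.jsonl", split="train", streaming=True)
    ds = ds.cast(Features({"id": Value("int64"), "label": Value("bool")}))
    return next(iter(ds))  # examples are re-encoded lazily, one at a time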
def dataset():
    n = 10
    features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            "answers": Sequence(
                {
                    "text": Value("string"),
                    "answer_start": Value("int32"),
                }
            ),
            "id": Value("int64"),
        }
    )
    dataset = Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            "answers": [{"answer_start": [97], "text": ["1976"]}] * n,
            "id": list(range(n)),
        },
        features=features,
    )
    return dataset
def test_from_dict(self): input_schema = Features({"text": Value("string")}) label_schema = Features({"summary": Value("string")}) template_dict = {"text_column": "input_text", "summary_column": "input_summary"} task = Summarization.from_dict(template_dict) self.assertEqual("summarization", task.task) self.assertEqual(input_schema, task.input_schema) self.assertEqual(label_schema, task.label_schema)
def test_load_dataset_streaming_csv(path_extension, streaming, csv_path, bz2_csv_path):
    paths = {"csv": csv_path, "csv.bz2": bz2_csv_path}
    data_files = str(paths[path_extension])
    features = Features({"col_1": Value("string"), "col_2": Value("int32"), "col_3": Value("float32")})
    ds = load_dataset("csv", split="train", data_files=data_files, features=features, streaming=streaming)
    assert isinstance(ds, IterableDataset if streaming else Dataset)
    ds_item = next(iter(ds))
    assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
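# Hedged sketch: with streaming=True, load_dataset returns an IterableDataset
# that parses (and, for .bz2, decompresses) the CSV on the fly instead of
# materializing Arrow files first. "table.csv.bz2" is a hypothetical path.
def _sketch_streaming_csv():
    ds = load_dataset("csv", data_files="table.csv.bz2", split="train", streaming=True)
    return next(iter(ds))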
def test_caching(self):
    n_rows = 10
    features = Features({"foo": Value("string"), "bar": Value("string")})
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Use \n for newlines. Windows automatically adds the \r when writing the file
        # see https://docs.python.org/3/library/os.html#os.linesep
        with open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8") as f:
            f.write("\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1)))
        ds = load_dataset(
            "csv",
            data_files=os.path.join(tmp_dir, "table.csv"),
            cache_dir=tmp_dir,
            split="train",
            keep_in_memory=False,
        )
        data_file = ds.cache_files[0]["filename"]
        fingerprint = ds._fingerprint
        self.assertEqual(len(ds), n_rows)
        del ds
        # Reloading the same file reuses the cached Arrow file and fingerprint
        ds = load_dataset(
            "csv",
            data_files=os.path.join(tmp_dir, "table.csv"),
            cache_dir=tmp_dir,
            split="train",
            keep_in_memory=False,
        )
        self.assertEqual(ds.cache_files[0]["filename"], data_file)
        self.assertEqual(ds._fingerprint, fingerprint)
        del ds
        # Passing explicit features produces a different cache file and fingerprint
        ds = load_dataset(
            "csv",
            data_files=os.path.join(tmp_dir, "table.csv"),
            cache_dir=tmp_dir,
            split="train",
            features=features,
            keep_in_memory=False,
        )
        self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
        self.assertNotEqual(ds._fingerprint, fingerprint)
        del ds
        # Changing the file contents also invalidates the cache
        with open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8") as f:
            f.write("\n".join(",".join(["Foo", "Bar"]) for _ in range(n_rows + 1)))
        ds = load_dataset(
            "csv",
            data_files=os.path.join(tmp_dir, "table.csv"),
            cache_dir=tmp_dir,
            split="train",
            keep_in_memory=False,
        )
        self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
        self.assertNotEqual(ds._fingerprint, fingerprint)
        self.assertEqual(len(ds), n_rows)
        del ds
def _info(self): return MetricInfo( description="dummy metric for tests", citation="insert citation here", features=Features({ "inputs": Value("int64"), "targets": Value("int64") }), )
def _info(self): return MetricInfo( description="dummy metric for tests", citation="insert citation here", features=Features({ "predictions": Value("int64"), "references": Value("int64") }), )
def test_cache_dir_for_features(self):
    with tempfile.TemporaryDirectory() as tmp_dir:
        f1 = Features({"id": Value("int8")})
        f2 = Features({"id": Value("int32")})
        dummy_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f1)
        other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f1)
        self.assertEqual(dummy_builder.cache_dir, other_builder.cache_dir)
        other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f2)
        self.assertNotEqual(dummy_builder.cache_dir, other_builder.cache_dir)
def test_from_arrow_schema_simple(self):
    data = {"a": [{"b": {"c": "text"}}] * 10, "foo": [1] * 10}
    original_features = Features({"a": {"b": {"c": Value("string")}}, "foo": Value("int64")})
    dset = Dataset.from_dict(data, features=original_features)
    new_features = dset.features
    new_dset = Dataset.from_dict(data, features=new_features)
    self.assertEqual(original_features.type, new_features.type)
    self.assertDictEqual(dset[0], new_dset[0])
    self.assertDictEqual(dset[:], new_dset[:])
def test_align_labels_with_mapping(self):
    train_features = Features(
        {
            "input_text": Value("string"),
            "input_labels": ClassLabel(num_classes=3, names=["entailment", "neutral", "contradiction"]),
        }
    )
    test_features = Features(
        {
            "input_text": Value("string"),
            "input_labels": ClassLabel(num_classes=3, names=["entailment", "contradiction", "neutral"]),
        }
    )
    train_data = {"input_text": ["a", "a", "b", "b", "c", "c"], "input_labels": [0, 0, 1, 1, 2, 2]}
    test_data = {"input_text": ["a", "a", "c", "c", "b", "b"], "input_labels": [0, 0, 1, 1, 2, 2]}
    label2id = {"CONTRADICTION": 0, "ENTAILMENT": 2, "NEUTRAL": 1}
    id2label = {v: k for k, v in label2id.items()}
    train_expected_labels = [2, 2, 1, 1, 0, 0]
    test_expected_labels = [2, 2, 0, 0, 1, 1]
    train_expected_label_names = [id2label[idx] for idx in train_expected_labels]
    test_expected_label_names = [id2label[idx] for idx in test_expected_labels]
    dsets = DatasetDict(
        {
            "train": Dataset.from_dict(train_data, features=train_features),
            "test": Dataset.from_dict(test_data, features=test_features),
        }
    )
    dsets = dsets.align_labels_with_mapping(label2id, "input_labels")
    self.assertListEqual(train_expected_labels, dsets["train"]["input_labels"])
    self.assertListEqual(test_expected_labels, dsets["test"]["input_labels"])
    train_aligned_label_names = [
        dsets["train"].features["input_labels"].int2str(idx) for idx in dsets["train"]["input_labels"]
    ]
    test_aligned_label_names = [
        dsets["test"].features["input_labels"].int2str(idx) for idx in dsets["test"]["input_labels"]
    ]
    self.assertListEqual(train_expected_label_names, train_aligned_label_names)
    self.assertListEqual(test_expected_label_names, test_aligned_label_names)
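# Hedged sketch: the typical use of align_labels_with_mapping is reordering
# dataset label ids to match a pretrained model's label2id mapping; matching
# tolerates the uppercase keys seen in the test above. The dataset name below
# is only illustrative.
def _sketch_align_labels():
    label2id = {"CONTRADICTION": 0, "NEUTRAL": 1, "ENTAILMENT": 2}
    ds = load_dataset("glue", "mnli", split="train")
    return ds.align_labels_with_mapping(label2id, "label")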
def test_cast(self):
    dset = self._create_dummy_dataset_dict(multiple_columns=True)
    features = dset["train"].features
    features["col_1"] = Value("float64")
    dset = dset.cast(features)
    for dset_split in dset.values():
        self.assertEqual(dset_split.num_columns, 2)
        self.assertEqual(dset_split.features["col_1"], Value("float64"))
        self.assertIsInstance(dset_split[0]["col_1"], float)
    del dset
def _info(self): return MetricInfo( description="dummy metric for tests", citation="insert citation here", features=Features( {"predictions": Sequence(Value("int64")), "references": Sequence(Value("int64"))} if self.config_name == "multilabel" else {"predictions": Value("int64"), "references": Value("int64")} ), )
def test_from_dict(self): input_schema = Features({"audio_file_path": Value("string")}) label_schema = Features({"transcription": Value("string")}) template_dict = { "audio_file_path_column": "input_audio_file_path", "transcription_column": "input_transcription", } task = AutomaticSpeechRecognition.from_dict(template_dict) self.assertEqual("automatic-speech-recognition", task.task) self.assertEqual(input_schema, task.input_schema) self.assertEqual(label_schema, task.label_schema)
def test_load_dataset_zip_csv(zip_csv_path):
    data_files = str(zip_csv_path)
    features = Features({"col_1": Value("string"), "col_2": Value("int32"), "col_3": Value("float32")})
    ds = load_dataset("csv", split="train", data_files=data_files, features=features)
    ds_item = next(iter(ds))
    assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
def test_flatten_with_sequence(self):
    features = Features({"foo": Sequence({"bar": {"my_value": Value("int32")}})})
    _features = features.copy()
    flattened_features = features.flatten()
    assert flattened_features == {"foo.bar": [{"my_value": Value("int32")}]}
    assert features == _features, "calling flatten shouldn't alter the current features"
def test_from_excel_file(resources_data_path):
    """Shows one way to read an Excel file into a Dataset via pandas."""
    str_value = Value("string")
    int_value = Value("int64")
    features = Features(Notification=int_value, Type=str_value, Plant=int_value, Serial=str_value)
    file_path = resources_data_path / "test.xlsx"
    df = pd.read_excel(file_path)
    dataset = Dataset.from_pandas(df, features=features)
    assert len(dataset) > 0
def test_as_dataset(self):
    with tempfile.TemporaryDirectory() as tmp_dir:
        dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
        os.makedirs(dummy_builder.cache_dir)
        dummy_builder.info.splits = SplitDict()
        dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
        dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))
        for split in dummy_builder.info.splits:
            writer = ArrowWriter(
                path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow"),
                features=Features({"text": Value("string")}),
            )
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

        dsets = dummy_builder.as_dataset()
        self.assertIsInstance(dsets, DatasetDict)
        self.assertListEqual(list(dsets.keys()), ["train", "test"])
        self.assertEqual(len(dsets["train"]), 10)
        self.assertEqual(len(dsets["test"]), 10)
        self.assertDictEqual(dsets["train"].features, Features({"text": Value("string")}))
        self.assertDictEqual(dsets["test"].features, Features({"text": Value("string")}))
        self.assertListEqual(dsets["train"].column_names, ["text"])
        self.assertListEqual(dsets["test"].column_names, ["text"])
        del dsets

        dset = dummy_builder.as_dataset("train")
        self.assertIsInstance(dset, Dataset)
        self.assertEqual(dset.split, "train")
        self.assertEqual(len(dset), 10)
        self.assertDictEqual(dset.features, Features({"text": Value("string")}))
        self.assertListEqual(dset.column_names, ["text"])
        del dset

        dset = dummy_builder.as_dataset("train+test[:30%]")
        self.assertIsInstance(dset, Dataset)
        self.assertEqual(dset.split, "train+test[:30%]")
        self.assertEqual(len(dset), 13)
        self.assertDictEqual(dset.features, Features({"text": Value("string")}))
        self.assertListEqual(dset.column_names, ["text"])
        del dset
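# Hedged sketch: the split argument understands slicing and concatenation, so
# "train+test[:30%]" above yields all 10 train rows plus the first 30% of the
# 10 test rows (3 rows, 13 total). load_dataset accepts the same syntax; the
# dataset name below is hypothetical.
def _sketch_split_slicing():
    return load_dataset("my_dataset", split="train[:100]+validation[-50:]")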
def __init__(self):
    super(BinarySentiment, self).__init__(
        num_classes=2,
        input_schema=Schema(
            features=OrderedDict(
                [
                    ("text", Value(dtype="string")),
                ]
            ),
            grounding_candidates={
                "text": {"text", "sentence"},
            },
        ),
        output_schema=Schema(
            features=OrderedDict(
                [
                    ("label", ClassLabel(names=["negative", "positive"])),
                ]
            ),
            grounding_candidates={
                "label": {"label"},
            },
        ),
        identifier=self.__class__.__name__,
    )
def test_datasetdict_from_text(split, features, keep_in_memory, text_path, tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_text(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory)
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_dataset_with_audio_feature_map_is_decoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path], "text": ["Hello"]}
    features = Features({"audio": Audio(), "text": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    def process_audio_sampling_rate_by_example(example):
        example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"]
        return example

    decoded_dset = dset.map(process_audio_sampling_rate_by_example)
    for item in decoded_dset:
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200

    def process_audio_sampling_rate_by_batch(batch):
        double_sampling_rates = []
        for audio in batch["audio"]:
            double_sampling_rates.append(2 * audio["sampling_rate"])
        batch["double_sampling_rate"] = double_sampling_rates
        return batch

    decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200
def test_load_dataset_builder_for_absolute_script_dir(dataset_loading_script_dir, data_dir):
    builder = datasets.load_dataset_builder(dataset_loading_script_dir, data_dir=data_dir)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == DATASET_LOADING_SCRIPT_NAME
    assert builder.info.features == Features({"text": Value("string")})
def test_flatten(self):
    features = Features({"foo": {"bar1": Value("int32"), "bar2": {"foobar": Value("string")}}})
    _features = features.copy()
    flattened_features = features.flatten()
    assert flattened_features == {"foo.bar1": Value("int32"), "foo.bar2.foobar": Value("string")}
    assert features == _features, "calling flatten shouldn't alter the current features"
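# Hedged sketch of the flatten contract checked above: nested dict features
# collapse into dotted top-level keys, and the original Features object is
# left untouched.
def _sketch_flatten():
    features = Features({"a": {"b": Value("string")}})
    assert features.flatten() == {"a.b": Value("string")}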
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path, tmp_path):
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_csv(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory)
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_iterable_dataset_map_complex_features(dataset: IterableDataset, generate_examples_fn):
    # https://github.com/huggingface/datasets/issues/3505
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": "positive"})
    features = Features(
        {
            "id": Value("int64"),
            "label": Value("string"),
        }
    )
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    dataset = dataset.cast_column("label", ClassLabel(names=["negative", "positive"]))
    dataset = dataset.map(lambda x: {"id+1": x["id"] + 1, **x})
    assert isinstance(dataset._ex_iterable, MappedExamplesIterable)
    features["label"] = ClassLabel(names=["negative", "positive"])
    assert [{k: v for k, v in ex.items() if k != "id+1"} for ex in dataset] == [
        features.encode_example(ex) for _, ex in ex_iterable
    ]