def test_encode_batch_with_example_with_empty_first_elem():
    """Check `Features.encode_batch` on a batch containing an empty leading inner list.

    The schema is a doubly nested sequence of class labels. The second
    example in the batch begins with an empty inner list; the encoded
    output is expected to keep that empty list and to replace the labels
    "a" and "b" with 0 and 1 respectively.
    """
    label_feature = ClassLabel(names=["a", "b"])
    schema = Features({"x": Sequence(Sequence(label_feature))})

    raw_batch = {
        "x": [
            [["a"], ["b"]],
            # This example's first inner sequence is empty.
            [[], ["b"]],
        ]
    }
    expected = {"x": [[[0], [1]], [[], [1]]]}

    assert schema.encode_batch(raw_batch) == expected
def test_dataset_with_audio_feature_map_is_not_decoded(shared_datadir):
    """Check that undecoded iteration yields encoded audio before and after `map`.

    A one-row dataset with an audio column and a text column is built, then
    iterated with ``decoded=False``. Each row must equal the encoded form
    produced by ``Features.encode_batch``. The same must hold after a
    ``map`` call that modifies only the text column.

    Args:
        shared_datadir: fixture supporting the ``/`` operator, used to
            locate ``test_audio_44100.wav``.
    """
    wav_file = str(shared_datadir / "test_audio_44100.wav")
    raw = {"audio": [wav_file], "text": ["Hello"]}
    schema = Features({"audio": Audio(), "text": Value("string")})
    source_ds = Dataset.from_dict(raw, features=schema)
    encoded_audio = schema.encode_batch(raw)["audio"][0]

    def assert_undecoded_rows(ds, text):
        # Every row must expose exactly the two columns, with the audio
        # value equal to the encoded reference.
        for row in ds._iter(decoded=False):
            assert row.keys() == {"audio", "text"}
            assert row == {"audio": encoded_audio, "text": text}

    def add_suffix(example):
        # Only the text column is modified; the audio column is left as is.
        example["text"] += " World!"
        return example

    assert_undecoded_rows(source_ds, "Hello")

    mapped_ds = source_ds.map(add_suffix)
    assert_undecoded_rows(mapped_ds, "Hello World!")