def test_dataset_with_image_feature_with_none():
    data = {"image": [None]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert item["image"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(item is None for item in batch["image"])
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests
    data = {"images": [[None]]}
    features = Features({"images": Sequence(Image())})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"images"}
    assert all(i is None for i in item["images"])

    data = {"nested": [{"image": None}]}
    features = Features({"nested": {"image": Image()}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"image"}
    assert item["nested"]["image"] is None
def test_text_datasetdict_reader(split, features, keep_in_memory, text_path, tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = TextDatasetReader(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_datasetdict_from_json(split, features, keep_in_memory, jsonl_path, tmp_path):
    file_path = jsonl_path
    field = None
    if split:
        path = {split: file_path}
    else:
        split = "train"
        path = {"train": file_path, "test": file_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_json(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, field=field
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_image_feature_type_to_arrow():
    features = Features({"image": Image()})
    assert features.arrow_schema == pa.schema({"image": Image().pa_type})

    features = Features({"struct_containing_an_image": {"image": Image()}})
    assert features.arrow_schema == pa.schema({"struct_containing_an_image": pa.struct({"image": Image().pa_type})})

    features = Features({"sequence_of_images": Sequence(Image())})
    assert features.arrow_schema == pa.schema({"sequence_of_images": pa.list_(Image().pa_type)})
def test_csv_dataset_reader(path_type, split, features, keep_in_memory, csv_path, tmp_path):
    if issubclass(path_type, str):
        path = csv_path
    elif issubclass(path_type, list):
        path = [csv_path]
    cache_dir = tmp_path / "cache"
    expected_split = str(split) if split else "train"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = CsvDatasetReader(
            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        ).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == expected_split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_flatten(self):
    dset_split = Dataset.from_dict(
        {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
        features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
    )
    dset = DatasetDict({"train": dset_split, "test": dset_split})
    dset = dset.flatten()
    self.assertDictEqual(dset.column_names, {"train": ["a.b.c", "foo"], "test": ["a.b.c", "foo"]})
    self.assertListEqual(list(dset["train"].features.keys()), ["a.b.c", "foo"])
    self.assertDictEqual(
        dset["train"].features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")})
    )
    del dset
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path, tmp_path):
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_csv(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory)
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_text_dataset_reader(path_type, split, features, keep_in_memory, text_path, tmp_path):
    if issubclass(path_type, str):
        path = text_path
    elif issubclass(path_type, list):
        path = [text_path]
    cache_dir = tmp_path / "cache"
    expected_split = str(split) if split else "train"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = TextDatasetReader(
            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        ).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == expected_split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_shuffle(self):
    with tempfile.TemporaryDirectory() as tmp_dir:
        dsets = self._create_dummy_dataset_dict()

        indices_cache_file_names = {
            "train": os.path.join(tmp_dir, "train.arrow"),
            "test": os.path.join(tmp_dir, "test.arrow"),
        }
        seeds = {
            "train": 1234,
            "test": 1234,
        }
        dsets_shuffled = dsets.shuffle(
            seeds=seeds, indices_cache_file_names=indices_cache_file_names, load_from_cache_file=False
        )
        self.assertListEqual(dsets_shuffled["train"]["filename"], dsets_shuffled["test"]["filename"])

        self.assertEqual(len(dsets_shuffled["train"]), 30)
        self.assertEqual(dsets_shuffled["train"][0]["filename"], "my_name-train_028")
        self.assertEqual(dsets_shuffled["train"][2]["filename"], "my_name-train_010")
        self.assertDictEqual(dsets["train"].features, Features({"filename": Value("string")}))
        self.assertDictEqual(dsets_shuffled["train"].features, Features({"filename": Value("string")}))

        # Reproducibility
        indices_cache_file_names_2 = {
            "train": os.path.join(tmp_dir, "train_2.arrow"),
            "test": os.path.join(tmp_dir, "test_2.arrow"),
        }
        dsets_shuffled_2 = dsets.shuffle(
            seeds=seeds, indices_cache_file_names=indices_cache_file_names_2, load_from_cache_file=False
        )
        self.assertListEqual(dsets_shuffled["train"]["filename"], dsets_shuffled_2["train"]["filename"])

        seeds = {
            "train": 1234,
            "test": 1,
        }
        indices_cache_file_names_3 = {
            "train": os.path.join(tmp_dir, "train_3.arrow"),
            "test": os.path.join(tmp_dir, "test_3.arrow"),
        }
        dsets_shuffled_3 = dsets.shuffle(
            seeds=seeds, indices_cache_file_names=indices_cache_file_names_3, load_from_cache_file=False
        )
        self.assertNotEqual(dsets_shuffled_3["train"]["filename"], dsets_shuffled_3["test"]["filename"])

        # other input types
        dsets_shuffled_int = dsets.shuffle(42)
        dsets_shuffled_alias = dsets.shuffle(seed=42)
        dsets_shuffled_none = dsets.shuffle()
        self.assertEqual(len(dsets_shuffled_int["train"]), 30)
        self.assertEqual(len(dsets_shuffled_alias["train"]), 30)
        self.assertEqual(len(dsets_shuffled_none["train"]), 30)

        del dsets, dsets_shuffled, dsets_shuffled_2, dsets_shuffled_3
        del dsets_shuffled_int, dsets_shuffled_alias, dsets_shuffled_none
def test_dataset_concatenate_image_features(shared_datadir):
    # we use a different data structure between 1 and 2 to make sure they are compatible with each other
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data1 = {"image": [image_path]}
    dset1 = Dataset.from_dict(data1, features=Features({"image": Image()}))
    data2 = {"image": [{"bytes": open(image_path, "rb").read()}]}
    dset2 = Dataset.from_dict(data2, features=Features({"image": Image()}))
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert concatenated_dataset[0]["image"] == dset1[0]["image"]
    assert concatenated_dataset[1]["image"] == dset2[0]["image"]
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####
    retriever = SparseRetrieval(
        tokenize_fn=tokenize,
        data_path="./data",
        context_path="wikipedia_documents.json",
        # context_path="all_wikipedia_documents.json"
    )

    # sparse embedding retrieval
    # retriever.get_sparse_embedding()
    # df = retriever.retrieve(datasets['validation'])

    # bm25 retrieval
    # retriever.get_embedding_BM25()
    # df = retriever.retrieve_BM25(query_or_dataset=datasets['validation'], topk=10)

    # elastic search retrieval
    # retriever.get_elastic_search()
    df = retriever.retrieve_ES(query_or_dataset=datasets['validation'], topk=10)

    # faiss retrieval
    # df = retriever.retrieve_faiss(dataset['validation'])

    if training_args.do_predict:
        # The test data has no answers, so the dataset consists of id, question, and context only.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })
    elif training_args.do_eval:
        # The train data has answers, so the dataset consists of id, question, context, and answers.
        f = Features({
            'answers': Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            }, length=-1, id=None),
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
def run_sparse_retrieval(datasets, training_args, inf_args):
    #### retrieval process ####
    if inf_args.retrieval is None:
        retriever = SparseRetrieval_BM25PLUS(
            tokenize_fn=tokenize,
            data_path="./data",
            context_path="wikipedia_documents.json")
    elif inf_args.retrieval.lower() == "sparse":
        retriever = SparseRetrieval(
            tokenize_fn=tokenize,
            data_path="./data",
            context_path="wikipedia_documents.json")
    # elif inf_args.retrieval.lower() == "bm25":
    #     retriever = SparseRetrieval_BM25(tokenize_fn=tokenize,
    #                                      data_path="./data",
    #                                      context_path="wikipedia_documents.json")

    retriever.get_sparse_embedding()
    df = retriever.retrieve(datasets['validation'], inf_args.k)
    # faiss retrieval

    # The test data has no answers, so the dataset consists of id, question, and context only.
    if training_args.do_predict:
        f = Features({
            'contexts': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })
    # The train data has answers, so the dataset consists of id, question, context, and answers.
    elif training_args.do_eval:
        f = Features({
            'answers': Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            }, length=-1, id=None),
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """
    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(train_df, stratify=train_df["label"], random_state=random_state)

    for df in [train_df, dev_df, test_df]:
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=len(id2label), names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
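# Hypothetical usage sketch (not part of the original module). It assumes this file is
# importable and that `paths`, `label2id`, and `id2label` are already populated for the
# requested language; names here are placeholders, not a definitive API.
#
#     train_ds, dev_ds, test_ds = load_datasets(lang="es", random_state=2021)
#     print(train_ds.features["label"].names)  # ClassLabel names derived from id2label
#     print(train_ds[0]["text"])               # a preprocessed tweet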
def test_dataset_with_image_feature_from_np_array():
    import PIL.Image

    image_array = np.arange(640 * 480, dtype=np.uint8).reshape(480, 640)
    data = {"image": [image_array]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    np.testing.assert_array_equal(np.array(item["image"]), image_array)
    assert item["image"].filename == ""
    assert item["image"].format == "PNG"
    assert item["image"].size == (640, 480)
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    np.testing.assert_array_equal(np.array(batch["image"][0]), image_array)
    assert batch["image"][0].filename == ""
    assert batch["image"][0].format == "PNG"
    assert batch["image"][0].size == (640, 480)
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    np.testing.assert_array_equal(np.array(column[0]), image_array)
    assert column[0].filename == ""
    assert column[0].format == "PNG"
    assert column[0].size == (640, 480)
def test_push_dataset_to_hub_custom_features_image(self):
    image_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_image_rgb.jpg")
    data = {"x": [image_path, None], "y": [0, -1]}
    features = Features({"x": Image(), "y": Value("int32")})
    ds = Dataset.from_dict(data, features=features)

    for embed_external_files in [True, False]:
        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
            self.assertDictEqual(ds.features, hub_ds.features)
            self.assertEqual(ds[:], hub_ds[:])
            hub_ds = hub_ds.cast_column("x", Image(decode=False))
            elem = hub_ds[0]["x"]
            path, bytes_ = elem["path"], elem["bytes"]
            self.assertTrue(bool(path) == (not embed_external_files))
            self.assertTrue(bool(bytes_) == embed_external_files)
        finally:
            self._api.delete_repo(
                ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
            )
def test_push_dataset_to_hub_custom_features_audio(self):
    audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav")
    data = {"x": [audio_path, None], "y": [0, -1]}
    features = Features({"x": Audio(), "y": Value("int32")})
    ds = Dataset.from_dict(data, features=features)

    for embed_external_files in [True, False]:
        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
            self.assertDictEqual(ds.features, hub_ds.features)
            np.testing.assert_equal(ds[0]["x"]["array"], hub_ds[0]["x"]["array"])
            # don't test hub_ds[0] since audio decoding might be slightly different
            self.assertEqual(ds[1], hub_ds[1])
            hub_ds = hub_ds.cast_column("x", Audio(decode=False))
            elem = hub_ds[0]["x"]
            path, bytes_ = elem["path"], elem["bytes"]
            self.assertTrue(bool(path) == (not embed_external_files))
            self.assertTrue(bool(bytes_) == embed_external_files)
        finally:
            self.cleanup_repo(ds_name)
def build_dataset(df, tokenizer, batch_size):
    features = Features({
        'id': Value('uint64'),
        'context': Value('string'),
        'text': Value('string'),
    })
    dataset = Dataset.from_pandas(df, features=features)
    dataset = dataset.map(
        lambda x: tokenizer(x["text"], x["context"], padding="longest", truncation='longest_first'),
        batched=True,
        batch_size=batch_size,
    )

    def format_dataset(dataset):
        dataset.set_format(
            type='torch',
            columns=['input_ids', 'token_type_ids', 'attention_mask'])
        return dataset

    dataset = format_dataset(dataset)
    return dataset
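# Hypothetical usage sketch (not part of the original module). It assumes a pandas
# DataFrame with `id`, `context`, and `text` columns and a Hugging Face tokenizer;
# the model name and column values below are placeholders.
#
#     import pandas as pd
#     from transformers import AutoTokenizer
#
#     df = pd.DataFrame({
#         "id": [0, 1],
#         "context": ["some passage", "another passage"],
#         "text": ["a query", "another query"],
#     })
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
#     dataset = build_dataset(df, tokenizer, batch_size=32)
#     print(dataset[0]["input_ids"])  # torch tensors after set_format(type='torch')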
def test_dataset_with_image_feature_tar_jpg(tar_jpg_path):
    import PIL.Image

    data = {"image": []}
    for file_path, file_obj in iter_archive(tar_jpg_path):
        data["image"].append({"path": file_path, "bytes": file_obj.read()})
        break
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    assert item["image"].filename == ""
    assert item["image"].format == "JPEG"
    assert item["image"].size == (640, 480)
    assert item["image"].mode == "RGB"
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    assert batch["image"][0].filename == ""
    assert batch["image"][0].format == "JPEG"
    assert batch["image"][0].size == (640, 480)
    assert batch["image"][0].mode == "RGB"
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    assert column[0].filename == ""
    assert column[0].format == "JPEG"
    assert column[0].size == (640, 480)
    assert column[0].mode == "RGB"
def complex_dataset():
    features = {
        "translation": Translation(languages=("en", "fr")),
        "sentiment": ClassLabel(num_classes=2),
    }
    return datasets.Dataset.from_dict(COMPLEX_DATA, Features(features))
def test_push_dataset_dict_to_hub_custom_features(self):
    features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)

    local_ds = DatasetDict({"test": ds})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["test"].features.keys()), list(hub_ds["test"].features.keys()))
        self.assertDictEqual(local_ds["test"].features, hub_ds["test"].features)
    finally:
        self._api.delete_repo(
            ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
        )
def save_data(train_df, val_df):
    train_f = Features({
        'answers': Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        }, length=-1, id=None),
        'context': Value(dtype='string', id=None),
        'id': Value(dtype='string', id=None),
        'question': Value(dtype='string', id=None),
        'question_type': Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train': Dataset.from_pandas(train_df, features=train_f),
        'validation': Dataset.from_pandas(val_df, features=train_f)
    })

    with open("../../data/question_type.pkl", "wb") as file:
        pickle.dump(train_datasets, file)
def test_dataset_with_image_feature(shared_datadir):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    assert os.path.samefile(item["image"].filename, image_path)
    assert item["image"].format == "JPEG"
    assert item["image"].size == (640, 480)
    assert item["image"].mode == "RGB"
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    assert os.path.samefile(batch["image"][0].filename, image_path)
    assert batch["image"][0].format == "JPEG"
    assert batch["image"][0].size == (640, 480)
    assert batch["image"][0].mode == "RGB"
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    assert os.path.samefile(column[0].filename, image_path)
    assert column[0].format == "JPEG"
    assert column[0].size == (640, 480)
    assert column[0].mode == "RGB"
def test_formatted_dataset_with_image_feature_undecoded(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image(decode=False)})
    dset = Dataset.from_dict(data, features=features)
    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"image"}
        assert item["image"] == {"path": image_path, "bytes": None}
        batch = dset[:1]
        assert batch.keys() == {"image"}
        assert len(batch["image"]) == 1
        assert batch["image"][0] == {"path": image_path, "bytes": None}
        column = dset["image"]
        assert len(column) == 1
        assert column[0] == {"path": image_path, "bytes": None}

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["image"]
        assert item["image"][0] == {"path": image_path, "bytes": None}
        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["image"]
        assert batch["image"][0] == {"path": image_path, "bytes": None}
        column = dset["image"]
        assert len(column) == 1
        assert column[0] == {"path": image_path, "bytes": None}
def _set_features(self):
    """Set the features of the dataset."""
    with self.format():
        self.info.features = Features.from_arrow_schema(
            pa.Table.from_pydict(
                self[:1],
            ).schema
        )
def get_etr_dataset(args):
    etr_path = p.join(args.path.train_data_dir, "etr_qa_dataset.json")

    if not p.exists(etr_path):
        raise FileNotFoundError(
            f"ETRI dataset not found: please rename the data file to {etr_path} and place it there.")

    with open(etr_path, "r") as f:
        etr_dict = json.load(f)

    # print(etr_dict["data"][0])

    new_dataset = defaultdict(list)
    cnt = 0

    for datas in etr_dict["data"]:
        title = datas["title"]
        context = datas["paragraphs"][0]["context"]

        for questions in datas["paragraphs"][0]["qas"]:
            question = questions["question"]
            answers = {
                "answer_start": [questions["answers"][0]["answer_start"]],
                "text": [questions["answers"][0]["text"]],
            }

            new_dataset["id"].append(f"etr-custom-{cnt}")
            new_dataset["title"].append(title)
            new_dataset["context"].append(context)
            new_dataset["question"].append(question)
            new_dataset["answers"].append(answers)
            cnt += 1

    f = Features({
        "answers": Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None)
            },
            length=-1,
            id=None,
        ),
        "id": Value(dtype="string", id=None),
        "context": Value(dtype="string", id=None),
        "question": Value(dtype="string", id=None),
        "title": Value(dtype="string", id=None),
    })

    df = pd.DataFrame(new_dataset)
    etr_dataset = Dataset.from_pandas(df, features=f)

    return etr_dataset
def test_dataset_with_image_feature_map(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path], "caption": ["cats sleeping"]}
    features = Features({"image": Image(), "caption": Value("string")})
    dset = Dataset.from_dict(data, features=features)
    for item in dset:
        assert item.keys() == {"image", "caption"}
        assert item == {"image": {"path": image_path, "bytes": None}, "caption": "cats sleeping"}

    # no decoding
    def process_caption(example):
        example["caption"] = "Two " + example["caption"]
        return example

    processed_dset = dset.map(process_caption)
    for item in processed_dset:
        assert item.keys() == {"image", "caption"}
        assert item == {"image": {"path": image_path, "bytes": None}, "caption": "Two cats sleeping"}

    # decoding example
    def process_image_by_example(example):
        example["mode"] = example["image"].mode
        return example

    decoded_dset = dset.map(process_image_by_example)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"

    # decoding batch
    def process_image_by_batch(batch):
        batch["mode"] = [image.mode for image in batch["image"]]
        return batch

    decoded_dset = dset.map(process_image_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"
def test_caching(self):
    n_rows = 10

    features = Features({"foo": Value("string"), "bar": Value("string")})

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Use \n for newline. Windows automatically adds the \r when writing the file
        # see https://docs.python.org/3/library/os.html#os.linesep
        open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
            "\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1))
        )
        ds = load_dataset(
            "csv",
            data_files=os.path.join(tmp_dir, "table.csv"),
            cache_dir=tmp_dir,
            split="train",
            keep_in_memory=False,
        )
        data_file = ds.cache_files[0]["filename"]
        fingerprint = ds._fingerprint
        self.assertEqual(len(ds), n_rows)
        del ds
        ds = load_dataset(
            "csv",
            data_files=os.path.join(tmp_dir, "table.csv"),
            cache_dir=tmp_dir,
            split="train",
            keep_in_memory=False,
        )
        self.assertEqual(ds.cache_files[0]["filename"], data_file)
        self.assertEqual(ds._fingerprint, fingerprint)
        del ds
        ds = load_dataset(
            "csv",
            data_files=os.path.join(tmp_dir, "table.csv"),
            cache_dir=tmp_dir,
            split="train",
            features=features,
            keep_in_memory=False,
        )
        self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
        self.assertNotEqual(ds._fingerprint, fingerprint)
        del ds
        open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
            "\n".join(",".join(["Foo", "Bar"]) for _ in range(n_rows + 1))
        )
        ds = load_dataset(
            "csv",
            data_files=os.path.join(tmp_dir, "table.csv"),
            cache_dir=tmp_dir,
            split="train",
            keep_in_memory=False,
        )
        self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
        self.assertNotEqual(ds._fingerprint, fingerprint)
        self.assertEqual(len(ds), n_rows)
        del ds
def test_parquet_datasetdict_reader_features(features, parquet_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = ParquetDatasetReader({"train": parquet_path}, features=features, cache_dir=cache_dir).read()
    _check_parquet_datasetdict(dataset, expected_features)
def test_dataset_from_text_features(features, text_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = TextDatasetReader(text_path, features=features, cache_dir=cache_dir).read()
    _check_text_dataset(dataset, expected_features)
def test_dataset_from_json_features(features, jsonl_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = JsonDatasetReader(jsonl_path, features=features, cache_dir=cache_dir).read()
    _check_json_dataset(dataset, expected_features)