def test_features(self):
    n_rows = 10
    n_cols = 3

    def get_features(type):
        return Features({str(i): type for i in range(n_cols)})

    with tempfile.TemporaryDirectory() as tmp_dir:
        open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
            "\n".join(",".join([str(i) for i in range(n_cols)]) for _ in range(n_rows + 1))
        )
        for type in [Value("float64"), Value("int8"), ClassLabel(num_classes=n_cols)]:
            features = get_features(type)
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                features=features,
            )
            self.assertEqual(len(ds), n_rows)
            self.assertDictEqual(ds.features, features)
            del ds
def build_dataset(df, tokenizer, batch_size):
    features = Features({
        'id': Value('uint64'),
        'context': Value('string'),
        'text': Value('string'),
    })
    dataset = Dataset.from_pandas(df, features=features)
    dataset = dataset.map(
        lambda x: tokenizer(x["text"], x["context"], padding="longest", truncation='longest_first'),
        batched=True,
        batch_size=batch_size,
    )

    def format_dataset(dataset):
        dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
        return dataset

    dataset = format_dataset(dataset)
    return dataset
def test_flatten(self):
    dset_split = Dataset.from_dict(
        {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
        features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
    )
    dset = DatasetDict({"train": dset_split, "test": dset_split})
    dset = dset.flatten()
    self.assertDictEqual(dset.column_names, {"train": ["a.b.c", "foo"], "test": ["a.b.c", "foo"]})
    self.assertListEqual(list(dset["train"].features.keys()), ["a.b.c", "foo"])
    self.assertDictEqual(
        dset["train"].features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")})
    )
    del dset
def test_cast_array_to_features():
    arr = pa.array([[0, 1]])
    assert cast_array_to_feature(arr, Sequence(Value("string"))).type == pa.list_(pa.string())
    with pytest.raises(TypeError):
        cast_array_to_feature(arr, Sequence(Value("string")), allow_number_to_str=False)
def test_caching(self): n_rows = 10 features = Features({"foo": Value("string"), "bar": Value("string")}) with tempfile.TemporaryDirectory() as tmp_dir: # Use \n for newline. Windows automatically adds the \r when writing the file # see https://docs.python.org/3/library/os.html#os.linesep open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write("\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1))) ds = load_dataset( "csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train", keep_in_memory=False, ) data_file = ds.cache_files[0]["filename"] fingerprint = ds._fingerprint self.assertEqual(len(ds), n_rows) del ds ds = load_dataset( "csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train", keep_in_memory=False, ) self.assertEqual(ds.cache_files[0]["filename"], data_file) self.assertEqual(ds._fingerprint, fingerprint) del ds ds = load_dataset( "csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train", features=features, keep_in_memory=False, ) self.assertNotEqual(ds.cache_files[0]["filename"], data_file) self.assertNotEqual(ds._fingerprint, fingerprint) del ds open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write("\n".join(",".join(["Foo", "Bar"]) for _ in range(n_rows + 1))) ds = load_dataset( "csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train", keep_in_memory=False, ) self.assertNotEqual(ds.cache_files[0]["filename"], data_file) self.assertNotEqual(ds._fingerprint, fingerprint) self.assertEqual(len(ds), n_rows) del ds
def test_cast_(self):
    dset = self._create_dummy_dataset_dict(multiple_columns=True)
    features = dset["train"].features
    features["col_1"] = Value("float64")
    dset.cast_(features)
    for dset_split in dset.values():
        self.assertEqual(dset_split.num_columns, 2)
        self.assertEqual(dset_split.features["col_1"], Value("float64"))
        self.assertIsInstance(dset_split[0]["col_1"], float)
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):
    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage

    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
def make_negative_dataset(args, bm25, queries, answers, contexts, name, num=16):
    total = []
    scores, indices = bm25.get_relevant_doc_bulk(queries, topk=num * 2)

    answers, indices = np.array(answers, dtype="object"), np.array(indices)
    contexts = np.array(contexts, dtype="object")

    for idx, query in enumerate(queries):
        label = idx % num
        answer = answers[idx]
        context_list = contexts[indices[idx]]

        check_in = np.argwhere(context_list == answer)
        if check_in.shape[0] == 0:
            context_list[label] = answer
            context_list = context_list[:num]
        else:
            context_list[check_in[0][0]] = context_list[num]
            context_list[label] = answer
            context_list = context_list[:num]

        if idx % 100 == 0:
            print("query: ", query)
            print("answer: ", answer)
            print("negative:", context_list)
            print("label:", label)

        tmp = {"query": query, "negative_samples": context_list, "label": label}
        total.append(tmp)

    df = pd.DataFrame(total)
    f = Features({
        "query": Value(dtype="string", id=None),
        "negative_samples": Sequence(feature=Value(dtype="string", id=None), length=-1, id=None),
        "label": Value(dtype="int32", id=None),
    })

    dataset = Dataset.from_pandas(df, features=f)
    dataset.save_to_disk(os.path.join(args.path.train_data_dir, name))
def test_shuffle(self):
    with tempfile.TemporaryDirectory() as tmp_dir:
        dsets = self._create_dummy_dataset_dict()
        indices_cache_file_names = {
            "train": os.path.join(tmp_dir, "train.arrow"),
            "test": os.path.join(tmp_dir, "test.arrow"),
        }
        seeds = {
            "train": 1234,
            "test": 1234,
        }
        dsets_shuffled = dsets.shuffle(
            seeds=seeds, indices_cache_file_names=indices_cache_file_names, load_from_cache_file=False
        )
        self.assertListEqual(dsets_shuffled["train"]["filename"], dsets_shuffled["test"]["filename"])
        self.assertEqual(len(dsets_shuffled["train"]), 30)
        self.assertEqual(dsets_shuffled["train"][0]["filename"], "my_name-train_028")
        self.assertEqual(dsets_shuffled["train"][2]["filename"], "my_name-train_010")
        self.assertDictEqual(dsets["train"].features, Features({"filename": Value("string")}))
        self.assertDictEqual(dsets_shuffled["train"].features, Features({"filename": Value("string")}))

        # Reproducibility
        indices_cache_file_names_2 = {
            "train": os.path.join(tmp_dir, "train_2.arrow"),
            "test": os.path.join(tmp_dir, "test_2.arrow"),
        }
        dsets_shuffled_2 = dsets.shuffle(
            seeds=seeds, indices_cache_file_names=indices_cache_file_names_2, load_from_cache_file=False
        )
        self.assertListEqual(dsets_shuffled["train"]["filename"], dsets_shuffled_2["train"]["filename"])

        seeds = {
            "train": 1234,
            "test": 1,
        }
        indices_cache_file_names_3 = {
            "train": os.path.join(tmp_dir, "train_3.arrow"),
            "test": os.path.join(tmp_dir, "test_3.arrow"),
        }
        dsets_shuffled_3 = dsets.shuffle(
            seeds=seeds, indices_cache_file_names=indices_cache_file_names_3, load_from_cache_file=False
        )
        self.assertNotEqual(dsets_shuffled_3["train"]["filename"], dsets_shuffled_3["test"]["filename"])

        del dsets, dsets_shuffled, dsets_shuffled_2, dsets_shuffled_3
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at
    https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv", data_files=data, delimiter="\t", column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents, split_text_n=args.split_text_n, split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(ctx_encoder_name)

    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer, device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )
    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir, "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
def embed_update(ctx_encoder, total_processes, device, process_num, shard_dir, csv_path):
    kb_dataset = load_dataset(
        "csv", data_files=[csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )
    kb_dataset = kb_dataset.map(
        split_documents, batched=True, num_proc=1
    )  # if you want, you can load an already split csv here instead

    kb_list = [kb_dataset.shard(total_processes, i, contiguous=True) for i in range(total_processes)]
    data_shrad = kb_list[process_num]

    arrow_folder = "data_" + str(process_num)
    passages_path = os.path.join(shard_dir, arrow_folder)

    context_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")
    ctx_encoder = ctx_encoder.to(device=device)

    def embed(
        documents: dict, ctx_encoder: DPRContextEncoder, ctx_tokenizer: DPRContextEncoderTokenizerFast, device
    ) -> dict:
        """Compute the DPR embeddings of document passages"""
        input_ids = ctx_tokenizer(
            documents["title"], documents["text"], truncation=True, padding="longest", return_tensors="pt"
        )["input_ids"]
        embeddings = ctx_encoder(input_ids.to(device=device), return_dict=True).pooler_output
        return {"embeddings": embeddings.detach().cpu().numpy()}

    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space

    dataset = data_shrad.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=context_tokenizer, device=device),
        batched=True,
        batch_size=16,
        features=new_features,
    )
    dataset.save_to_disk(passages_path)
def test_push_dataset_to_hub_custom_features_image(self):
    image_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_image_rgb.jpg")
    data = {"x": [image_path, None], "y": [0, -1]}
    features = Features({"x": Image(), "y": Value("int32")})
    ds = Dataset.from_dict(data, features=features)

    for embed_external_files in [True, False]:
        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
            self.assertDictEqual(ds.features, hub_ds.features)
            self.assertEqual(ds[:], hub_ds[:])
            hub_ds = hub_ds.cast_column("x", Image(decode=False))
            elem = hub_ds[0]["x"]
            path, bytes_ = elem["path"], elem["bytes"]
            self.assertTrue(bool(path) == (not embed_external_files))
            self.assertTrue(bool(bytes_) == embed_external_files)
        finally:
            self._api.delete_repo(
                ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
            )
def load_datasets(lang="es", random_state=2021, preprocessing_args={}): """ Load emotion recognition datasets """ train_df = load_df(paths[lang]["train"]) test_df = load_df(paths[lang]["test"]) train_df, dev_df = train_test_split(train_df, stratify=train_df["label"], random_state=random_state) for df in [train_df, dev_df, test_df]: for label, idx in label2id.items(): df.loc[df["label"] == label, "label"] = idx df["label"] = df["label"].astype(int) preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args) train_df.loc[:, "text"] = train_df["text"].apply(preprocess) dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess) test_df.loc[:, "text"] = test_df["text"].apply(preprocess) features = Features({ 'text': Value('string'), 'label': ClassLabel(num_classes=len(id2label), names=[id2label[k] for k in sorted(id2label.keys())]) }) train_dataset = Dataset.from_pandas(train_df, features=features) dev_dataset = Dataset.from_pandas(dev_df, features=features) test_dataset = Dataset.from_pandas(test_df, features=features) return train_dataset, dev_dataset, test_dataset
def test_csv_dataset_reader(path_type, split, features, keep_in_memory, csv_path, tmp_path):
    if issubclass(path_type, str):
        path = csv_path
    elif issubclass(path_type, list):
        path = [csv_path]
    cache_dir = tmp_path / "cache"
    expected_split = str(split) if split else "train"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = CsvDatasetReader(
            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        ).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == expected_split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_datasetdict_from_json(
    split,
    features,
    keep_in_memory,
    jsonl_path,
    tmp_path,
):
    file_path = jsonl_path
    field = None
    if split:
        path = {split: file_path}
    else:
        split = "train"
        path = {"train": file_path, "test": file_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_json(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, field=field
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path, tmp_path):
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_csv(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory)
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_text_datasetdict_reader(split, features, keep_in_memory, text_path, tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = TextDatasetReader(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_push_dataset_dict_to_hub_custom_features(self):
    features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)

    local_ds = DatasetDict({"test": ds})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["test"].features.keys()), list(hub_ds["test"].features.keys()))
        self.assertDictEqual(local_ds["test"].features, hub_ds["test"].features)
    finally:
        self._api.delete_repo(
            ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
        )
def test_text_dataset_reader(path_type, split, features, keep_in_memory, text_path, tmp_path):
    if issubclass(path_type, str):
        path = text_path
    elif issubclass(path_type, list):
        path = [text_path]
    cache_dir = tmp_path / "cache"
    expected_split = str(split) if split else "train"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = TextDatasetReader(
            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        ).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == expected_split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
def test_push_dataset_to_hub_custom_features_audio(self):
    audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav")
    data = {"x": [audio_path, None], "y": [0, -1]}
    features = Features({"x": Audio(), "y": Value("int32")})
    ds = Dataset.from_dict(data, features=features)

    for embed_external_files in [True, False]:
        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
            self.assertDictEqual(ds.features, hub_ds.features)
            np.testing.assert_equal(ds[0]["x"]["array"], hub_ds[0]["x"]["array"])
            self.assertEqual(
                ds[1], hub_ds[1]
            )  # don't test hub_ds[0] since audio decoding might be slightly different
            hub_ds = hub_ds.cast_column("x", Audio(decode=False))
            elem = hub_ds[0]["x"]
            path, bytes_ = elem["path"], elem["bytes"]
            self.assertTrue(bool(path) == (not embed_external_files))
            self.assertTrue(bool(bytes_) == embed_external_files)
        finally:
            self.cleanup_repo(ds_name)
def load_datasets(preprocess_args={}):
    """
    Return train, dev, test datasets
    """
    train_files = glob(os.path.join(tass_dir, "train/*.tsv"))
    dev_files = glob(os.path.join(tass_dir, "dev/*.tsv"))
    test_files = glob(os.path.join(tass_dir, "test1.1/*.tsv"))

    train_dfs = {get_lang(file): load_df(file) for file in train_files}
    dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
    test_dfs = {get_lang(file): load_df(file, test=True) for file in test_files}

    train_df = pd.concat(train_dfs.values())
    dev_df = pd.concat(dev_dfs.values())
    test_df = pd.concat(test_dfs.values())

    print(len(train_df), len(dev_df), len(test_df))

    # Tokenize tweets
    preprocess_with_args = lambda x: preprocess_tweet(x, **preprocess_args)

    train_df["text"] = train_df["text"].apply(preprocess_with_args)
    dev_df["text"] = dev_df["text"].apply(preprocess_with_args)
    test_df["text"] = test_df["text"].apply(preprocess_with_args)

    features = Features({
        'text': Value('string'),
        'lang': Value('string'),
        'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"]),
    })

    columns = ["text", "lang", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####
    retriever = BM25Arti(tokenize_fn=tokenize, data_path="./data", context_path="wikipedia_documents.json")
    df = retriever.retrieve(datasets['validation'])

    if training_args.do_predict:
        # The test data has no answers, so the dataset is built from id, question, and context only.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None),
        })
        datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})

    return datasets
def test_dataset_with_image_feature_map(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path], "caption": ["cats sleeping"]}
    features = Features({"image": Image(), "caption": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    for item in dset:
        assert item.keys() == {"image", "caption"}
        assert item == {"image": {"path": image_path, "bytes": None}, "caption": "cats sleeping"}

    # no decoding

    def process_caption(example):
        example["caption"] = "Two " + example["caption"]
        return example

    processed_dset = dset.map(process_caption)
    for item in processed_dset:
        assert item.keys() == {"image", "caption"}
        assert item == {"image": {"path": image_path, "bytes": None}, "caption": "Two cats sleeping"}

    # decoding example

    def process_image_by_example(example):
        example["mode"] = example["image"].mode
        return example

    decoded_dset = dset.map(process_image_by_example)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"

    # decoding batch

    def process_image_by_batch(batch):
        batch["mode"] = [image.mode for image in batch["image"]]
        return batch

    decoded_dset = dset.map(process_image_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"
def test_caching(self): n_rows = 10 features = Features({"foo": Value("string"), "bar": Value("string")}) with tempfile.TemporaryDirectory() as tmp_dir: open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write("\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1))) ds = load_dataset("./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train") data_file = ds._data_files[0] fingerprint = ds._fingerprint self.assertEqual(len(ds), n_rows) del ds ds = load_dataset("./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train") self.assertEqual(ds._data_files[0], data_file) self.assertEqual(ds._fingerprint, fingerprint) del ds ds = load_dataset( "./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train", features=features, ) self.assertNotEqual(ds._data_files[0], data_file) self.assertNotEqual(ds._fingerprint, fingerprint) del ds open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write("\n".join(",".join(["Foo", "Bar"]) for _ in range(n_rows + 1))) ds = load_dataset("./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train") self.assertNotEqual(ds._data_files[0], data_file) self.assertNotEqual(ds._fingerprint, fingerprint) del ds
def test_dataset_from_json_features(features, jsonl_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = JsonDatasetReader(jsonl_path, features=features, cache_dir=cache_dir).read()
    _check_json_dataset(dataset, expected_features)
def test_parquet_datasetdict_reader_features(features, parquet_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = ParquetDatasetReader({"train": parquet_path}, features=features, cache_dir=cache_dir).read()
    _check_parquet_datasetdict(dataset, expected_features)
def test_dataset_from_text_features(features, text_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = TextDatasetReader(text_path, features=features, cache_dir=cache_dir).read()
    _check_text_dataset(dataset, expected_features)
def test_dataset_from_csv_features(features, csv_path, tmp_path):
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = CsvDatasetReader(csv_path, features=features, cache_dir=cache_dir).read()
    _check_csv_dataset(dataset, expected_features)
def test_datasetdict_from_text_features(features, text_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = TextDatasetReader({"train": text_path}, features=features, cache_dir=cache_dir).read()
    _check_text_datasetdict(dataset, expected_features)
def load_dataset(self) -> None: logger.debug('loading rag dataset: %s', self.name) self.dataset = load_dataset('csv', data_files=[self.csv_path], split='train', delimiter=',', column_names=['title', 'text']) self.dataset = self.dataset.map( split_documents, batched=False, num_proc=6, batch_size=100, ) ctx_encoder = DPRContextEncoder.from_pretrained( self.context_encoder).to(device=self.device) ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained( self.context_encoder) new_features = Features({ 'text': Value('string'), 'title': Value('string'), 'embeddings': Sequence(Value('float32')) }) # optional, save as float32 instead of float64 to save space self.dataset = self.dataset.map( partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer, device=self.device), batched=True, batch_size=16, features=new_features, ) self.dataset.save_to_disk(self.dataset_path) index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT) self.dataset.add_faiss_index('embeddings', custom_index=index) self.dataset.get_index('embeddings').save(self.faiss_path)