def test_features(self):
        n_rows = 10
        n_cols = 3

        def get_features(type):
            return Features({str(i): type for i in range(n_cols)})

        with tempfile.TemporaryDirectory() as tmp_dir:
            open(os.path.join(tmp_dir, "table.csv"), "w",
                 encoding="utf-8").write("\n".join(
                     ",".join([str(i) for i in range(n_cols)])
                     for _ in range(n_rows + 1)))
            for type in [
                    Value("float64"),
                    Value("int8"),
                    ClassLabel(num_classes=n_cols)
            ]:
                features = get_features(type)
                ds = load_dataset(
                    "csv",
                    data_files=os.path.join(tmp_dir, "table.csv"),
                    cache_dir=tmp_dir,
                    split="train",
                    features=features,
                )
                self.assertEqual(len(ds), n_rows)
                self.assertDictEqual(ds.features, features)
                del ds
def build_dataset(df, tokenizer, batch_size):
    features = Features({
        'id': Value('uint64'),
        'context': Value('string'),
        'text': Value('string'),
    })

    dataset = Dataset.from_pandas(df, features=features)

    dataset = dataset.map(
        lambda x: tokenizer(x["text"],
                            x["context"],
                            padding="longest",
                            truncation='longest_first'),
        batched=True,
        batch_size=batch_size,
    )

    def format_dataset(dataset):
        dataset.set_format(
            type='torch',
            columns=['input_ids', 'token_type_ids', 'attention_mask'])
        return dataset

    dataset = format_dataset(dataset)

    return dataset
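
A minimal sketch of how build_dataset might be called, assuming a transformers tokenizer that emits input_ids, token_type_ids and attention_mask (the checkpoint name and the toy DataFrame below are only illustrations):

import pandas as pd
from transformers import AutoTokenizer

# Hypothetical checkpoint; any BERT-like tokenizer works here, since the
# formatted columns are input_ids, token_type_ids and attention_mask.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

df = pd.DataFrame({
    "id": [0, 1],  # must be non-negative, since 'id' is declared as uint64
    "context": ["some context", "more context"],
    "text": ["a question", "another question"],
})

dataset = build_dataset(df, tokenizer, batch_size=2)
print(dataset[0]["input_ids"])  # a torch tensor, thanks to set_format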
Example 3
 def test_flatten(self):
     dset_split = Dataset.from_dict(
         {
             "a": [{
                 "b": {
                     "c": ["text"]
                 }
             }] * 10,
             "foo": [1] * 10
         },
         features=Features({
             "a": {
                 "b": Sequence({"c": Value("string")})
             },
             "foo": Value("int64")
         }),
     )
     dset = DatasetDict({"train": dset_split, "test": dset_split})
     dset = dset.flatten()
     self.assertDictEqual(dset.column_names, {
         "train": ["a.b.c", "foo"],
         "test": ["a.b.c", "foo"]
     })
     self.assertListEqual(list(dset["train"].features.keys()),
                          ["a.b.c", "foo"])
     self.assertDictEqual(
         dset["train"].features,
         Features({
             "a.b.c": Sequence(Value("string")),
             "foo": Value("int64")
         }))
     del dset
Example 4
def test_cast_array_to_features():
    arr = pa.array([[0, 1]])
    assert cast_array_to_feature(arr, Sequence(
        Value("string"))).type == pa.list_(pa.string())
    with pytest.raises(TypeError):
        cast_array_to_feature(arr,
                              Sequence(Value("string")),
                              allow_number_to_str=False)
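
For comparison, a purely numeric cast does not need the number-to-string conversion, so it succeeds even with allow_number_to_str=False; a small sketch (the import path for cast_array_to_feature is an assumption):

import pyarrow as pa
from datasets import Sequence, Value
from datasets.table import cast_array_to_feature

arr = pa.array([[0, 1]])  # list<int64>
casted = cast_array_to_feature(arr, Sequence(Value("int8")), allow_number_to_str=False)
assert casted.type == pa.list_(pa.int8())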
    def test_caching(self):
        n_rows = 10

        features = Features({"foo": Value("string"), "bar": Value("string")})

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Use \n for newline. Windows automatically adds the \r when writing the file
            # see https://docs.python.org/3/library/os.html#os.linesep
            open(os.path.join(tmp_dir, "table.csv"), "w",
                 encoding="utf-8").write("\n".join(",".join(["foo", "bar"])
                                                   for _ in range(n_rows + 1)))
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            data_file = ds.cache_files[0]["filename"]
            fingerprint = ds._fingerprint
            self.assertEqual(len(ds), n_rows)
            del ds
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            self.assertEqual(ds.cache_files[0]["filename"], data_file)
            self.assertEqual(ds._fingerprint, fingerprint)
            del ds
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                features=features,
                keep_in_memory=False,
            )
            self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
            self.assertNotEqual(ds._fingerprint, fingerprint)
            del ds

            open(os.path.join(tmp_dir, "table.csv"), "w",
                 encoding="utf-8").write("\n".join(",".join(["Foo", "Bar"])
                                                   for _ in range(n_rows + 1)))
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
            self.assertNotEqual(ds._fingerprint, fingerprint)
            self.assertEqual(len(ds), n_rows)
            del ds
 def test_cast_(self):
     dset = self._create_dummy_dataset_dict(multiple_columns=True)
     features = dset["train"].features
     features["col_1"] = Value("float64")
     dset.cast_(features)
     for dset_split in dset.values():
         self.assertEqual(dset_split.num_columns, 2)
         self.assertEqual(dset_split.features["col_1"], Value("float64"))
         self.assertIsInstance(dset_split[0]["col_1"], float)
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):

    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage
    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
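
split_documents is referenced but not shown here; a minimal sketch of what it might look like, following the comment that documents are split into passages of roughly 100 words (function and argument names are assumptions):

from typing import List

def split_text(text: str, n: int = 100, character: str = " ") -> List[str]:
    # Split the text every n-th occurrence of character, i.e. into ~100-word chunks.
    words = text.split(character)
    return [character.join(words[i:i + n]).strip() for i in range(0, len(words), n)]

def split_documents(documents: dict) -> dict:
    # Batched map function: one input document may yield several passages.
    titles, texts = [], []
    for title, text in zip(documents["title"], documents["text"]):
        if text is not None:
            for passage in split_text(text):
                titles.append(title if title is not None else "")
                texts.append(passage)
    return {"title": titles, "text": texts}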
def make_negative_dataset(args,
                          bm25,
                          queries,
                          answers,
                          contexts,
                          name,
                          num=16):
    total = []
    scores, indices = bm25.get_relevant_doc_bulk(queries, topk=num * 2)

    answers, indices = np.array(answers, dtype="object"), np.array(indices)
    contexts = np.array(contexts, dtype="object")

    for idx, query in enumerate(queries):
        label = idx % num

        answer = answers[idx]
        context_list = contexts[indices[idx]]

        check_in = np.argwhere(context_list == answer)

        if check_in.shape[0] == 0:
            context_list[label] = answer
            context_list = context_list[:num]
        else:
            context_list[check_in[0][0]] = context_list[num]
            context_list[label] = answer
            context_list = context_list[:num]

        if idx % 100 == 0:
            print("query: ", query)
            print("answer: ", answer)
            print("negative:", context_list)
            print("label:", label)

        tmp = {
            "query": query,
            "negative_samples": context_list,
            "label": label
        }

        total.append(tmp)

    df = pd.DataFrame(total)

    f = Features({
        "query": Value(dtype="string", id=None),
        "negative_samples": Sequence(feature=Value(dtype="string", id=None), length=-1, id=None),
        "label": Value(dtype="int32", id=None),
    })

    dataset = Dataset.from_pandas(df, features=f)
    dataset.save_to_disk(os.path.join(args.path.train_data_dir, name))
Example 9
    def test_shuffle(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            dsets = self._create_dummy_dataset_dict()

            indices_cache_file_names = {
                "train": os.path.join(tmp_dir, "train.arrow"),
                "test": os.path.join(tmp_dir, "test.arrow"),
            }
            seeds = {
                "train": 1234,
                "test": 1234,
            }
            dsets_shuffled = dsets.shuffle(
                seeds=seeds,
                indices_cache_file_names=indices_cache_file_names,
                load_from_cache_file=False)
            self.assertListEqual(dsets_shuffled["train"]["filename"],
                                 dsets_shuffled["test"]["filename"])

            self.assertEqual(len(dsets_shuffled["train"]), 30)
            self.assertEqual(dsets_shuffled["train"][0]["filename"],
                             "my_name-train_028")
            self.assertEqual(dsets_shuffled["train"][2]["filename"],
                             "my_name-train_010")
            self.assertDictEqual(dsets["train"].features,
                                 Features({"filename": Value("string")}))
            self.assertDictEqual(dsets_shuffled["train"].features,
                                 Features({"filename": Value("string")}))

            # Reproducibility
            indices_cache_file_names_2 = {
                "train": os.path.join(tmp_dir, "train_2.arrow"),
                "test": os.path.join(tmp_dir, "test_2.arrow"),
            }
            dsets_shuffled_2 = dsets.shuffle(
                seeds=seeds,
                indices_cache_file_names=indices_cache_file_names_2,
                load_from_cache_file=False)
            self.assertListEqual(dsets_shuffled["train"]["filename"],
                                 dsets_shuffled_2["train"]["filename"])

            seeds = {
                "train": 1234,
                "test": 1,
            }
            indices_cache_file_names_3 = {
                "train": os.path.join(tmp_dir, "train_3.arrow"),
                "test": os.path.join(tmp_dir, "test_3.arrow"),
            }
            dsets_shuffled_3 = dsets.shuffle(
                seeds=seeds,
                indices_cache_file_names=indices_cache_file_names_3,
                load_from_cache_file=False)
            self.assertNotEqual(dsets_shuffled_3["train"]["filename"],
                                dsets_shuffled_3["test"]["filename"])
            del dsets, dsets_shuffled, dsets_shuffled_2, dsets_shuffled_3
Example 10
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv",
                               data_files=data,
                               delimiter="\t",
                               column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents,
                split_text_n=args.split_text_n,
                split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(
        device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        ctx_encoder_name)

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed,
                ctx_encoder=ctx_encoder,
                ctx_tokenizer=ctx_tokenizer,
                device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )
    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir,
                                                "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m,
                                faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
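
The returned dataset can then be queried with an encoded question; a sketch, assuming a DPR question encoder from the same checkpoint family as the context encoder and that args.faiss_d matches the 768-dimensional DPR embeddings:

import torch
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizerFast

q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

with torch.no_grad():
    question_embedding = q_encoder(
        **q_tokenizer("What is a knowledge dataset?", return_tensors="pt")
    ).pooler_output[0].numpy()

# k nearest passages according to the HNSW index built above
scores, retrieved = dataset.get_nearest_examples("embeddings", question_embedding, k=5)
print(retrieved["title"])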
Example 11
def embed_update(ctx_encoder, total_processes, device, process_num, shard_dir,
                 csv_path):

    kb_dataset = load_dataset("csv",
                              data_files=[csv_path],
                              split="train",
                              delimiter="\t",
                              column_names=["title", "text"])
    kb_dataset = kb_dataset.map(
        split_documents, batched=True,
        num_proc=1)  # if you prefer, you can load an already split CSV instead.
    kb_list = [
        kb_dataset.shard(total_processes, i, contiguous=True)
        for i in range(total_processes)
    ]
    data_shard = kb_list[process_num]

    arrow_folder = "data_" + str(process_num)
    passages_path = os.path.join(shard_dir, arrow_folder)

    context_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        "facebook/dpr-ctx_encoder-multiset-base")
    ctx_encoder = ctx_encoder.to(device=device)

    def embed(documents: dict, ctx_encoder: DPRContextEncoder,
              ctx_tokenizer: DPRContextEncoderTokenizerFast, device) -> dict:
        """Compute the DPR embeddings of document passages"""
        input_ids = ctx_tokenizer(documents["title"],
                                  documents["text"],
                                  truncation=True,
                                  padding="longest",
                                  return_tensors="pt")["input_ids"]
        embeddings = ctx_encoder(input_ids.to(device=device),
                                 return_dict=True).pooler_output
        return {"embeddings": embeddings.detach().cpu().numpy()}

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space

    dataset = data_shard.map(
        partial(embed,
                ctx_encoder=ctx_encoder,
                ctx_tokenizer=context_tokenizer,
                device=device),
        batched=True,
        batch_size=16,
        features=new_features,
    )
    dataset.save_to_disk(passages_path)
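
Each process writes its shard to shard_dir/data_<process_num>; once all processes have finished, the shards could be reassembled along these lines (a sketch reusing the shard_dir and total_processes values from above):

import os
from datasets import concatenate_datasets, load_from_disk

shards = [
    load_from_disk(os.path.join(shard_dir, "data_" + str(i)))
    for i in range(total_processes)
]
kb_with_embeddings = concatenate_datasets(shards)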
Example 12
    def test_push_dataset_to_hub_custom_features_image(self):
        image_path = os.path.join(os.path.dirname(__file__), "features",
                                  "data", "test_image_rgb.jpg")
        data = {"x": [image_path, None], "y": [0, -1]}
        features = Features({"x": Image(), "y": Value("int32")})
        ds = Dataset.from_dict(data, features=features)

        for embed_external_files in [True, False]:
            ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
            try:
                ds.push_to_hub(ds_name,
                               embed_external_files=embed_external_files,
                               token=self._token)
                hub_ds = load_dataset(ds_name,
                                      split="train",
                                      download_mode="force_redownload")

                self.assertListEqual(ds.column_names, hub_ds.column_names)
                self.assertListEqual(list(ds.features.keys()),
                                     list(hub_ds.features.keys()))
                self.assertDictEqual(ds.features, hub_ds.features)
                self.assertEqual(ds[:], hub_ds[:])
                hub_ds = hub_ds.cast_column("x", Image(decode=False))
                elem = hub_ds[0]["x"]
                path, bytes_ = elem["path"], elem["bytes"]
                self.assertTrue(bool(path) == (not embed_external_files))
                self.assertTrue(bool(bytes_) == embed_external_files)
            finally:
                self._api.delete_repo(ds_name.split("/")[1],
                                      organization=ds_name.split("/")[0],
                                      token=self._token,
                                      repo_type="dataset")
Example 13
def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """

    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(train_df,
                                        stratify=train_df["label"],
                                        random_state=random_state)

    for df in [train_df, dev_df, test_df]:
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=len(id2label),
                            names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
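
The manual label2id loop is needed because a ClassLabel column is stored as integer ids under the hood; the same mapping can also be done with ClassLabel itself, as in this small sketch (the label names are only an illustration):

from datasets import ClassLabel

labels = ClassLabel(names=["others", "joy", "sadness", "anger"])  # hypothetical label set
assert labels.str2int("joy") == 1
assert labels.int2str(1) == "joy"

# e.g. train_df["label"] = train_df["label"].apply(labels.str2int)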
Example 14
def test_csv_dataset_reader(path_type, split, features, keep_in_memory,
                            csv_path, tmp_path):
    if issubclass(path_type, str):
        path = csv_path
    elif issubclass(path_type, list):
        path = [csv_path]
    cache_dir = tmp_path / "cache"

    expected_split = str(split) if split else "train"

    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64"
    }
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = CsvDatasetReader(path,
                                   split=split,
                                   features=features,
                                   cache_dir=cache_dir,
                                   keep_in_memory=keep_in_memory).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == expected_split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example 15
def test_datasetdict_from_json(
    split,
    features,
    keep_in_memory,
    jsonl_path,
    tmp_path,
):
    file_path = jsonl_path
    field = None
    if split:
        path = {split: file_path}
    else:
        split = "train"
        path = {"train": file_path, "test": file_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_json(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, field=field
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example 16
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path,
                              tmp_path):
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64"
    }
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_csv(path,
                                       features=features,
                                       cache_dir=cache_dir,
                                       keep_in_memory=keep_in_memory)
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example 17
def test_text_datasetdict_reader(split, features, keep_in_memory, text_path,
                                 tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"

    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = TextDatasetReader(path,
                                    features=features,
                                    cache_dir=cache_dir,
                                    keep_in_memory=keep_in_memory).read()
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example 18
    def test_push_dataset_dict_to_hub_custom_features(self):
        features = Features({
            "x": Value("int64"),
            "y": ClassLabel(names=["neg", "pos"])
        })
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)

        local_ds = DatasetDict({"test": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["test"].features.keys()),
                                 list(hub_ds["test"].features.keys()))
            self.assertDictEqual(local_ds["test"].features,
                                 hub_ds["test"].features)
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  token=self._token,
                                  repo_type="dataset")
Example 19
def test_text_dataset_reader(path_type, split, features, keep_in_memory,
                             text_path, tmp_path):
    if issubclass(path_type, str):
        path = text_path
    elif issubclass(path_type, list):
        path = [text_path]
    cache_dir = tmp_path / "cache"

    expected_split = str(split) if split else "train"

    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = TextDatasetReader(path,
                                    split=split,
                                    features=features,
                                    cache_dir=cache_dir,
                                    keep_in_memory=keep_in_memory).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == expected_split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example 20
    def test_push_dataset_to_hub_custom_features_audio(self):
        audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav")
        data = {"x": [audio_path, None], "y": [0, -1]}
        features = Features({"x": Audio(), "y": Value("int32")})
        ds = Dataset.from_dict(data, features=features)

        for embed_external_files in [True, False]:
            ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
            try:
                ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
                hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

                self.assertListEqual(ds.column_names, hub_ds.column_names)
                self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
                self.assertDictEqual(ds.features, hub_ds.features)
                np.testing.assert_equal(ds[0]["x"]["array"], hub_ds[0]["x"]["array"])
                self.assertEqual(
                    ds[1], hub_ds[1]
                )  # don't test hub_ds[0] since audio decoding might be slightly different
                hub_ds = hub_ds.cast_column("x", Audio(decode=False))
                elem = hub_ds[0]["x"]
                path, bytes_ = elem["path"], elem["bytes"]
                self.assertTrue(bool(path) == (not embed_external_files))
                self.assertTrue(bool(bytes_) == embed_external_files)
            finally:
                self.cleanup_repo(ds_name)
Example 21
def load_datasets(preprocess_args={}):
    """
    Return train, dev, test datasets
    """
    train_files = glob(os.path.join(tass_dir, "train/*.tsv"))
    dev_files = glob(os.path.join(tass_dir, "dev/*.tsv"))
    test_files = glob(os.path.join(tass_dir, "test1.1/*.tsv"))

    train_dfs = {get_lang(file): load_df(file) for file in train_files}
    dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
    test_dfs = {
        get_lang(file): load_df(file, test=True)
        for file in test_files
    }

    train_df = pd.concat(train_dfs.values())
    dev_df = pd.concat(dev_dfs.values())
    test_df = pd.concat(test_dfs.values())

    print(len(train_df), len(dev_df), len(test_df))
    """
    Preprocess tweets
    """

    preprocess_with_args = lambda x: preprocess_tweet(x, **preprocess_args)

    train_df["text"] = train_df["text"].apply(preprocess_with_args)
    dev_df["text"] = dev_df["text"].apply(preprocess_with_args)
    test_df["text"] = test_df["text"].apply(preprocess_with_args)

    features = Features({
        'text': Value('string'),
        'lang': Value('string'),
        'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
    })

    columns = ["text", "lang", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
Example 22
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####

    retriever = BM25Arti(tokenize_fn=tokenize,
                         data_path="./data",
                         context_path="wikipedia_documents.json")
    df = retriever.retrieve(datasets['validation'])

    if training_args.do_predict:  # the test data has no gold answers, so the dataset only contains id, question and context
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
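
Note that f is only defined when training_args.do_predict is true. For an evaluation split that still carries gold answers, a SQuAD-style schema with an extra answers column would be needed; a sketch (the name f_eval and the exact fields are assumptions):

from datasets import Features, Sequence, Value

f_eval = Features({
    'answers': Sequence(feature={
        'text': Value(dtype='string', id=None),
        'answer_start': Value(dtype='int32', id=None),
    }, length=-1, id=None),
    'context': Value(dtype='string', id=None),
    'id': Value(dtype='string', id=None),
    'question': Value(dtype='string', id=None),
})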
Example 23
def test_dataset_with_image_feature_map(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path], "caption": ["cats sleeping"]}
    features = Features({"image": Image(), "caption": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    for item in dset:
        assert item.keys() == {"image", "caption"}
        assert item == {
            "image": {
                "path": image_path,
                "bytes": None
            },
            "caption": "cats sleeping"
        }

    # no decoding

    def process_caption(example):
        example["caption"] = "Two " + example["caption"]
        return example

    processed_dset = dset.map(process_caption)
    for item in processed_dset:
        assert item.keys() == {"image", "caption"}
        assert item == {
            "image": {
                "path": image_path,
                "bytes": None
            },
            "caption": "Two cats sleeping"
        }

    # decoding example

    def process_image_by_example(example):
        example["mode"] = example["image"].mode
        return example

    decoded_dset = dset.map(process_image_by_example)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"

    # decoding batch

    def process_image_by_batch(batch):
        batch["mode"] = [image.mode for image in batch["image"]]
        return batch

    decoded_dset = dset.map(process_image_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"
Example 24
    def test_caching(self):
        n_rows = 10

        features = Features({"foo": Value("string"), "bar": Value("string")})

        with tempfile.TemporaryDirectory() as tmp_dir:
            open(os.path.join(tmp_dir, "table.csv"), "w",
                 encoding="utf-8").write("\n".join(",".join(["foo", "bar"])
                                                   for _ in range(n_rows + 1)))
            ds = load_dataset("./datasets/csv",
                              data_files=os.path.join(tmp_dir, "table.csv"),
                              cache_dir=tmp_dir,
                              split="train")
            data_file = ds._data_files[0]
            fingerprint = ds._fingerprint
            self.assertEqual(len(ds), n_rows)
            del ds
            ds = load_dataset("./datasets/csv",
                              data_files=os.path.join(tmp_dir, "table.csv"),
                              cache_dir=tmp_dir,
                              split="train")
            self.assertEqual(ds._data_files[0], data_file)
            self.assertEqual(ds._fingerprint, fingerprint)
            del ds
            ds = load_dataset(
                "./datasets/csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                features=features,
            )
            self.assertNotEqual(ds._data_files[0], data_file)
            self.assertNotEqual(ds._fingerprint, fingerprint)
            del ds

            open(os.path.join(tmp_dir, "table.csv"), "w",
                 encoding="utf-8").write("\n".join(",".join(["Foo", "Bar"])
                                                   for _ in range(n_rows + 1)))
            ds = load_dataset("./datasets/csv",
                              data_files=os.path.join(tmp_dir, "table.csv"),
                              cache_dir=tmp_dir,
                              split="train")
            self.assertNotEqual(ds._data_files[0], data_file)
            self.assertNotEqual(ds._fingerprint, fingerprint)
            del ds
Example 25
def test_dataset_from_json_features(features, jsonl_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = JsonDatasetReader(jsonl_path, features=features, cache_dir=cache_dir).read()
    _check_json_dataset(dataset, expected_features)
def test_parquet_datasetdict_reader_features(features, parquet_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = ParquetDatasetReader({"train": parquet_path}, features=features, cache_dir=cache_dir).read()
    _check_parquet_datasetdict(dataset, expected_features)
def test_dataset_from_text_features(features, text_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = TextDatasetReader(text_path, features=features, cache_dir=cache_dir).read()
    _check_text_dataset(dataset, expected_features)
Example 28
def test_dataset_from_csv_features(features, csv_path, tmp_path):
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = CsvDatasetReader(csv_path, features=features, cache_dir=cache_dir).read()
    _check_csv_dataset(dataset, expected_features)
def test_datasetdict_from_text_features(features, text_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = TextDatasetReader({"train": text_path}, features=features, cache_dir=cache_dir).read()
    _check_text_datasetdict(dataset, expected_features)
Example 30
    def load_dataset(self) -> None:
        logger.debug('loading rag dataset: %s', self.name)

        self.dataset = load_dataset('csv',
                                    data_files=[self.csv_path],
                                    split='train',
                                    delimiter=',',
                                    column_names=['title', 'text'])

        self.dataset = self.dataset.map(
            split_documents,
            batched=False,
            num_proc=6,
            batch_size=100,
        )

        ctx_encoder = DPRContextEncoder.from_pretrained(
            self.context_encoder).to(device=self.device)
        ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
            self.context_encoder)
        new_features = Features({
            'text': Value('string'),
            'title': Value('string'),
            'embeddings': Sequence(Value('float32'))
        })  # optional, save as float32 instead of float64 to save space

        self.dataset = self.dataset.map(
            partial(embed,
                    ctx_encoder=ctx_encoder,
                    ctx_tokenizer=ctx_tokenizer,
                    device=self.device),
            batched=True,
            batch_size=16,
            features=new_features,
        )

        self.dataset.save_to_disk(self.dataset_path)

        index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
        self.dataset.add_faiss_index('embeddings', custom_index=index)

        self.dataset.get_index('embeddings').save(self.faiss_path)
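
The saved Arrow dataset and FAISS index can be reloaded later without re-embedding; a sketch, with dataset_path and faiss_path standing for the self.dataset_path and self.faiss_path used above:

from datasets import load_from_disk

reloaded = load_from_disk(dataset_path)              # path passed to save_to_disk above
reloaded.load_faiss_index('embeddings', faiss_path)  # path passed to .save() above
# scores, examples = reloaded.get_nearest_examples('embeddings', query_embedding, k=5)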