Example No. 1
 def test_flatten(self):
     dset_split = Dataset.from_dict(
         {
             "a": [{
                 "b": {
                     "c": ["text"]
                 }
             }] * 10,
             "foo": [1] * 10
         },
         features=Features({
             "a": {
                 "b": Sequence({"c": Value("string")})
             },
             "foo": Value("int64")
         }),
     )
     dset = DatasetDict({"train": dset_split, "test": dset_split})
     dset = dset.flatten()
     self.assertDictEqual(dset.column_names, {
         "train": ["a.b.c", "foo"],
         "test": ["a.b.c", "foo"]
     })
     self.assertListEqual(list(dset["train"].features.keys()),
                          ["a.b.c", "foo"])
     self.assertDictEqual(
         dset["train"].features,
         Features({
             "a.b.c": Sequence(Value("string")),
             "foo": Value("int64")
         }))
     del dset
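
For reference, a minimal standalone sketch of the same flatten() behavior (assuming only that the datasets library is installed; the expected output follows from the assertions above):

from datasets import Dataset, Features, Sequence, Value

ds = Dataset.from_dict(
    {"a": [{"b": {"c": ["text"]}}], "foo": [1]},
    features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
)
# flatten() replaces nested columns with dotted top-level columns
print(ds.flatten().column_names)  # expected: ['a.b.c', 'foo']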
Example No. 2
def test_cast_array_to_features():
    arr = pa.array([[0, 1]])
    assert cast_array_to_feature(arr, Sequence(
        Value("string"))).type == pa.list_(pa.string())
    with pytest.raises(TypeError):
        cast_array_to_feature(arr,
                              Sequence(Value("string")),
                              allow_number_to_str=False)
Example No. 3
def test_dataset_with_image_feature_with_none():
    data = {"image": [None]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert item["image"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(item is None for item in batch["image"])
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests

    data = {"images": [[None]]}
    features = Features({"images": Sequence(Image())})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"images"}
    assert all(i is None for i in item["images"])

    data = {"nested": [{"image": None}]}
    features = Features({"nested": {"image": Image()}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"image"}
    assert item["nested"]["image"] is None
Example No. 4
def test_image_feature_type_to_arrow():
    features = Features({"image": Image()})
    assert features.arrow_schema == pa.schema({"image": Image().pa_type})
    features = Features({"struct_containing_an_image": {"image": Image()}})
    assert features.arrow_schema == pa.schema({"struct_containing_an_image": pa.struct({"image": Image().pa_type})})
    features = Features({"sequence_of_images": Sequence(Image())})
    assert features.arrow_schema == pa.schema({"sequence_of_images": pa.list_(Image().pa_type)})
Example No. 5
def save_data(train_df, val_df):
    train_f = Features({
        'answers':
        Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        },
                 length=-1,
                 id=None),
        'context':
        Value(dtype='string', id=None),
        'id':
        Value(dtype='string', id=None),
        'question':
        Value(dtype='string', id=None),
        'question_type':
        Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train':
        Dataset.from_pandas(train_df, features=train_f),
        'validation':
        Dataset.from_pandas(val_df, features=train_f)
    })
    file = open("../../data/question_type.pkl", "wb")
    pickle.dump(train_datasets, file)
    file.close()
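
A minimal sketch of reading the pickled DatasetDict back (the path is the one hard-coded above; save_to_disk/load_from_disk would be the more idiomatic datasets route, but pickling also works here):

import pickle

with open("../../data/question_type.pkl", "rb") as f:
    train_datasets = pickle.load(f)
print(train_datasets["train"].features)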
Example No. 6
def get_etr_dataset(args):
    etr_path = p.join(args.path.train_data_dir, "etr_qa_dataset.json")

    if not p.exists(etr_path):
        raise FileNotFoundError(
            f"ETRI 데이터 셋 {etr_path}로 파일명 바꿔서 데이터 넣어주시길 바랍니다.")

    with open(etr_path, "r") as f:
        etr_dict = json.load(f)

    #  print(etr_dict["data"][0])
    new_dataset = defaultdict(list)

    cnt = 0

    for datas in etr_dict["data"]:
        title = datas["title"]
        context = datas["paragraphs"][0]["context"]

        for questions in datas["paragraphs"][0]["qas"]:
            question = questions["question"]
            answers = {
                "answer_start": [questions["answers"][0]["answer_start"]],
                "text": [questions["answers"][0]["text"]],
            }

            new_dataset["id"].append(f"etr-custom-{cnt}")
            new_dataset["title"].append(title)
            new_dataset["context"].append(context)
            new_dataset["question"].append(question)
            new_dataset["answers"].append(answers)

            cnt += 1

    f = Features({
        "answers":
        Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None)
            },
            length=-1,
            id=None,
        ),
        "id":
        Value(dtype="string", id=None),
        "context":
        Value(dtype="string", id=None),
        "question":
        Value(dtype="string", id=None),
        "title":
        Value(dtype="string", id=None),
    })

    df = pd.DataFrame(new_dataset)
    etr_dataset = Dataset.from_pandas(df, features=f)

    return etr_dataset
Example No. 7
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):

    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage
    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
Example No. 8
def make_negative_dataset(args,
                          bm25,
                          queries,
                          answers,
                          contexts,
                          name,
                          num=16):
    total = []
    scores, indices = bm25.get_relevant_doc_bulk(queries, topk=num * 2)

    answers, indices = np.array(answers, dtype="object"), np.array(indices)
    contexts = np.array(contexts, dtype="object")

    for idx, query in enumerate(queries):
        label = idx % num

        answer = answers[idx]
        context_list = contexts[indices[idx]]

        check_in = np.argwhere(context_list == answer)

        if check_in.shape[0] == 0:
            context_list[label] = answer
            context_list = context_list[:num]
        else:
            context_list[check_in[0][0]] = context_list[num]
            context_list[label] = answer
            context_list = context_list[:num]

        if idx % 100 == 0:
            print("query: ", query)
            print("answer: ", answer)
            print("negative:", context_list)
            print("label:", label)

        tmp = {
            "query": query,
            "negative_samples": context_list,
            "label": label
        }

        total.append(tmp)

    df = pd.DataFrame(total)

    f = Features({
        "query":
        Value(dtype="string", id=None),
        "negative_samples":
        Sequence(feature=Value(dtype="string", id=None), length=-1, id=None),
        "label":
        Value(dtype="int32", id=None),
    })

    dataset = Dataset.from_pandas(df, features=f)
    dataset.save_to_disk(os.path.join(args.path.train_data_dir, name))
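
The saved dataset can later be reloaded with load_from_disk; a minimal sketch (the placeholder path stands for the same directory passed to save_to_disk above):

from datasets import load_from_disk

negative_dataset = load_from_disk("/path/to/train_data_dir/<name>")  # placeholder path
print(negative_dataset.features)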
Example No. 9
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv",
                               data_files=data,
                               delimiter="\t",
                               column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents,
                split_text_n=args.split_text_n,
                split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(
        device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        ctx_encoder_name)

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed,
                ctx_encoder=ctx_encoder,
                ctx_tokenizer=ctx_tokenizer,
                device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )
    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir,
                                                "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m,
                                faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
Example No. 10
def embed_update(ctx_encoder, total_processes, device, process_num, shard_dir,
                 csv_path):

    kb_dataset = load_dataset("csv",
                              data_files=[csv_path],
                              split="train",
                              delimiter="\t",
                              column_names=["title", "text"])
    kb_dataset = kb_dataset.map(
        split_documents, batched=True,
        num_proc=1)  # if you want, you can load an already split csv.
    kb_list = [
        kb_dataset.shard(total_processes, i, contiguous=True)
        for i in range(total_processes)
    ]
    data_shard = kb_list[process_num]

    arrow_folder = "data_" + str(process_num)
    passages_path = os.path.join(shard_dir, arrow_folder)

    context_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        "facebook/dpr-ctx_encoder-multiset-base")
    ctx_encoder = ctx_encoder.to(device=device)

    def embed(documents: dict, ctx_encoder: DPRContextEncoder,
              ctx_tokenizer: DPRContextEncoderTokenizerFast, device) -> dict:
        """Compute the DPR embeddings of document passages"""
        input_ids = ctx_tokenizer(documents["title"],
                                  documents["text"],
                                  truncation=True,
                                  padding="longest",
                                  return_tensors="pt")["input_ids"]
        embeddings = ctx_encoder(input_ids.to(device=device),
                                 return_dict=True).pooler_output
        return {"embeddings": embeddings.detach().cpu().numpy()}

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space

    dataset = data_shard.map(
        partial(embed,
                ctx_encoder=ctx_encoder,
                ctx_tokenizer=context_tokenizer,
                device=device),
        batched=True,
        batch_size=16,
        features=new_features,
    )
    dataset.save_to_disk(passages_path)
Example No. 11
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####

    retriever = SparseRetrieval(tokenize_fn=tokenize,
                                data_path="./data",
                                context_path="wikipedia_documents.json"
                                # context_path="all_wikipedia_documents.json"
                                )
    # sparse embedding retrieval
    # retriever.get_sparse_embedding()
    #df = retriever.retrieve(datasets['validation'])

    # bm25 retrieval
    # retriever.get_embedding_BM25()
    # df = retriever.retrieve_BM25(query_or_dataset=datasets['validation'], topk=10)

    # elastic search retrieval
    # retriever.get_elastic_search()
    df = retriever.retrieve_ES(query_or_dataset=datasets['validation'],
                               topk=10)

    # faiss retrieval
    # df = retriever.retrieve_faiss(dataset['validation'])

    if training_args.do_predict:  # The test data has no answers, so the dataset consists only of id, question, and context.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    elif training_args.do_eval:  # The train data has answers, so the dataset consists of id, question, context, and answers.
        f = Features({
            'answers':
            Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            },
                     length=-1,
                     id=None),
            'context':
            Value(dtype='string', id=None),
            'id':
            Value(dtype='string', id=None),
            'question':
            Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
Example No. 12
def run_sparse_retrieval(datasets, training_args, inf_args):
    #### retrieval process ####
    if inf_args.retrieval is None:
        retriever = SparseRetrieval_BM25PLUS(
            tokenize_fn=tokenize,
            data_path="./data",
            context_path="wikipedia_documents.json")
    elif inf_args.retrieval.lower() == "sparse":
        retriever = SparseRetrieval(tokenize_fn=tokenize,
                                    data_path="./data",
                                    context_path="wikipedia_documents.json")
    # elif inf_args.retrieval.lower() == "bm25" or inf_args.retrieval.lower() == "bm25":
    #     retriever = SparseRetrieval_BM25(tokenize_fn=tokenize,
    #                                          data_path="./data",
    #                                          context_path="wikipedia_documents.json")

    retriever.get_sparse_embedding()
    df = retriever.retrieve(datasets['validation'], inf_args.k)

    # faiss retrieval

    # The test data has no answers, so the dataset consists only of id, question, and context.
    if training_args.do_predict:
        f = Features({
            'contexts': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    # The train data has answers, so the dataset consists of id, question, context, and answers.
    elif training_args.do_eval:
        f = Features({
            'answers':
            Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            },
                     length=-1,
                     id=None),
            'context':
            Value(dtype='string', id=None),
            'id':
            Value(dtype='string', id=None),
            'question':
            Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
Example No. 13
def read_dataset_from_csv(csv_path):
    """
    read the prepared csv data as Dataset object
    """
    df = pd.read_csv(csv_path,
                     converters={
                         'token': str,
                         'written': str,
                         'spoken': str
                     })
    feature_tag = Sequence(
        ClassLabel(num_classes=3, names=list(pd.factorize(df['tag'])[1])))
    df['tag'] = df['tag'].apply(feature_tag.feature.str2int)
    df_text = df.groupby(['sentence_id']).agg({'token': list, 'tag': list})
    dataset = Dataset.from_pandas(df_text)
    dataset.features["tag"] = feature_tag
    return dataset
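
Assigning to dataset.features["tag"] above only patches the in-memory feature metadata. A hedged alternative for the tail of read_dataset_from_csv, if the ClassLabel should also be reflected in the underlying Arrow schema, is to cast the column with cast_column (available in recent datasets versions), reusing the same feature_tag defined in the function:

    dataset = Dataset.from_pandas(df_text)
    # cast_column rewrites the column with the Sequence(ClassLabel) type
    dataset = dataset.cast_column("tag", feature_tag)
    return dataset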
Example No. 14
    def load_dataset(self) -> None:
        logger.debug('loading rag dataset: %s', self.name)

        self.dataset = load_dataset('csv',
                                    data_files=[self.csv_path],
                                    split='train',
                                    delimiter=',',
                                    column_names=['title', 'text'])

        self.dataset = self.dataset.map(
            split_documents,
            batched=False,
            num_proc=6,
            batch_size=100,
        )

        ctx_encoder = DPRContextEncoder.from_pretrained(
            self.context_encoder).to(device=self.device)
        ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
            self.context_encoder)
        new_features = Features({
            'text': Value('string'),
            'title': Value('string'),
            'embeddings': Sequence(Value('float32'))
        })  # optional, save as float32 instead of float64 to save space

        self.dataset = self.dataset.map(
            partial(embed,
                    ctx_encoder=ctx_encoder,
                    ctx_tokenizer=ctx_tokenizer,
                    device=self.device),
            batched=True,
            batch_size=16,
            features=new_features,
        )

        self.dataset.save_to_disk(self.dataset_path)

        index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
        self.dataset.add_faiss_index('embeddings', custom_index=index)

        self.dataset.get_index('embeddings').save(self.faiss_path)
Example No. 15
    def predict(self, input_path, output_path):
        key = 'tmp'
        input_df = pd.DataFrame()

        if self.pretrained:
            input_df['src_token'] = read_txt(input_path)
            input_df['src_token'] = input_df['src_token'].str.lower()
            input_df['token'] = input_df['src_token'].str.split()
            input_df['tag'] = input_df['token'].apply(lambda x: ['O'] * len(x))
            input_df['sentence_id'] = input_df.index

            trainer = Trainer(model=self.model,
                              tokenizer=self.tokenizer,
                              data_collator=self.data_collator)
            feature_tag = Sequence(
                ClassLabel(num_classes=3, names=self.label_list))
            input_df['tag'] = input_df['tag'].apply(
                feature_tag.feature.str2int)
            eval_dataset = Dataset.from_pandas(input_df)
            eval_dataset.features["tag"] = feature_tag
            # predict
            tokenized_datasets = DatasetDict({
                key: eval_dataset
            }).map(self.tokenize_and_align_labels, batched=True)
            _, true_predictions = self.predict_dataset(trainer,
                                                       tokenized_datasets[key])
            result = save_classifier_result(eval_dataset, true_predictions,
                                            output_path)
            return result
        else:
            input_df['token'] = read_txt(input_path)
            input_df['sentence_id'] = input_df.index
            input_df['tag'] = 'B'
            input_df.to_csv(output_path, index=False)
            print("Result saved to ", output_path)
            return input_df
Example No. 16
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file

        table = csv.read_csv("./data/train.csv",
                             parse_options=ParseOptions(delimiter="\t"))
        class_label_ = table.column("label").unique()
        class_label = ClassLabel(num_classes=len(class_label_),
                                 names=class_label_.tolist())
        train = main_ner.process_data(data_args.train_file, class_label)
        test = main_ner.process_data(data_args.test_file, class_label)
        val = main_ner.process_data(data_args.validation_file, class_label)

        # table = csv.read_csv(data_args.train_file)
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension,
                                data_files=data_files,
                                delimiter="\t",
                                quoting=csv_lib.QUOTE_NONE)
        train_dataset = datasets["train"]
        test_dataset = datasets["test"]
        val_dataset = datasets["validation"]

        table = train_dataset.data
        label = table.column("label")
        class_label_ = label.unique()
        class_label = Sequence(feature=ClassLabel(
            num_classes=len(class_label_), names=class_label_.tolist()))

        train_dataset.features['ner_tags'] = class_label
        # train_ner_list: ChunkedArray = class_label.feature.str2int(train_dataset.data.column('label').to_numpy())
        # train_ner_array = pa.array(train_ner_list)
        # train_data = train_dataset.data.append_column("ner_tags", train_ner_array)
        train_dataset._data = train

        test_dataset.features['ner_tags'] = class_label
        test_dataset._data = test

        val_dataset.features['ner_tags'] = class_label
        # val_ner_list: ChunkedArray = class_label.feature.str2int(val_dataset.data.column('label').to_numpy())
        # val_ner_array = pa.array(val_ner_list)
        # val_data = val_dataset.data.append_column("ner_tags", val_ner_array)
        val_dataset._data = val

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else
                         column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    seq: Sequence = features[label_column_name]
    # label_list = ["O", "B-GENE", "I-GENE"]
    # label_to_id = {i: i for i in range(len(label_list))}
    if isinstance(seq.feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        if len(examples) == 3:
            for i, label in enumerate(examples[label_column_name]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                    # ignored in the loss function.
                    if word_idx is None:
                        label_ids.append(-100)
                    # We set the label for the first token of each word.
                    elif word_idx != previous_word_idx:
                        label_ids.append(label_to_id[label[word_idx]])
                    # For the other tokens in a word, we set the label to either the current label or -100, depending on
                    # the label_all_tokens flag.
                    else:
                        label_ids.append(label_to_id[label[word_idx]] if
                                         data_args.label_all_tokens else -100)
                    previous_word_idx = word_idx

                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs
        else:
            # Unexpected batch format: no label alignment possible, return the tokenized inputs as-is.
            print(f"Unexpected example format: expected 3 columns, got {len(examples)}")
            return tokenized_inputs

    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.
            isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results
Example No. 17
  # Pad the token_boxes (bounding boxes) up to the sequence length.
  input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
  padding_length = max_seq_length - len(input_ids)
  token_boxes += [pad_token_box] * padding_length
  encoding['bbox'] = token_boxes

  assert len(encoding['input_ids']) == max_seq_length
  assert len(encoding['attention_mask']) == max_seq_length
  assert len(encoding['token_type_ids']) == max_seq_length
  assert len(encoding['bbox']) == max_seq_length

  return encoding

# we need to define the features ourselves as the bbox of LayoutLM are an extra feature
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})
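
One possible way to materialize a dataset under these features (a sketch with hypothetical names: encode_example stands for the truncated encoding helper above, raw_dataset for a Dataset that provides image_path and words columns):

encoded_dataset = raw_dataset.map(lambda example: encode_example(example), features=features)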

classes = ["bill", "invoice", "others", "Purchase_Order", "remittance"]


# Model Loading
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@st.cache(allow_output_mutation=True)
def load_model():
    url = "https://vast-ml-models.s3-ap-southeast-2.amazonaws.com/Document-Classification-5-labels-final.bin"
Example No. 18
def make_custom_dataset(dataset_path):
    if not (os.path.isdir("../data/train_dataset")
            or os.path.isdir("../data/wikipedia_documents.json")):
        raise Exception("Set the original data path to '../data'")

    train_f = Features({
        'answers':
        Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        },
                 length=-1,
                 id=None),
        'context':
        Value(dtype='string', id=None),
        'id':
        Value(dtype='string', id=None),
        'question':
        Value(dtype='string', id=None)
    })

    if not os.path.isfile("../data/preprocess_wiki.json"):
        with open("../data/wikipedia_documents.json", "r") as f:
            wiki = json.load(f)
        new_wiki = dict()
        for ids in range(len(wiki)):
            new_wiki[str(ids)] = run_preprocess_to(wiki[str(ids)])
        with open('../data/preprocess_wiki.json', 'w',
                  encoding='utf-8') as make_file:
            json.dump(new_wiki, make_file, indent="\t", ensure_ascii=False)

    if not os.path.isfile("/opt/ml/input/data/preprocess_train.pkl"):
        train_dataset = load_from_disk("../data/train_dataset")['train']
        val_dataset = load_from_disk("../data/train_dataset")['validation']

        new_train_data, new_val_data = [], []
        for data in train_dataset:
            new_data = run_preprocess(data)
            new_train_data.append(new_data)
        for data in val_dataset:
            new_data = run_preprocess(data)
            new_val_data.append(new_data)

        train_df = pd.DataFrame(new_train_data)
        val_df = pd.DataFrame(new_val_data)
        dataset = DatasetDict({
            'train':
            Dataset.from_pandas(train_df, features=train_f),
            'validation':
            Dataset.from_pandas(val_df, features=train_f)
        })
        save_pickle(dataset_path, dataset)

        if 'preprocess' in dataset_path:
            return dataset

    if 'squad' in dataset_path:
        train_data = get_pickle("../data/preprocess_train.pkl")["train"]
        val_data = get_pickle("../data/preprocess_train.pkl")["validation"]
        korquad_data = load_dataset("squad_kor_v1")["train"]

        df_train_data = pd.DataFrame(train_data)
        df_val_data = pd.DataFrame(val_data)
        df_korquad_data = pd.DataFrame(
            korquad_data, columns=['answers', 'context', 'id', 'question'])
        df_total_train = pd.concat([df_train_data, df_korquad_data])

        dataset = DatasetDict({
            'train':
            Dataset.from_pandas(df_total_train, features=train_f),
            'validation':
            Dataset.from_pandas(df_val_data, features=train_f)
        })
        save_pickle("../data/korquad_train.pkl", dataset)
        return dataset

    if 'concat' in dataset_path:
        base_dataset = get_pickle("../data/preprocess_train.pkl")
        train_dataset, val_dataset = base_dataset["train"], base_dataset[
            "validation"]

        train_data = [{
            "id": train_dataset[i]["id"],
            "question": train_dataset[i]["question"],
            "answers": train_dataset[i]["answers"],
            "context": train_dataset[i]["context"]
        } for i in range(len(train_dataset))]
        val_data = [{
            "id": val_dataset[i]["id"],
            "question": val_dataset[i]["question"],
            "answers": val_dataset[i]["answers"],
            "context": val_dataset[i]["context"]
        } for i in range(len(val_dataset))]

        config = {'host': 'localhost', 'port': 9200}
        es = Elasticsearch([config])

        k = 5  # k : how many contexts to concatenate
        for idx, train in enumerate(train_data):
            res = search_es(es, "wiki-index", question["question"], k)
            context_list = [(hit['_source']['document_text'], hit['_score'])
                            for hit in res['hits']['hits']]
            contexts = train["context"]
            count = 0
            for context in context_list:
                # if same context already exists, don't concatenate
                if train["context"] == context[0]:
                    continue
                contexts += " " + context[0]
                count += 1
                if count == (k - 1):
                    break
            train_data[idx]["context"] = contexts

        for idx, val in enumerate(val_data):
            res = search_es(es, "wiki-index", question["question"], k)
            context_list = [(hit['_source']['document_text'], hit['_score'])
                            for hit in res['hits']['hits']]
            contexts = val["context"]
            count = 0
            for context in context_list:
                if val["context"] == context[0]:
                    continue
                contexts += " " + context[0]
                count += 1
                if count == (k - 1):
                    break
            val_data[idx]["context"] = contexts

        train_df = pd.DataFrame(train_data)
        val_df = pd.DataFrame(val_data)
        dataset = DatasetDict({
            'train':
            Dataset.from_pandas(train_df, features=train_f),
            'validation':
            Dataset.from_pandas(val_df, features=train_f)
        })
        save_pickle(dataset_path, dataset)
        return dataset

    if "split_wiki_400" in dataset_path:
        with open("/opt/ml/input/data/preprocess_wiki.json", "r") as f:
            wiki = json.load(f)
        new_wiki = dict()
        for i in tqdm(range(len(wiki))):
            if len(wiki[str(i)]["text"]) < 800:
                new_wiki[str(i)] = wiki[str(i)]
                continue
            data_list, count = passage_split_400(wiki[str(i)]["text"])
            for j in range(count):
                new_wiki[str(i) + f"_{j}"] = {
                    "text": data_list[j],
                    "corpus_source": wiki[str(i)]["corpus_source"],
                    "url": wiki[str(i)]["url"],
                    "domain": wiki[str(i)]["domain"],
                    "title": wiki[str(i)]["title"],
                    "author": wiki[str(i)]["author"],
                    "html": wiki[str(i)]["html"],
                    "document_id": wiki[str(i)]["document_id"]
                }

        save_data("../data/wiki-index-split-400.json", new_wiki)

    if "split_wiki" in dataset_path and dataset_path != "split_wiki_400":
        with open("/opt/ml/input/data/preprocess_wiki.json", "r") as f:
            wiki = json.load(f)

        limit = 0
        if "800" in dataset_path:
            limit = 800
        if "1000" in dataset_path:
            limit = 1000

        new_wiki = dict()
        for i in tqdm(range(len(wiki))):
            if len(wiki[str(i)]["text"]) < limit:
                new_wiki[str(i)] = wiki[str(i)]
                continue
            data_1, data_2 = passage_split(wiki[str(i)]["text"])
            new_wiki[str(i) + f"_1"] = {
                "text": data_1,
                "corpus_source": wiki[str(i)]["corpus_source"],
                "url": wiki[str(i)]["url"],
                "domain": wiki[str(i)]["domain"],
                "title": wiki[str(i)]["title"],
                "author": wiki[str(i)]["author"],
                "html": wiki[str(i)]["html"],
                "document_id": wiki[str(i)]["document_id"]
            }
            new_wiki[str(i) + f"_2"] = {
                "text": data_2,
                "corpus_source": wiki[str(i)]["corpus_source"],
                "url": wiki[str(i)]["url"],
                "domain": wiki[str(i)]["domain"],
                "title": wiki[str(i)]["title"],
                "author": wiki[str(i)]["author"],
                "html": wiki[str(i)]["html"],
                "document_id": wiki[str(i)]["document_id"]
            }

        save_data(f"../data/split_wiki_{limit}.json")
Example No. 19
def test_cast_array_to_features_nested():
    arr = pa.array([[{"foo": [0]}]])
    assert cast_array_to_feature(arr, [{
        "foo": Sequence(Value("string"))
    }]).type == pa.list_(pa.struct({"foo": pa.list_(pa.string())}))
Example No. 20
    None: 'violet',
    'pn': 'yellow',
    'h': 'red',
    'wh': 'purple',
    'fg': 'brown',
    'fn': 'grey',
    'tb': 'beige'
}
id2label = {v: k for k, v in label2id.items()}
LABELS = [label2id[L] for L in LABELS]
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
FEATURES = Features({
    'image':
    Array3D(dtype="int64", shape=(3, 224, 224)),
    'input_ids':
    Sequence(feature=Value(dtype='int64')),
    'attention_mask':
    Sequence(Value(dtype='int64')),
    'token_type_ids':
    Sequence(Value(dtype='int64')),
    'bbox':
    Array2D(dtype="int64", shape=(512, 4)),
    'labels':
    Sequence(ClassLabel(names=LABELS + [max(LABELS) + 1]))
})
NUM_LABELS = len(LABELS)
PROCESSOR_PICKLE = f"processor_module{NUM_LABELS}.pickle"
MODEL_PICKLE = f"model_module{NUM_LABELS}.pickle"
EPOCHS_LAYOUT = 84
PDF_UPLOAD_DIR = hidden_folder + "/pdf_upload/"
ELMO_DIFFERENCE_MODEL_PATH = hidden_folder + "elmo_difference_models"
Example No. 21
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):

    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage

    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(
        rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset("csv",
                           data_files=[rag_example_args.csv_path],
                           split="train",
                           delimiter="\t",
                           column_names=["title", "text"])

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of n words (changing the param in split_text to what you want n to be)
    dataset = dataset.map(split_documents,
                          batched=True,
                          num_proc=processing_args.num_proc)
    # And compute the embeddings
    if use_generated_model:
        model_path = 'ragfinetune_4_4_false_50_true/checkpoint2/'  # SET PATH TO CHECKPOINT
        config = RagConfig.from_pretrained(model_path)
        config.n_docs = 4
        config.n_docs_splits = 4
        retriever = RagRetriever.from_pretrained(model_path, config=config)
        checkpoint_model = RagSequenceForGeneration.from_pretrained(
            model_path, config=config, retriever=retriever).cuda()
        ctx_encoder = checkpoint_model.generator.get_encoder()
        ctx_tokenizer = checkpoint_model.retriever.generator_tokenizer
    else:
        ctx_encoder = DPRContextEncoder.from_pretrained(
            rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
        ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
            rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir,
                                 "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m,
                                faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir,
                              "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
    # dataset.load_faiss_index("embeddings", index_path)  # to reload the index

    ######################################
    logger.info("Step 3 - Load RAG")
    ######################################

    # Easy way to load the model
    retriever = RagRetriever.from_pretrained(rag_example_args.rag_model_name,
                                             index_name="custom",
                                             indexed_dataset=dataset)
    model = RagSequenceForGeneration.from_pretrained(
        rag_example_args.rag_model_name, retriever=retriever)
    tokenizer = RagTokenizer.from_pretrained(rag_example_args.rag_model_name)

    # For distributed fine-tuning you'll need to provide the paths instead, as the dataset and the index are loaded separately.
    # retriever = RagRetriever.from_pretrained(rag_model_name, index_name="custom", passages_path=passages_path, index_path=index_path)

    ######################################
    logger.info("Step 4 - Have fun")
    ######################################

    question = rag_example_args.question or "What does Moses' rod turn into ?"
    input_ids = tokenizer.question_encoder(question,
                                           return_tensors="pt")["input_ids"]
    generated = model.generate(input_ids)
    generated_string = tokenizer.batch_decode(generated,
                                              skip_special_tokens=True)[0]
    logger.info("Q: " + question)
    logger.info("A: " + generated_string)
Example No. 22
    def retrieve(self, query_or_dataset, topk=1):
        assert self.p_embedding is not None, "Run get_embedding() first, then call retrieve()."

        total = []
        # To filter out duplicates, retrieve max(40 + topk, alpha * topk) documents: 40 is the maximum number of duplicates observed, and alpha is a multiplier on topk.
        alpha = 2
        doc_scores, doc_indices = self.get_relevant_doc_bulk(
            query_or_dataset["question"], topk=max(40 + topk, alpha * topk)
        )

        for idx, example in enumerate(tqdm(query_or_dataset, desc="Retrieval: ")):

            doc_scores_topk = [doc_scores[idx][0]]
            doc_indices_topk = [doc_indices[idx][0]]

            pointer = 1

            while len(doc_indices_topk) != topk:
                is_non_duplicate = True
                new_text_idx = doc_indices[idx][pointer]
                new_text = self.contexts[new_text_idx]

                for d_id in doc_indices_topk:
                    if fuzz.ratio(self.contexts[d_id], new_text) > 65:
                        is_non_duplicate = False
                        break

                if is_non_duplicate:
                    doc_scores_topk.append(doc_scores[idx][pointer])
                    doc_indices_topk.append(new_text_idx)

                pointer += 1

                if pointer == max(40 + topk, alpha * topk):
                    break

            assert len(doc_indices_topk) == topk, "Increase alpha to extract topk documents without duplicates."

            for doc_id in range(topk):
                doc_idx = doc_indices_topk[doc_id]
                tmp = {
                    "question": example["question"],
                    "id": example["id"],
                    "context_id": self.context_ids[doc_idx],  # retrieved id
                    "context": self.contexts[doc_idx],  # retrieved document
                }
                if "context" in example.keys() and "answers" in example.keys():
                    tmp["original_context"] = example["context"]  # original document
                    tmp["answers"] = example["answers"]  # original answer
                total.append(tmp)

        df = pd.DataFrame(total)

        if self.args.train.do_predict is True:
            f = Features(
                {
                    "context": Value(dtype="string", id=None),
                    "id": Value(dtype="string", id=None),
                    "question": Value(dtype="string", id=None),
                    "context_id": Value(dtype="int32", id=None),
                }
            )
        else:
            f = Features(
                {
                    "answers": Sequence(
                        feature={"text": Value(dtype="string", id=None), "answer_start": Value(dtype="int32", id=None)},
                        length=-1,
                        id=None,
                    ),
                    "context": Value(dtype="string", id=None),
                    "id": Value(dtype="string", id=None),
                    "question": Value(dtype="string", id=None),
                    "original_context": Value(dtype="string", id=None),
                    "context_id": Value(dtype="int32", id=None),
                }
            )

        datasets = DatasetDict({"validation": Dataset.from_pandas(df, features=f)})
        return datasets
Example No. 23
import os.path as p
from collections import defaultdict

import pandas as pd
from datasets import load_dataset
from datasets import concatenate_datasets
from datasets import Sequence, Value, Features, Dataset, DatasetDict

from utils.tools import get_args

f = Features({
    "answers":
    Sequence(
        feature={
            "text": Value(dtype="string", id=None),
            "answer_start": Value(dtype="int32", id=None)
        },
        length=-1,
        id=None,
    ),
    "id":
    Value(dtype="string", id=None),
    "context":
    Value(dtype="string", id=None),
    "question":
    Value(dtype="string", id=None),
    "title":
    Value(dtype="string", id=None),
})


def remove_multiple_indexes(rlist, indexes):