Ejemplo n.º 1
0
def create_squad_examples(raw_data, desc, dir_tokenizer):
    """Convert a SQuAD-format data dict into a list of preprocessed Samples.

    Walks every article -> paragraph -> question/answer entry in
    ``raw_data["data"]``, builds a ``Sample`` for each QA pair (with answer
    fields when present, without them otherwise), calls ``preprocess()`` on
    it, and collects the results. A colorized tqdm bar tracks progress at
    article granularity.

    Args:
        raw_data: Parsed SQuAD JSON; must contain a "data" list of articles.
        desc: Label shown on the progress bar.
        dir_tokenizer: Directory containing the BERT "vocab.txt" file.

    Returns:
        List of preprocessed ``Sample`` instances, in document order.
    """
    vocab_path = os.path.join(dir_tokenizer, "vocab.txt")
    tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)
    progress = tqdm(
        total=len(raw_data["data"]),
        desc=desc,
        position=0,
        leave=True,
        file=sys.stdout,
        bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.BLUE, Fore.RESET),
    )
    examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]
            for qa in paragraph["qas"]:
                if "answers" in qa:
                    # Labeled example: first answer supplies the span,
                    # all answer texts are kept for evaluation.
                    answers = qa["answers"]
                    sample = Sample(
                        tokenizer,
                        qa["question"],
                        passage,
                        answers[0]["answer_start"],
                        answers[0]["text"],
                        [a["text"] for a in answers],
                    )
                else:
                    # Unlabeled example (e.g. test split): question only.
                    sample = Sample(tokenizer, qa["question"], passage)
                sample.preprocess()
                examples.append(sample)
        progress.update(1)
    progress.close()
    return examples