Example #1
            else:
                new_sentences.append(all_sentences[index_article][current_len])
                sentences_matrix_bert.append(all_sentences_matrix_bert[index_article][current_len])
                break

    # question_idf = get_idf(question_tokens, context)

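    # A hedged reading of the call below: get_iterative_alignment_justifications
    # appears to iteratively select up to max_iteration justification sentences
    # by aligning question tokens against candidate sentences via IDF-weighted
    # embedding alignment (an assumption from the name and arguments).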
    final_indexes = get_iterative_alignment_justifications(
        question_tokens, new_sentences, idf_values, embeddings_index, query_matrix_bert, sentences_matrix_bert,
        max_iteration=6, emb_size=emb_size)

    # # Multiple parallel evidence chains
    # final_indexes = get_iterative_alignment_justifications_non_parametric_parallel_evidence(
    #     question_tokens, new_sentences, idf_values, embeddings_index,
    #     parallel_evidence_num=3, max_iteration=6, emb_size=emb_size)
    justifications = []
    for final_index in final_indexes:
        justifications.append(new_sentences[int(final_index)])
    selected = "。".join(justifications) + "。"

    return selected
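

# truncate_seq_pair is called in qa() below but not defined in this snippet;
# a minimal sketch, assuming it simply trims the context so that question +
# context fit within max_length characters (hypothetical; the original helper
# may truncate differently):
def truncate_seq_pair(context, question, max_length=509):
    budget = max_length - len(question)  # room left for the context
    return context[:budget] if budget > 0 else ""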


if __name__ == '__main__':
    embeddings, size = get_word_embedding()
    bm25, _ = construct_corpus_sentence()
    # content, sentences = read_txt()
    # context = dispose(read_json(), "壶兰计划是哪个市的?", sentences_partition(read_json()), embeddings, size, bm25)
    # context = dispose(content, "壶兰计划是哪个市的?", sentences, embeddings, size, bm25)
    # print(context)
# Language codes used below: 'en' (English), 'zh-cn' (Simplified Chinese)
print(googletrans.LANGUAGES)

text = read_json()

# translator = Translator(['translate.google.cn'])
# translated_text = translator.translate(text, src='zh-cn', dest='en')
# print(translated_text.text)

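# Probe questions (zh) about Putian's "壶兰计划" (Hulan Plan) talent policy,
# e.g. "Which city runs the Hulan Plan?", "How much subsidy can a Nobel
# laureate receive under it?", "Where is Putian's high-level talent service
# window?"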
questions = [
    "壶兰计划是哪个市的?", "“壶兰计划”中诺贝尔奖获得者可以获得多少补助?", "“壶兰计划”中同一企业当年度最多不超过多少名人才指标?",
    "“壶兰计划”中的“双招双引”给出哪些意见?", "莆田市人才政策的全称是什么?", "特级人才需要符合哪些条件?",
    "莆田高层次人才服务窗口在哪?"
]

embeddings_index, emb_size = get_word_embedding()
all_sentences = sentences_partition(text)
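

# baidu_translate is used in qa() below but not defined in this snippet; a
# minimal sketch against Baidu's Fanyi general-translation HTTP API
# (BAIDU_APP_ID and BAIDU_SECRET are placeholders; the request signature is
# MD5(appid + query + salt + secret_key)):
import hashlib
import random
import requests

BAIDU_APP_ID = "your_appid"       # placeholder
BAIDU_SECRET = "your_secret_key"  # placeholder


def baidu_translate(q, fromLang='zh', toLang='en'):
    salt = str(random.randint(32768, 65536))
    sign = hashlib.md5((BAIDU_APP_ID + q + salt + BAIDU_SECRET).encode('utf-8')).hexdigest()
    resp = requests.get("https://fanyi-api.baidu.com/api/trans/vip/translate",
                        params={'q': q, 'from': fromLang, 'to': toLang,
                                'appid': BAIDU_APP_ID, 'salt': salt, 'sign': sign})
    # The API returns {'trans_result': [{'src': ..., 'dst': ...}, ...]}
    return " ".join(item['dst'] for item in resp.json()['trans_result'])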


def qa():
    for question in questions:
        # translated_question = translator.translate(question, src='zh-cn', dest='en')
        # print(translated_question.text)
        # inputs = tokenizer(translated_question.text, translated_text.text,
        #                    add_special_tokens=True, return_tensors="pt")
        context = dispose(text, question, all_sentences, embeddings_index,
                          emb_size)
        context = truncate_seq_pair(context, question, max_length=509)
        print(f"上下文:{context}")
        translated_text = baidu_translate(context, fromLang='zh', toLang='en')
        # Guard against the translated context + question still exceeding max_length
Example #3
def evaluate():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if device.type == 'cuda':
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
    examples = read_examples()
    # tokenizer = AutoTokenizer.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\bert-large-uncased-whole-word-masking-finetuned-squad")
    # model = AutoModelForQuestionAnswering.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\bert-large-uncased-whole-word-masking-finetuned-squad")

    # tokenizer = AutoTokenizer.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\bert-large-uncased-whole-word-masking-squad2")
    # model = AutoModelForQuestionAnswering.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\bert-large-uncased-whole-word-masking-squad2")

    # tokenizer = AutoTokenizer.from_pretrained("E:\\Dataset\\pytorch-Bert\\electra_large_discriminator_squad2_512")
    # model = AutoModelForQuestionAnswering.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\electra_large_discriminator_squad2_512")

    # model = BertForQuestionAnswering.from_pretrained('E:\\Dataset\\pytorch-Bert\\chinese-roberta-wwm-ext\\cmrc2018')
    # tokenizer = BertTokenizer.from_pretrained('E:\\Dataset\\pytorch-Bert\\chinese-roberta-wwm-ext\\cmrc2018')

    model = BertForQuestionAnswering.from_pretrained(
        './Model/chinese-roberta-wwm-ext-finetuned-cmrc2018')
    tokenizer = BertTokenizer.from_pretrained(
        './Model/chinese-roberta-wwm-ext-finetuned-cmrc2018')
    model.to(device)
    model.eval()  # inference mode: disables dropout

    text = read_json()
    embeddings_index, emb_size = get_word_embedding()
    all_sentences = sentences_partition(text)
    # text, all_sentences = read_txt()
    predictions = []
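    # BM25 model over the sentence corpus plus a sentence lookup dict
    # (assumption from the names; the middle return value is unused here).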
    bm25model, _, sentence_dict = construct_corpus_sentence()

    for index, example in enumerate(tqdm(examples)):
        # context = example.context_text
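        # dispose() builds a question-focused context; the trailing 30 is
        # presumably the BM25 top-k sentence cutoff (cf. "Top30" in the
        # output filename below).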
        context = dispose(text, example.question_text, all_sentences,
                          embeddings_index, emb_size, bm25model, sentence_dict,
                          30)

        # context = truncate_seq_pair(context, example.question_text, max_length=512)
        # print(f"上下文:{context}")
        # translated_text = baidu_translate(context, fromLang='zh', toLang='en')
        # print(f"Context:{translated_text}")
        #
        # translated_question = baidu_translate(example.question_text, fromLang='zh', toLang='en')
        # print(f"问题: {example.question_text}")
        # print(f"Question:{translated_question}")
        #
        # inputs = tokenizer(
        #     translated_question, translated_text, add_special_tokens=True, return_tensors="pt",
        #     truncation='longest_first', max_length=512
        # ).to(device)
        # input_ids = inputs["input_ids"].tolist()[0]
        # # text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        # answer_start_scores, answer_end_scores = model(**inputs)
        # answer_start = torch.argmax(
        #     answer_start_scores
        # )  # Get the most likely beginning of answer with the argmax of the score
        # # Get the most likely end of answer with the argmax of the score
        # answer_end = torch.argmax(answer_end_scores) + 1
        # answer = tokenizer.convert_tokens_to_string(
        #     tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
        # )
        # print(f"Answer: {answer}")
        # translated_answer = baidu_translate(answer, fromLang='en', toLang='zh')
        # print(f"答案: {translated_answer}")
        # prediction = Example(
        #     qas_id=index,
        #     question_text=example.question_text,
        #     context_text=example.context_text,
        #     answer_text=translated_answer
        # )
        # predictions.append(prediction)

        # context = truncate_seq_pair(context, example.question_text, max_length=509)
        print(f"上下文:{context}")
        inputs = tokenizer(example.question_text,
                           context,
                           add_special_tokens=True,
                           return_tensors="pt",
                           max_length=512,
                           truncation=True).to(device)
        # input_ids = inputs["input_ids"].tolist()[0]
        input_ids = inputs["input_ids"][0]
        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        with torch.no_grad():  # inference only; no gradients needed
            answer_start_scores, answer_end_scores = model(**inputs)
        # Most likely start/end of the answer span via argmax over the logits
        answer_start = torch.argmax(answer_start_scores)
        answer_end = torch.argmax(answer_end_scores) + 1
        answer = "".join(text_tokens[answer_start:answer_end])
        print(f"Question: {example.question_text}")
        print(f"Answer: {answer}")
        prediction = Example(qas_id=index,
                             question_text=example.question_text,
                             context_text=example.context_text,
                             answer_text=answer)
        predictions.append(prediction)

    # with open("./Metric/electra_large_discriminator_squad2_512.json", "w") as f:
    with open(
            "./Metric/chinese-roberta-wwm-ext-finetuned-cmrc2018+BM25(Top30, t=2).json",
            "w") as f:
        for prediction in predictions:
            # One JSON object per line (JSON Lines), built fresh per prediction
            dict_prediction = {
                "context": prediction.context_text,
                "question": prediction.question_text,
                "answer": prediction.answer_text,
            }
            f.write(json.dumps(dict_prediction, ensure_ascii=False) + '\n')
        print("加载入文件完成...")