Example #1
# Requires gensim < 4.0, where the BM25 implementation still ships in
# gensim.summarization. Project helpers (read_json, sentences_partition,
# tokenization_sentence) are assumed to be defined elsewhere in this repository.
import googletrans
from gensim import corpora
from gensim.summarization import bm25


def construct_corpus_sentence():
    corpus = []
    sentence_dict = {}  # maps each flat sentence index to its document and sentence indices (two levels fully flattened)
    count = 0
    context = read_json()
    all_sentences = sentences_partition(context)
    for index_document, sentences_per_document in enumerate(all_sentences):
        for index_sentence, sentence in enumerate(sentences_per_document):
            corpus.append(tokenization_sentence(sentence))
            sentence_dict[count] = f"{index_document},{index_sentence}"
            count += 1

    # The gensim Dictionary is built only to report the vocabulary size.
    dictionary = corpora.Dictionary(corpus)
    print(len(dictionary))

    bm25model = bm25.BM25(corpus)
    return bm25model, all_sentences, sentence_dict
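

# A minimal usage sketch (not in the original source): score a query against the
# corpus with the returned BM25 model and map the best flat index back to its
# (document, sentence) position via sentence_dict. Assumes gensim 3.8.x, where
# BM25.get_scores takes only the tokenized query (older releases also required
# an average_idf argument).
def retrieve_top_sentence(query, bm25model, all_sentences, sentence_dict):
    scores = bm25model.get_scores(tokenization_sentence(query))
    best = max(range(len(scores)), key=scores.__getitem__)
    doc_idx, sent_idx = map(int, sentence_dict[best].split(","))
    return all_sentences[doc_idx][sent_idx]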

# Language codes of interest here: en, zh-cn
print(googletrans.LANGUAGES)

text = read_json()

# translator = Translator(['translate.google.cn'])
# translated_text = translator.translate(text, src='zh-cn', dest='en')
# print(translated_text.text)

# Evaluation questions (kept in Chinese: the QA pipeline expects Chinese input).
questions = [
    "壶兰计划是哪个市的?", "“壶兰计划”中诺贝尔奖获得者可以获得多少补助?", "“壶兰计划”中同一企业当年度最多不超过多少名人才指标?",
    "“壶兰计划”中的“双招双引”给出哪些意见?", "莆田市人才政策的全称是什么?", "特级人才需要符合哪些条件?",
    "莆田高层次人才服务窗口在哪?"
]

embeddings_index, emb_size = get_word_embedding()
all_sentences = sentences_partition(text)
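
# A hedged sketch (not in the original) of wiring these module-level objects to
# retrieval for the questions above, mirroring the dispose(...) call used in
# Example #4; the final argument is assumed to be the top-k sentence count.
# bm25model, _, sentence_dict = construct_corpus_sentence()
# for question in questions:
#     context = dispose(text, question, all_sentences, embeddings_index,
#                       emb_size, bm25model, sentence_dict, 30)
#     print(question, context)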

Example #3
import json
import math


def write_idf_values(all_words, all_sentences, file_name):
    # NOTE: the original snippet starts mid-function; idf, total_doc, and idf2
    # (the per-token document counts) are computed earlier in the full source.
    for each_word in all_words:
        doc_count = idf2[str(each_word)]
        # Formula from "Alignment over Heterogeneous Embeddings for Question
        # Answering", p. 2683, Eq. 1
        idf[str(each_word)] = math.log10(
            (total_doc - doc_count + 0.5) / float(doc_count + 0.5))

    with open(file_name, 'w', encoding='utf-8') as outfile:
        json.dump(idf, outfile, ensure_ascii=False)
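

# Worked check of the IDF formula above (illustrative, not from the original):
# with total_doc = 100 and a token appearing in doc_count = 10 documents,
# idf = log10((100 - 10 + 0.5) / (10 + 0.5)) = log10(90.5 / 10.5) ≈ 0.935.
# Rarer tokens get larger weights; tokens in over half the corpus go negative.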


def pre(all_sentences):
    # input_files = ["train_456-fixedIds.json", "dev_83-fixedIds.json"]
    vocab = []
    for justifications_per_article in all_sentences:
        for sentence in justifications_per_article:
            just_tokens = sentence_segmentation(sentence, 0)
            if just_tokens is not None:
                vocab += just_tokens

    vocab = list(set(vocab))

    # vocab: deduplicated list of every token produced by segmenting the sentences
    # all_sentences: the sentences of every document
    write_idf_values(vocab, all_sentences,
                     "./Data/GovernmentQA_IDF_values.json")


if __name__ == "__main__":
    pre(sentences_partition(read_json()))
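
# A minimal sketch (assumed, not in the original) of reading the emitted IDF
# values back for use at scoring time:
# with open("./Data/GovernmentQA_IDF_values.json", encoding="utf-8") as f:
#     idf = json.load(f)
# print(max(idf, key=idf.get))  # the rarest token carries the largest weight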
Example #4
import json

import torch
from tqdm import tqdm
from transformers import BertForQuestionAnswering, BertTokenizer

# Project helpers (read_examples, read_json, get_word_embedding,
# sentences_partition, dispose, construct_corpus_sentence, Example) are assumed
# to be defined elsewhere in this repository.


def evaluate():
    # Fall back to the CPU when no GPU is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    examples = read_examples()
    # Checkpoints tried previously (local copies under E:\Dataset\pytorch-Bert):
    # bert-large-uncased-whole-word-masking fine-tuned on SQuAD and SQuAD2,
    # electra_large_discriminator_squad2_512, and chinese-roberta-wwm-ext
    # fine-tuned on CMRC2018, loaded via the matching Auto*/Bert* classes.

    model = BertForQuestionAnswering.from_pretrained(
        './Model/chinese-roberta-wwm-ext-finetuned-cmrc2018')
    tokenizer = BertTokenizer.from_pretrained(
        './Model/chinese-roberta-wwm-ext-finetuned-cmrc2018')
    model.to(device)

    text = read_json()
    embeddings_index, emb_size = get_word_embedding()
    all_sentences = sentences_partition(text)
    # text, all_sentences = read_txt()
    predictions = []
    bm25model, _, sentence_dict = construct_corpus_sentence()

    for index, example in enumerate(tqdm(examples)):
        # context = example.context_text
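        # dispose(...) is the project's retrieval step; it is assumed to return
        # a condensed context assembled from the top-30 ranked sentences.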
        context = dispose(text, example.question_text, all_sentences,
                          embeddings_index, emb_size, bm25model, sentence_dict,
                          30)

        # Alternative pipeline tried previously: translate the context and the
        # question to English with baidu_translate, answer with an English SQuAD
        # model, then translate the answer back to Chinese.

        # context = truncate_seq_pair(context, example.question_text, max_length=509)
        print(f"上下文:{context}")
        inputs = tokenizer(example.question_text,
                           context,
                           add_special_tokens=True,
                           return_tensors="pt",
                           max_length=512,
                           truncation=True).to(device)
        # input_ids = inputs["input_ids"].tolist()[0]
        input_ids = inputs["input_ids"][0]
        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        # transformers 4.x returns a QuestionAnsweringModelOutput by default;
        # read the start/end logits from it instead of unpacking a tuple.
        with torch.no_grad():
            outputs = model(**inputs)
        answer_start = torch.argmax(
            outputs.start_logits
        )  # Most likely start of the answer span
        answer_end = torch.argmax(
            outputs.end_logits
        ) + 1  # Most likely end of the answer span (exclusive)
        answer = "".join(text_tokens[answer_start:answer_end])
        print(f"Question: {example.question_text}")
        print(f"Answer: {answer}")
        prediction = Example(qas_id=index,
                             question_text=example.question_text,
                             context_text=example.context_text,
                             answer_text=answer)
        predictions.append(prediction)

    with open(
            "./Metric/chinese-roberta-wwm-ext-finetuned-cmrc2018+BM25(Top30, t=2).json",
            "w", encoding="utf-8") as f:
        # One JSON object per line (JSONL), one line per prediction.
        for prediction in predictions:
            dict_prediction = {
                "context": prediction.context_text,
                "question": prediction.question_text,
                "answer": prediction.answer_text,
            }
            f.write(json.dumps(dict_prediction, ensure_ascii=False) + '\n')
        print("加载入文件完成...")