from gensim import corpora
from gensim.summarization import bm25  # gensim < 4.0; the summarization module was removed in gensim 4.x


def construct_corpus_sentence():
    corpus = []
    # Maps a flat sentence index to "document_index,sentence_index"
    # (the two-level document/sentence structure, fully flattened).
    sentence_dict = {}
    count = 0
    context = read_json()
    all_sentences = sentences_partition(context)
    for index_document, sentences_per_document in enumerate(all_sentences):
        for index_sentence, sentence in enumerate(sentences_per_document):
            corpus.append(tokenization_sentence(sentence))
            sentence_dict[count] = str(index_document) + "," + str(index_sentence)
            count += 1
    dictionary = corpora.Dictionary(corpus)
    print(len(dictionary))  # vocabulary size
    bm25model = bm25.BM25(corpus)
    return bm25model, all_sentences, sentence_dict
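
# Minimal usage sketch (not part of the original module; _demo_bm25_lookup is a
# hypothetical helper). It shows how the flat BM25 index maps back to a
# (document, sentence) position via sentence_dict. Note that some gensim 3.x
# releases require an extra average_idf argument to BM25.get_scores().
def _demo_bm25_lookup(query_tokens):
    bm25model, all_sentences, sentence_dict = construct_corpus_sentence()
    scores = bm25model.get_scores(query_tokens)  # one score per flattened sentence
    best = max(range(len(scores)), key=lambda i: scores[i])
    index_document, index_sentence = map(int, sentence_dict[best].split(","))
    return all_sentences[index_document][index_sentence]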
import googletrans

# Earlier single-example QA flow, kept for reference:
# inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
# input_ids = inputs["input_ids"].tolist()[0]
# text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
# answer_start_scores, answer_end_scores = model(**inputs)
# answer_start = torch.argmax(answer_start_scores)  # most likely answer start
# answer_end = torch.argmax(answer_end_scores) + 1  # most likely answer end (exclusive)
# answer = tokenizer.convert_tokens_to_string(
#     tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
# print(f"Question: {question}")
# print(f"Answer: {answer}")

# Relevant language codes: "en" and "zh-cn".
print(googletrans.LANGUAGES)
text = read_json()
# translator = Translator(['translate.google.cn'])
# translated_text = translator.translate(text, src='zh-cn', dest='en')
# print(translated_text.text)
questions = [
    "壶兰计划是哪个市的?",
    "“壶兰计划”中诺贝尔奖获得者可以获得多少补助?",
    "“壶兰计划”中同一企业当年度最多不超过多少名人才指标?",
    "“壶兰计划”中的“双招双引”给出哪些意见?",
    "莆田市人才政策的全称是什么?",
    "特级人才需要符合哪些条件?",
    "莆田高层次人才服务窗口在哪?"
]
embeddings_index, emb_size = get_word_embedding()
all_sentences = sentences_partition(text)
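
# Hedged sketch (assumed, not in the original script): word embeddings loaded
# above are typically combined with IDF weights as in "Alignment over
# Heterogeneous Embeddings for Question Answering" (cited in the IDF
# preprocessing below): each question token is aligned to its most similar
# sentence token by cosine similarity, weighted by the token's IDF.
# Assumptions: embeddings_index maps token -> numpy vector, idf maps
# token -> float, and _alignment_score itself is a hypothetical helper.
import numpy as np

def _alignment_score(question_tokens, sentence_tokens, embeddings_index, idf):
    score = 0.0
    for q in question_tokens:
        if q not in embeddings_index:
            continue
        qv = embeddings_index[q]
        # Best cosine alignment of this question token against the sentence.
        best = 0.0
        for s in sentence_tokens:
            if s not in embeddings_index:
                continue
            sv = embeddings_index[s]
            cos = float(np.dot(qv, sv) /
                        (np.linalg.norm(qv) * np.linalg.norm(sv) + 1e-8))
            best = max(best, cos)
        score += idf.get(q, 0.0) * best
    return score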
    # Tail of write_idf_values(): all_words is the deduplicated vocabulary,
    # idf2 holds per-token document frequencies, and total_doc is the number
    # of documents.
    for each_word in all_words:
        doc_count = idf2[str(each_word)]
        # Eq. 1, p. 2683 of "Alignment over Heterogeneous Embeddings for
        # Question Answering": idf(w) = log10((N - df(w) + 0.5) / (df(w) + 0.5))
        idf[str(each_word)] = math.log10(
            (total_doc - doc_count + 0.5) / float(doc_count + 0.5))
    with open(file_name, 'w', encoding='utf-8') as outfile:
        json.dump(idf, outfile, ensure_ascii=False)


def pre(all_sentences):
    # input_files = ["train_456-fixedIds.json", "dev_83-fixedIds.json"]
    vocab = []
    for justifications_per_article in all_sentences:
        for sentence in justifications_per_article:
            just_tokens = sentence_segmentation(sentence, 0)
            if just_tokens is not None:
                vocab += just_tokens
    # vocab: deduplicated list of tokens over every segmented sentence.
    vocab = list(set(vocab))
    # all_sentences: every sentence of every document.
    write_idf_values(vocab, all_sentences, "./Data/GovernmentQA_IDF_values.json")


if __name__ == "__main__":
    pre(sentences_partition(read_json()))
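
# Worked example for Eq. 1 above (hypothetical counts, not project data):
# with N = 1000 documents and a term appearing in df = 10 of them,
#     idf = log10((1000 - 10 + 0.5) / (10 + 0.5)) = log10(94.33...) ≈ 1.97
# so rarer terms receive larger weights, and the +0.5 smoothing keeps the
# ratio finite when df = 0:
#     assert round(math.log10((1000 - 10 + 0.5) / (10 + 0.5)), 2) == 1.97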
import json

import torch
from tqdm import tqdm
from transformers import BertForQuestionAnswering, BertTokenizer


def evaluate():
    # if torch.cuda.is_available():
    #     device = torch.device("cuda")
    #     print('there are %d GPU(s) available.' % torch.cuda.device_count())
    #     print('we will use the GPU: ', torch.cuda.get_device_name(0))
    # else:
    #     print('No GPU available, using the CPU instead.')
    #     device = torch.device('cpu')
    device = torch.device('cuda')
    examples = read_examples()
    # Alternative checkpoints tried earlier, kept for reference:
    # tokenizer = AutoTokenizer.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\bert-large-uncased-whole-word-masking-finetuned-squad")
    # model = AutoModelForQuestionAnswering.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\bert-large-uncased-whole-word-masking-finetuned-squad")
    # tokenizer = AutoTokenizer.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\bert-large-uncased-whole-word-masking-squad2")
    # model = AutoModelForQuestionAnswering.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\bert-large-uncased-whole-word-masking-squad2")
    # tokenizer = AutoTokenizer.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\electra_large_discriminator_squad2_512")
    # model = AutoModelForQuestionAnswering.from_pretrained(
    #     "E:\\Dataset\\pytorch-Bert\\electra_large_discriminator_squad2_512")
    # model = BertForQuestionAnswering.from_pretrained(
    #     'E:\\Dataset\\pytorch-Bert\\chinese-roberta-wwm-ext\\cmrc2018')
    # tokenizer = BertTokenizer.from_pretrained(
    #     'E:\\Dataset\\pytorch-Bert\\chinese-roberta-wwm-ext\\cmrc2018')
    model = BertForQuestionAnswering.from_pretrained(
        './Model/chinese-roberta-wwm-ext-finetuned-cmrc2018')
    tokenizer = BertTokenizer.from_pretrained(
        './Model/chinese-roberta-wwm-ext-finetuned-cmrc2018')
    model.to(device)
    text = read_json()
    embeddings_index, emb_size = get_word_embedding()
    all_sentences = sentences_partition(text)
    # text, all_sentences = read_txt()
    predictions = []
    bm25model, _, sentence_dict = construct_corpus_sentence()
    for index, example in enumerate(tqdm(examples)):
        # context = example.context_text
        # Retrieve a top-30 sentence context for the question.
        context = dispose(text, example.question_text, all_sentences,
                          embeddings_index, emb_size, bm25model, sentence_dict, 30)
        # context = truncate_seq_pair(context, example.question_text, max_length=512)
        # Earlier translate-then-answer pipeline (Chinese -> English QA model ->
        # Chinese), kept for reference:
        # translated_text = baidu_translate(context, fromLang='zh', toLang='en')
        # print(f"Context:{translated_text}")
        # translated_question = baidu_translate(example.question_text, fromLang='zh', toLang='en')
        # print(f"问题: {example.question_text}")
        # print(f"Question:{translated_question}")
        # inputs = tokenizer(
        #     translated_question, translated_text, add_special_tokens=True,
        #     return_tensors="pt", truncation='longest_first', max_length=512
        # ).to(device)
        # input_ids = inputs["input_ids"].tolist()[0]
        # answer_start_scores, answer_end_scores = model(**inputs)
        # answer_start = torch.argmax(answer_start_scores)  # most likely answer start
        # answer_end = torch.argmax(answer_end_scores) + 1  # most likely answer end
        # answer = tokenizer.convert_tokens_to_string(
        #     tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
        # print(f"Answer: {answer}")
        # translated_answer = baidu_translate(answer, fromLang='en', toLang='zh')
        # print(f"答案: {translated_answer}")
        # prediction = Example(qas_id=index, question_text=example.question_text,
        #                      context_text=example.context_text,
        #                      answer_text=translated_answer)
        # predictions.append(prediction)
        # context = truncate_seq_pair(context, example.question_text, max_length=509)
        print(f"上下文:{context}")
        inputs = tokenizer(example.question_text, context, add_special_tokens=True,
                           return_tensors="pt", max_length=512,
                           truncation=True).to(device)
        # convert_ids_to_tokens expects a list of ints, hence .tolist().
        input_ids = inputs["input_ids"][0].tolist()
        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        # Tuple unpacking assumes transformers 3.x outputs; see the
        # compatibility note after this function for the v4 equivalent.
        answer_start_scores, answer_end_scores = model(**inputs)
        # Most likely start of the answer.
        answer_start = torch.argmax(answer_start_scores)
        # Most likely end of the answer (exclusive bound, hence +1).
        answer_end = torch.argmax(answer_end_scores) + 1
        answer = "".join(text_tokens[answer_start:answer_end])
        print(f"Question: {example.question_text}")
        print(f"Answer: {answer}")
        prediction = Example(qas_id=index,
                             question_text=example.question_text,
                             context_text=example.context_text,
                             answer_text=answer)
        predictions.append(prediction)
    # with open("./Metric/electra_large_discriminator_squad2_512.json", "w") as f:
    with open(
            "./Metric/chinese-roberta-wwm-ext-finetuned-cmrc2018+BM25(Top30, t=2).json",
            "w", encoding='utf-8') as f:
        for prediction in predictions:
            # Build a fresh record per prediction instead of mutating one shared dict.
            dict_prediction = {
                "context": prediction.context_text,
                "question": prediction.question_text,
                "answer": prediction.answer_text,
            }
            f.write(json.dumps(dict_prediction, ensure_ascii=False) + '\n')
    print("加载入文件完成...")
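
# Compatibility note (assumption: the tuple unpacking in evaluate() targets
# transformers 3.x). Since transformers v4, models return a
# QuestionAnsweringModelOutput by default, so the equivalent span extraction is:
#
#     outputs = model(**inputs)
#     answer_start = torch.argmax(outputs.start_logits)
#     answer_end = torch.argmax(outputs.end_logits) + 1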