else: new_sentences.append(all_sentences[index_article][current_len]) sentences_matrix_bert.append(all_sentences_matrix_bert[index_article][current_len]) break # question_idf = get_idf(question_tokens, context) final_indexes = get_iterative_alignment_justifications( question_tokens, new_sentences, idf_values, embeddings_index, query_matrix_bert, sentences_matrix_bert, max_iteration=6, emb_size=emb_size) # # 多条证据链 # final_indexes = get_iterative_alignment_justifications_non_parametric_parallel_evidence( # question_tokens, new_sentences, idf_values, embeddings_index, # parallel_evidence_num=3, max_iteration=6, emb_size=emb_size) justifications = [] for final_index in final_indexes: justifications.append(new_sentences[final_index.astype(np.int32)]) selected = "。".join(justifications) + "。" return selected if __name__ == '__main__': embeddings, size = get_word_embedding() bm25, _ = construct_corpus_sentence() # content, sentences = read_txt() # context = dispose(read_json(), "壶兰计划是哪个市的?", sentences_partition(read_json()), embeddings, size, bm25) # context = dispose(content, "壶兰计划是哪个市的?", sentences, embeddings, size, bm25) # print(context)
# en zh-cn print(googletrans.LANGUAGES) text = read_json() # translator = Translator(['translate.google.cn']) # translated_text = translator.translate(text, src='zh-cn', dest='en') # print(translated_text.text) questions = [ "壶兰计划是哪个市的?", "“壶兰计划”中诺贝尔奖获得者可以获得多少补助?", "“壶兰计划”中同一企业当年度最多不超过多少名人才指标?", "“壶兰计划”中的“双招双引”给出哪些意见?", "莆田市人才政策的全称是什么?", "特级人才需要符合哪些条件?", "莆田高层次人才服务窗口在哪?" ] embeddings_index, emb_size = get_word_embedding() all_sentences = sentences_partition(text) def qa(): for question in questions: # translated_question = translator.translate(question, src='zh-cn', dest='en') # print(translated_question.text) # inputs = tokenizer(translated_question.text, translated_text.text, # add_special_tokens=True, return_tensors="pt") context = dispose(text, question, all_sentences, embeddings_index, emb_size) context = truncate_seq_pair(context, question, max_length=509) print(f"上下文:{context}") translated_text = baidu_translate(context, fromLang='zh', toLang='en') # 防止翻译后的上下文+问题依然超出max_length
def evaluate():
    """Answer every evaluation example with the fine-tuned QA model and save the results.

    Steps, in order:

    1. Load the examples (``read_examples``), the checkpoint stored under
       ``./Model/chinese-roberta-wwm-ext-finetuned-cmrc2018``, the source
       text (``read_json``), the word embeddings, the sentence partition
       and the BM25 corpus model.
    2. For each example, build a context from the question with
       ``dispose``, feed question + context to the model, and take the
       tokens between the arg-max start score and the arg-max end score
       (inclusive) as the answer.
    3. Write one JSON object per line (keys ``context``, ``question``,
       ``answer``) to the results file under ``./Metric``.

    The ``context`` stored in the output is the example's own
    ``context_text``, not the context produced by ``dispose``.
    """
    # Use the GPU when one is present; otherwise fall back to the CPU
    # instead of failing on hosts without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    examples = read_examples()

    model_dir = './Model/chinese-roberta-wwm-ext-finetuned-cmrc2018'
    model = BertForQuestionAnswering.from_pretrained(model_dir)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model.to(device)
    model.eval()  # inference only: keep dropout disabled

    text = read_json()
    embeddings_index, emb_size = get_word_embedding()
    all_sentences = sentences_partition(text)

    predictions = []
    bm25model, _, sentence_dict = construct_corpus_sentence()
    for index, example in enumerate(tqdm(examples)):
        question = example.question_text

        # NOTE(review): 30 is forwarded as dispose's last positional
        # argument; the output file name ("Top30") suggests it is the BM25
        # top-k - confirm against dispose's signature.
        context = dispose(text, question, all_sentences, embeddings_index,
                          emb_size, bm25model, sentence_dict, 30)
        # Presumably trims the context so that question + context fit in
        # max_length - confirm against truncate_seq_pair.
        context = truncate_seq_pair(context, question, max_length=512)
        print(f"上下文:{context}")

        inputs = tokenizer(question, context, add_special_tokens=True,
                           return_tensors="pt", max_length=512,
                           truncation=True).to(device)
        text_tokens = tokenizer.convert_ids_to_tokens(
            inputs["input_ids"][0].tolist())

        # No gradients are needed for prediction.
        with torch.no_grad():
            outputs = model(**inputs)
        # Index positionally so that both a plain tuple and a ModelOutput
        # return value are handled (unpacking a ModelOutput yields its keys).
        answer_start_scores, answer_end_scores = outputs[0], outputs[1]

        # Most likely first token of the answer.
        answer_start = int(torch.argmax(answer_start_scores))
        # Most likely last token of the answer; +1 makes the slice inclusive.
        answer_end = int(torch.argmax(answer_end_scores)) + 1
        answer = "".join(text_tokens[answer_start:answer_end])

        print(f"Question: {question}")
        print(f"Answer: {answer}")
        predictions.append(
            Example(qas_id=index,
                    question_text=question,
                    context_text=example.context_text,
                    answer_text=answer))

    output_path = (
        "./Metric/chinese-roberta-wwm-ext-finetuned-cmrc2018+BM25(Top30, t=2).json"
    )
    # ensure_ascii=False emits raw non-ASCII text, so the file encoding must
    # be pinned rather than left to the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        for prediction in predictions:
            record = {
                "context": prediction.context_text,
                "question": prediction.question_text,
                "answer": prediction.answer_text,
            }
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
    print("加载入文件完成...")