def get_idf_weights(all_sentences, idf):
    doc_len = []  # number of terms in each sentence
    corpus = []
    all_words = []
    for justifications_per_article in all_sentences:
        for line in justifications_per_article:  # each line is a doc
            # line = line.lower()
            # words = line.split()
            words = sentence_segmentation(line, 0)
            if words is not None:
                # words = tokenizer.tokenize(line)
                # words = [lmtzr.lemmatize(w1) for w1 in words]
                document = {}  # dictionary with terms as keys and their TF as values
                doc_len.append(len(words))
                unique_words = list(set(words))
                # Count, for each term in this sentence, how many documents contain it
                # (the author treats every sentence as a document)
                for w1 in unique_words:
                    if w1 in idf:
                        idf[str(w1)] += 1
                        # print("yes, we come here", w1)
                    # else:
                    #     idf.update({str(w1): 1})
                all_words += unique_words
                # Count how often each term occurs within this sentence (term frequency)
                for term1 in unique_words:
                    document[str(term1)] = words.count(term1)
                corpus.append(document)
    all_words = list(set(all_words))
    return doc_len, corpus, all_words, idf
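# Illustrative usage sketch (not part of the original project) of how get_idf_weights
# might be driven. The two-article corpus below is made up, and the idf dict is assumed
# to be pre-seeded with the full vocabulary, as pre()/write_idf_values suggest.
def _demo_get_idf_weights():
    articles = [
        ["政府发布了新的补贴政策。", "补贴申请需要在线提交。"],
        ["市民可以通过政务服务网查询办理进度。"],
    ]
    vocab = set()
    for article in articles:
        for sentence in article:
            tokens = sentence_segmentation(sentence, 0)
            if tokens is not None:
                vocab.update(tokens)
    idf = {w: 0 for w in vocab}  # document-frequency counters, one per term
    doc_len, corpus, all_words, idf = get_idf_weights(articles, idf)
    # corpus[i] maps each term of sentence i to its within-sentence count (TF);
    # idf[w] is the number of sentences (treated as documents) containing w.
    return doc_len, corpus, all_words, idf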
def dispose(context, question, all_sentences, embeddings_index, emb_size):
    """
    Run the unsupervised alignment-based iterative evidence algorithm step by step.
    :param context: raw context data
    :param question: current question
    :param all_sentences: sentences from all articles (as a nested list)
    :param embeddings_index: embedding lookup table
    :param emb_size: embedding dimensionality, 300 by default
    :return: the evidence sentences selected by a single evidence chain,
             or by multiple evidence chains
    """
    question_tokens = sentence_segmentation(question, flag=1)
    # question_idf = get_idf(question_tokens, context)
    with open("./Data/GovernmentQA_IDF_values.json") as json_file:
        idf_values = json.load(json_file)
    final_indexes = get_iterative_alignment_justifications(
        question_tokens, all_sentences, idf_values, embeddings_index,
        max_iteration=6, emb_size=emb_size)
    # # Multiple evidence chains
    # final_indexes = get_iterative_alignment_justifications_non_parametric_parallel_evidence(
    #     question_tokens, all_sentences, idf_values, embeddings_index,
    #     parallel_evidence_num=3, max_iteration=6, emb_size=emb_size)
    justifications = []
    for final_index in final_indexes:
        index_article, index_sentence = get_index(final_index, all_sentences)
        # print(all_sentences[index_article][index_sentence])
        justifications.append(all_sentences[index_article][index_sentence])
    selected = "。".join(justifications) + "。"
    return selected
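# Illustrative driver for the five-argument dispose() defined immediately above
# (single evidence chain). load_embeddings and the data below are hypothetical
# stand-ins; the function itself only needs the 300-dimensional embeddings_index
# and the nested per-article sentence list.
def _demo_dispose_single_chain():
    embeddings_index = load_embeddings("./Data/word_vectors.txt")  # hypothetical loader
    all_sentences = [
        ["政府发布了新的补贴政策。", "补贴申请需要在线提交。"],
        ["市民可以通过政务服务网查询办理进度。"],
    ]
    question = "补贴申请如何提交?"
    # context is only consumed by the commented-out get_idf() call, so None is safe here.
    selected = dispose(None, question, all_sentences, embeddings_index, emb_size=300)
    return selected  # evidence sentences joined by "。"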
def get_alignment_justification(question_tokens, all_sentences, embeddings_index, emb_size,
                                idf_values, query_matrix_bert, sentences_matrix_bert):
    """
    Retrieve aligned evidence sentences for a query.
    :param question_tokens: tokenized question (or reformulated query)
    :param all_sentences: sentences from all articles (as a flat list)
    :param embeddings_index: embedding lookup table
    :param emb_size: embedding dimensionality, 300 by default
    :param idf_values: IDF value of every term across all articles
    :param query_matrix_bert: BERT embedding of the query
    :param sentences_matrix_bert: BERT embeddings of all candidate sentences
    :return: sentence indexes sorted by alignment score in descending order
             (positions within the flat sentence list);
             the query terms left unmatched by the top-scoring sentence;
             the unmatched query terms for every sentence
    """
    ques_matrix, ques_idf_mat, ques_tokens_nf, ques_tokens_found = get_sentence_embedding(
        question_tokens, embeddings_index, emb_size, idf_values, ques_text=1)
    justification_ques_remaining_terms = {}
    final_alignment_scores = []
    num_remaining_terms = []
    # index_sentence is the position of the current sentence within the candidate list
    for index_sentence, justification in enumerate(all_sentences):
        just_tokens = sentence_segmentation(justification, flag=1)
        if just_tokens is None:
            try:
                all_sentences.remove(justification)
            except ValueError:
                pass
            continue
        # Embed the candidate evidence sentence
        just_matrix, just_tokens_nf, just_tokens_found = get_sentence_embedding(
            just_tokens, embeddings_index, emb_size, idf_values)
        # Align the evidence sentence against the question: record matched tokens
        # and the remaining unmatched question tokens
        index = index_sentence
        justification_ques_remaining_terms.update(
            {index: compute_alignment_vector(
                ques_matrix, ques_tokens_nf, ques_tokens_found, just_matrix, just_tokens_nf)})
        # Alignment score of the current sentence against the question
        index_score = compute_alignment_fusion_score(
            ques_matrix, ques_tokens_nf, ques_idf_mat, just_matrix, just_tokens_nf,
            idf_values, query_matrix_bert, sentences_matrix_bert[index_sentence])
        num_remaining_terms.append(len(justification_ques_remaining_terms[index]))
        final_alignment_scores.append(index_score)
    # All sentence indexes, ranked by score in descending order
    all_indexes = list(np.argsort(final_alignment_scores)[::-1])  # [:subgraph_size]
    # # all_indexes[0] is the index of the highest-scoring sentence within the flat list;
    # # map it back to its (article, sentence) position in the context to look up
    # # justification_ques_remaining_terms[index] for that evidence sentence
    # index_article, index_sentence = get_index(all_indexes[0], all_sentences)
    index = all_indexes[0]
    return all_indexes, justification_ques_remaining_terms[index], justification_ques_remaining_terms
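# Sketch of consuming get_alignment_justification()'s outputs for one query. The
# arguments are assumed to have been prepared as in the BM25-filtered dispose() below;
# nothing here is the project's own code.
def _demo_alignment_ranking(question_tokens, flat_sentences, embeddings_index,
                            idf_values, query_matrix_bert, sentences_matrix_bert):
    ranked, top_remaining, remaining_all = get_alignment_justification(
        question_tokens, flat_sentences, embeddings_index, 300, idf_values,
        query_matrix_bert, sentences_matrix_bert)
    best_sentence = flat_sentences[ranked[0]]  # highest fusion (alignment + BERT) score
    # top_remaining holds the query terms the best sentence failed to cover;
    # they seed the reformulated query of the next retrieval iteration.
    return best_sentence, top_remaining, remaining_all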
def dispose(context, question, all_sentences, all_sentences_matrix_bert, model, tokenizer,
            embeddings_index, emb_size, idf_values, bm25model, top_k=20):
    """
    Run the unsupervised alignment-based iterative evidence algorithm step by step.
    :param context: raw context data
    :param question: current question
    :param all_sentences: sentences from all articles (as a nested list)
    :param all_sentences_matrix_bert: BERT embeddings of all sentences, nested like all_sentences
    :param model: BERT model used to embed the question
    :param tokenizer: BERT tokenizer
    :param embeddings_index: word embedding lookup table
    :param emb_size: embedding dimensionality, 300 by default
    :param idf_values: IDF value of every term across all articles
    :param bm25model: pre-built BM25 model over all sentences
    :param top_k: number of highest-scoring BM25 sentences to keep as candidates
    :return: the evidence sentences selected by a single evidence chain,
             or by multiple evidence chains
    """
    question_tokens = sentence_segmentation(question, flag=1)
    query_matrix_bert = get_sentence_embedding_bert(model, tokenizer, question)
    # Keep only the top_k sentences by BM25 score as retrieval candidates
    scores = bm25model.get_scores(question_tokens)
    idx_list = np.array(scores).argsort()[-top_k:][::-1].tolist()
    new_sentences = []
    sentences_matrix_bert = []
    for index in idx_list:
        # Map the flat BM25 index back to its (article, sentence) position
        current_len = index
        for index_article, justifications_per_article in enumerate(all_sentences):
            if current_len >= len(justifications_per_article):
                current_len -= len(justifications_per_article)
                continue
            else:
                new_sentences.append(all_sentences[index_article][current_len])
                sentences_matrix_bert.append(all_sentences_matrix_bert[index_article][current_len])
                break
    # question_idf = get_idf(question_tokens, context)
    final_indexes = get_iterative_alignment_justifications(
        question_tokens, new_sentences, idf_values, embeddings_index,
        query_matrix_bert, sentences_matrix_bert, max_iteration=6, emb_size=emb_size)
    # # Multiple evidence chains
    # final_indexes = get_iterative_alignment_justifications_non_parametric_parallel_evidence(
    #     question_tokens, new_sentences, idf_values, embeddings_index,
    #     parallel_evidence_num=3, max_iteration=6, emb_size=emb_size)
    justifications = []
    for final_index in final_indexes:
        justifications.append(new_sentences[int(final_index)])
    selected = "。".join(justifications) + "。"
    return selected
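# Illustrative end-to-end call of the BM25-filtered dispose() above. The BM25 index is
# assumed to come from the rank_bm25 package (the function only relies on get_scores());
# bert_model, bert_tokenizer and all_sentences_matrix_bert stand in for whatever produced
# the BERT sentence embeddings elsewhere in the project.
def _demo_dispose_with_bm25(question, all_sentences, all_sentences_matrix_bert,
                            bert_model, bert_tokenizer, embeddings_index, idf_values):
    from rank_bm25 import BM25Okapi  # assumed BM25 implementation

    # Flatten the nested articles into one sentence list and index it with BM25,
    # tokenizing the same way the question is tokenized inside dispose().
    flat_sentences = [s for article in all_sentences for s in article]
    tokenized = [sentence_segmentation(s, flag=1) or [] for s in flat_sentences]
    bm25model = BM25Okapi(tokenized)
    return dispose(None, question, all_sentences, all_sentences_matrix_bert,
                   bert_model, bert_tokenizer, embeddings_index, 300,
                   idf_values, bm25model, top_k=20)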
def pre(all_sentences):
    # input_files = ["train_456-fixedIds.json", "dev_83-fixedIds.json"]
    vocab = []
    for justifications_per_article in all_sentences:
        for sentence in justifications_per_article:
            just_tokens = sentence_segmentation(sentence, 0)
            if just_tokens is not None:
                vocab += just_tokens
    vocab = list(set(vocab))
    # vocab: the deduplicated tokens obtained by segmenting every sentence
    # all_sentences: sentences from all articles
    write_idf_values(vocab, all_sentences, "./Data/GovernmentQA_IDF_values.json")
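# One-off preprocessing sketch: pre() persists the IDF values that dispose() later
# reloads from ./Data/GovernmentQA_IDF_values.json. load_articles is a hypothetical
# loader returning the nested per-article sentence lists used throughout this module.
def _demo_build_idf_file():
    import json

    all_sentences = load_articles()  # hypothetical
    pre(all_sentences)
    with open("./Data/GovernmentQA_IDF_values.json") as f:
        return json.load(f)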
def one_iteration_block(final_indexes, first_iteration_index1, remaining_tokens1_3, ques_terms,
                        all_sentences, embedding_index, emb_size, idf_values,
                        query_matrix_bert, sentences_matrix_bert):
    """
    Run one iteration of evidence retrieval for an evidence chain.
    :param final_indexes: evidence sentence indexes collected so far by this chain
    :param first_iteration_index1: sentence indexes from the previous retrieval, sorted by score
    :param remaining_tokens1_3: query terms left unmatched by the previous iteration's top-scoring sentence
    :param ques_terms: tokenized question
    :param all_sentences: sentences from all articles (here a flat candidate list)
    :param embedding_index: embedding lookup table
    :param emb_size: embedding dimensionality, 300 by default
    :param idf_values: IDF value of every term across all articles
    :param query_matrix_bert: BERT embedding of the query
    :param sentences_matrix_bert: BERT embeddings of the candidate sentences
    :return: the chain's evidence sentence indexes after this iteration;
             this iteration's sentence indexes sorted by score;
             the query terms left unmatched by this iteration's selected sentence
    """
    # Look up the (article, sentence) position of the last index in final_indexes
    # index_article_inverse, index_sentence_inverse = get_index(final_indexes[-1], all_sentences)
    try:
        # selected_just_tokens = sentence_segmentation(
        #     all_sentences[index_article_inverse][index_sentence_inverse], flag=1)
        selected_just_tokens = sentence_segmentation(
            all_sentences[final_indexes[-1]], flag=1)
    except IndexError:
        # please pardon these error messages, they do not appear when running the main file
        # and were used only for debugging
        print("the error is coming because ", final_indexes, len(all_sentences))
        # index_article, index_sentence = get_index(final_indexes[0], all_sentences)
        # selected_just_tokens = sentence_segmentation(all_sentences[index_article][index_sentence], flag=1)
        selected_just_tokens = sentence_segmentation(all_sentences[final_indexes[0]], flag=1)
    if len(remaining_tokens1_3) <= 1:  # which can be considered a very short query
        new_query_terms = remaining_tokens1_3 + list(set(selected_just_tokens) - set(ques_terms))
    else:
        new_query_terms = remaining_tokens1_3
    # new_query_terms = remaining_tokens1_3 + list(set(selected_just_tokens))
    # second_iteration_index1: sentence indexes produced by this new iteration, sorted by score
    # remaining_tokens1_4: query terms left unmatched by this iteration's top-scoring sentence
    # remaining_tokens2_all: the unmatched query terms for every sentence in this iteration
    second_iteration_index1, remaining_tokens1_4, remaining_tokens2_all = get_alignment_justification(
        new_query_terms, all_sentences, embedding_index, emb_size, idf_values,
        query_matrix_bert, sentences_matrix_bert)
    # Walk this iteration's ranked candidate indexes
    for i1 in second_iteration_index1:
        # Skip any sentence already selected into final_indexes
        if i1 in final_indexes:
            continue
        # If every query term left over from the previous iteration is still unmatched for
        # this candidate, i.e. none of the previously remaining ques terms were covered in
        # this iteration, skip it (the commented-out return would end the chain here,
        # a cut-off the author set manually)
        # index_article, index_sentence = get_index(i1, all_sentences)
        # index = str(index_article) + ',' + str(index_sentence)
        if len(set(remaining_tokens1_3).intersection(set(
                remaining_tokens2_all[i1]))) == len(set(remaining_tokens1_3)):
            continue
            # return final_indexes, [], []
        # Otherwise take this candidate as the iteration's highest-scoring qualifying sentence
        final_indexes.append(i1)
        # and record the query terms it leaves unmatched
        remaining_tokens1_4 = remaining_tokens2_all[i1]
        break
    return final_indexes, second_iteration_index1, remaining_tokens1_4
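# Hedged sketch of how one_iteration_block() could be chained into an iterative loop.
# The project's own get_iterative_alignment_justifications() is defined elsewhere; this
# only illustrates the data flow between iterations under that assumption.
def _demo_iterative_chain(question_tokens, flat_sentences, embeddings_index, idf_values,
                          query_matrix_bert, sentences_matrix_bert, max_iteration=6):
    # Initial retrieval: rank all candidates against the original question.
    ranked, top_remaining, _ = get_alignment_justification(
        question_tokens, flat_sentences, embeddings_index, 300, idf_values,
        query_matrix_bert, sentences_matrix_bert)
    final_indexes = [ranked[0]]  # start the chain with the best-aligned sentence
    for _ in range(max_iteration):
        if not top_remaining:  # every question term has been covered; stop early
            break
        final_indexes, ranked, top_remaining = one_iteration_block(
            final_indexes, ranked, top_remaining, question_tokens, flat_sentences,
            embeddings_index, 300, idf_values, query_matrix_bert, sentences_matrix_bert)
    return final_indexes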