def precompute_forward_items_and_cache():
    # NOTE: 3 places below need to be switched when moving between train/dev/test.
    is_training = False
    doc_results = common.load_json(
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json")
        config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    doc_results = results_multihop_filtering(doc_results, multihop_retrieval_top_k=3, strict_mode=True)

    # db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    # data_list = common.load_json(config.TRAIN_FILE)

    append_baseline_context(doc_results, data_list)
    fitem_list = build_full_wiki_document_forward_item(doc_results, data_list, is_training, t_db_cursor, True)
    print(len(fitem_list))
    common.save_jsonl(fitem_list,
                      config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl")

def iterative_build_raw_text():
    wiki_whole_db_cursor = get_cursor(str(config.WHOLE_WIKI_DB))
    wiki_whole_db_cursor.execute("SELECT * from unnamed")

    total_count = 0
    insert_data_list = []

    db_path = config.WHOLE_WIKI_RAW_TEXT
    conn = sqlite3.connect(str(db_path))
    saving_cursor = conn.cursor()

    for key, value in tqdm(wiki_whole_db_cursor, total=TOTAL_ARTICLE_COUNT):
        cur_item = json.loads(value)
        raw_text = get_raw_text(cur_item)
        article_title = cur_item['title']

        for p_num, paragraph in enumerate(raw_text):
            assert isinstance(paragraph, list)
            p_str = json.dumps(paragraph)
            insert_data_list.append((article_title, p_num, p_str))

        total_count += 1
        conn.commit()

        if len(insert_data_list) >= 5000:
            insert_many_raw_text_table(saving_cursor, insert_data_list)
            insert_data_list = []

        # if total_count >= 10000:
        #     break

    print(total_count)
    # Flush whatever is left in the buffer.
    insert_many_raw_text_table(saving_cursor, insert_data_list)
    conn.commit()
    conn.close()

def get_sentence_pair(top_k, d_list, p_level_results_list, is_training, debug_mode=False):
    # The cursor is needed below by build_sentence_forward_item.
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    # dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_list = d_list
    # cur_dev_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_document_level/2019_4_17/dev_p_level_bert_v1_results.jsonl")
    cur_dev_eval_results_list = p_level_results_list

    if debug_mode:
        dev_list = dev_list[:100]
        id_set = set([item['_id'] for item in dev_list])
        cur_dev_eval_results_list = [item for item in p_level_results_list if item['qid'] in id_set]

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    cur_results_dict_top2 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=top_k,
                                                             filter_value=None)
    # print(cur_results_dict_top2)
    fitems = build_sentence_forward_item(cur_results_dict_top2, dev_list, is_training=is_training,
                                         db_cursor=t_db_cursor)
    return fitems

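
# A minimal usage sketch for get_sentence_pair (hypothetical call site; the results path below is a
# placeholder for wherever the paragraph-level predictions were saved, not a file from this repo):
#
#   d_list = common.load_json(config.DEV_FULLWIKI_FILE)
#   p_level_results = common.load_jsonl(...)  # paragraph-level selector output with 'qid'/'fid' and scores
#   sent_fitems = get_sentence_pair(top_k=5, d_list=d_list, p_level_results_list=p_level_results,
#                                   is_training=False, debug_mode=True)
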
def inspect_upstream_eval():
    is_training = True
    debug_mode = True

    d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
    in_file_name = config.PRO_ROOT / 'saved_models/05-12-08:44:38_mtr_open_qa_p_level_(num_train_epochs:3)/i(2000)|e(2)|squad|top10(0.6909176915799432)|top20(0.7103122043519394)|seed(12)_eval_results.jsonl'
    cur_eval_results_list = common.load_jsonl(in_file_name)

    top_k = 10
    filter_value = 0.1
    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)
    match_type = 'string'

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['question'] for item in d_list])
        cur_eval_results_list = [item for item in cur_eval_results_list if item['qid'] in id_set]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'question')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_d_o_dict,
                                                          'qid', 'fid', check=True)

    cur_results_dict_top10 = od_sample_utils.select_top_k_and_to_results_dict(copied_d_o_dict,
                                                                              score_field_name='prob',
                                                                              top_k=top_k,
                                                                              filter_value=filter_value)

    forward_example_items = build_open_qa_forword_item(cur_results_dict_top10, d_list, is_training,
                                                       t_cursor, match_type)
    print(forward_example_items)

def whole_wiki_pages_analysis():
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    with SqliteDict(str(config.WHOLE_WIKI_DB), flag='r', encode=json.dumps, decode=json.loads) as whole_wiki_db:
        for key, value in whole_tokenized_db_cursor:
            valid_page = True
            item = json.loads(value)
            # print(item)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:
                valid_page = False
                # print(whole_wiki_db[article_title])
                # This page is not valid.

            article_term_list = []
            article_poss_list = []
            title_term_list = []
            title_poss_list = []
            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)
            for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                    if p_i == 0:
                        # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # Terms in the title are not added to the abstract and article term lists.
                    else:
                        if p_i == abs_index:
                            # The terms are in the abstract.
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)
                        article_term_list.extend(sent_text)
                        article_poss_list.extend(sent_poss)

            print("Title:", title_term_list, title_poss_list)
            print("Title:(ngram):", get_ngrams(title_term_list, title_poss_list, 3,
                                               included_tags=POS_INCLUDED))

def prepare_forward_data(dataset_name, tag, is_training, upstream_top_k=20, distant_gt_top_k=2,
                         down_sample_ratio=None, debug=False):
    if dataset_name == 'webq' and tag == 'test':
        gt_d_list_path = config.OPEN_WEBQ_TEST_GT
    elif dataset_name == 'webq' and tag == 'train':
        gt_d_list_path = config.OPEN_WEBQ_TRAIN_GT
    elif dataset_name == 'curatedtrec' and tag == 'test':
        gt_d_list_path = config.OPEN_CURATEDTERC_TEST_GT
    elif dataset_name == 'curatedtrec' and tag == 'train':
        gt_d_list_path = config.OPEN_CURATEDTERC_TRAIN_GT
    elif dataset_name == 'squad' and tag == 'dev':
        gt_d_list_path = config.OPEN_SQUAD_DEV_GT
    elif dataset_name == 'squad' and tag == 'train':
        gt_d_list_path = config.OPEN_SQUAD_TRAIN_GT
    elif dataset_name == 'wikimovie' and tag == 'test':
        gt_d_list_path = config.OPEN_WIKIM_TEST_GT
    elif dataset_name == 'wikimovie' and tag == 'train':
        gt_d_list_path = config.OPEN_WIKIM_TRAIN_GT
    else:
        raise NotImplementedError()

    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)

    # debug = False
    # upstream_top_k = 20
    # distant_gt_top_k = 2
    # down_sample_ratio = None

    if dataset_name != 'wikimovie':
        upstream_d_list_before_filter = common.load_jsonl(
            config.PRO_ROOT / f"data/p_{dataset_name}/tf_idf_p_level/{dataset_name}_{tag}_para_tfidf.jsonl")
    else:
        upstream_d_list_before_filter = common.load_jsonl(
            config.PRO_ROOT / f"data/p_{dataset_name}/kwm_p_level/{dataset_name}_{tag}_kwm_tfidf.jsonl")

    if debug:
        upstream_d_list_before_filter = upstream_d_list_before_filter[:50]

    upstream_d_list = top_k_filter_score_list(upstream_d_list_before_filter, top_k=upstream_top_k)
    upstream_d_dict = list_dict_data_tool.list_to_dict(upstream_d_list, 'question')

    gt_d_list = common.load_jsonl(gt_d_list_path)
    gt_d_dict = list_dict_data_tool.list_to_dict(gt_d_list, 'question')

    distant_gt_item_list = get_distant_top_k_ground_truth(gt_d_dict, upstream_d_list_before_filter,
                                                          top_k=distant_gt_top_k)
    distant_gt_item_dict = list_dict_data_tool.list_to_dict(distant_gt_item_list, 'qid')

    fitems_list = build_p_level_forward_item(upstream_d_dict, distant_gt_item_dict, upstream_d_list,
                                             is_training, t_cursor)

    if is_training:
        return down_sample_neg(fitems_list, down_sample_ratio)
    else:
        return down_sample_neg(fitems_list, None)

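
# A minimal usage sketch for prepare_forward_data (hypothetical call site; the down-sample ratio is an
# illustrative value, not one taken from the original training scripts):
#
#   train_fitems = prepare_forward_data('squad', 'train', is_training=True,
#                                       upstream_top_k=20, distant_gt_top_k=2,
#                                       down_sample_ratio=0.25, debug=True)
#   dev_fitems = prepare_forward_data('squad', 'dev', is_training=False, debug=True)
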
def iterative_build():
    # wiki_abs_db_cursor = get_cursor(str(config.ABS_WIKI_DB))
    wiki_whole_db_cursor = get_cursor(str(config.WHOLE_WIKI_DB))
    wiki_whole_db_cursor.execute("SELECT * from unnamed")

    total_count = 0
    cur_count = 0

    with SqliteDict(str(config.WHOLE_PROCESS_FOR_RINDEX_DB), encode=json.dumps, decode=json.loads) as whole_rindex_db:
        for key, value in tqdm(wiki_whole_db_cursor, total=TOTAL_ARTICLE_COUNT):
            cur_item = json.loads(value)
            # print(cur_item)
            clean_text = item_get_clean_text(cur_item)
            # print(clean_text)

            new_item = dict()
            new_item['title'] = cur_item['title']

            flatten_article_tokens = []
            for p_i, paragraph in enumerate(clean_text):
                # flatten_paragraph_tokens = []
                # paragraph_poss = []
                for s_i, sentence in enumerate(paragraph):
                    flatten_article_tokens.extend(sentence)
                # flatten_article_tokens.extend(flatten_paragraph_tokens)

            flatten_article_poss = spacy_get_pos(flatten_article_tokens)

            cur_ptr = 0
            article_poss = []
            for p_i, paragraph in enumerate(clean_text):
                paragraph_poss = []
                for s_i, sentence in enumerate(paragraph):
                    sentence_poss = []
                    for _ in sentence:
                        sentence_poss.append(flatten_article_poss[cur_ptr])
                        cur_ptr += 1
                    paragraph_poss.append(sentence_poss)
                article_poss.append(paragraph_poss)

            new_item['clean_text'] = clean_text
            new_item['poss'] = article_poss

            whole_rindex_db[new_item['title']] = new_item
            cur_count += 1

            if cur_count % 5000 == 0:
                whole_rindex_db.commit()

        whole_rindex_db.commit()
        whole_rindex_db.close()

def get_open_qa_item_with_upstream_paragraphs(d_list, cur_eval_results_list, is_training,
                                              tokenizer: BertTokenizer,
                                              max_context_length, max_query_length, doc_stride=128,
                                              debug_mode=False, top_k=10, filter_value=0.1,
                                              match_type='string'):
    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['question'] for item in d_list])
        cur_eval_results_list = [item for item in cur_eval_results_list if item['qid'] in id_set]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'question')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_d_o_dict,
                                                          'qid', 'fid', check=True)

    cur_results_dict_top10 = od_sample_utils.select_top_k_and_to_results_dict(copied_d_o_dict,
                                                                              score_field_name='prob',
                                                                              top_k=top_k,
                                                                              filter_value=filter_value)

    forward_example_items = build_open_qa_forword_item(cur_results_dict_top10, d_list, is_training,
                                                       t_cursor, match_type)
    forward_example_items = format_convert(forward_example_items, is_training)
    fitems_dict, read_fitems_list = span_preprocess_tool.eitems_to_fitems(forward_example_items, tokenizer,
                                                                          is_training, max_context_length,
                                                                          max_query_length, doc_stride, False)

    return fitems_dict, read_fitems_list, cur_results_dict_top10['pred_p_list']

def get_qa_item_with_upstream_sentence(d_list, sentence_level_results, is_training,
                                       tokenizer: BertTokenizer,
                                       max_context_length, max_query_length, doc_stride=128,
                                       debug_mode=False, top_k=5, filter_value=0.2):
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['_id'] for item in d_list])
        sentence_level_results = [item for item in sentence_level_results if item['qid'] in id_set]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, '_id')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(sentence_level_results, copied_d_o_dict,
                                                          'qid', 'fid', check=True)

    cur_results_dict = select_top_k_and_to_results_dict(copied_d_o_dict, top_k=top_k,
                                                        score_field_name='prob',
                                                        filter_value=filter_value,
                                                        result_field='sp')

    forward_example_items = build_qa_forword_item(cur_results_dict, d_list, is_training, t_db_cursor)
    forward_example_items = format_convert(forward_example_items, is_training)
    fitems_dict, read_fitems_list = span_preprocess_tool.eitems_to_fitems(forward_example_items, tokenizer,
                                                                          is_training, max_context_length,
                                                                          max_query_length, doc_stride, False)

    return fitems_dict, read_fitems_list, cur_results_dict['sp']

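
# A minimal usage sketch for the two upstream-to-QA converters above (hypothetical call site; the
# sentence-level results path is a placeholder, and the max lengths are illustrative values only):
#
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#   d_list = common.load_json(config.DEV_FULLWIKI_FILE)
#   s_level_results = common.load_jsonl(...)  # output of the sentence-level selector with 'qid'/'fid'/'prob'
#   fitems_dict, read_fitems, sp_dict = get_qa_item_with_upstream_sentence(
#       d_list, s_level_results, is_training=False, tokenizer=tokenizer,
#       max_context_length=320, max_query_length=64, top_k=5, filter_value=0.2)
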
def iterative_build_raw_text():
    wiki_whole_db_cursor = get_cursor(str(config.WHOLE_WIKI_DB))
    wiki_whole_db_cursor.execute("SELECT * from unnamed")

    total_count = 0
    cur_count = 0

    with SqliteDict(str(config.WHOLE_WIKI_RAW_TEXT), encode=json.dumps, decode=json.loads) as whole_rindex_db:
        for key, value in tqdm(wiki_whole_db_cursor, total=TOTAL_ARTICLE_COUNT):
            cur_item = json.loads(value)
            raw_text = get_raw_text(cur_item)

            new_item = dict()
            new_item['title'] = cur_item['title']
            new_item['raw_text'] = raw_text

            whole_rindex_db[new_item['title']] = new_item
            cur_count += 1

            if cur_count % 5000 == 0:
                whole_rindex_db.commit()
                # break

        whole_rindex_db.commit()
        whole_rindex_db.close()

def lucene_indexing():
    lucene.initVM()

    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    indexDir = SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED)))
    analyzer = PorterStemmerAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    lprint("Building lucene index ...")

    with SqliteDict(str(config.WHOLE_WIKI_DB), flag='r', encode=json.dumps, decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor, total=config.TOTAL_ARTICLE_NUMBER_WHOLE):
            item = json.loads(value)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            # TODO: change it to extract abstract wiki?
            # The abstract is taken as the first paragraph whose length is >= 50, which is a bit weird.
            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:
                # Document too short; only the title terms will be indexed.
                valid_page = False

            # only title
            title_term_list = []
            title_poss_list = []

            # only abstract content
            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)
            for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                    if p_i == 0:
                        # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # Terms in the title are not added to the abstract and article terms.
                    else:
                        if p_i == abs_index:
                            # The terms are in the abstract.
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

            added_title = article_title
            added_text = " ".join(title_term_list + abstract_term_list)

            doc = Document()
            doc.add(Field("title", added_title, StoredField.TYPE))
            doc.add(Field("text", added_text, TextField.TYPE_STORED))
            writer.addDocument(doc)

    writer.close()

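
# Query-side sketch for the index built above (an assumption-laden illustration: it presumes the same
# PyLucene version, that PorterStemmerAnalyzer matches the analyzer used at indexing time, and the
# exact import paths may differ across Lucene releases):
#
#   from org.apache.lucene.index import DirectoryReader
#   from org.apache.lucene.search import IndexSearcher
#   from org.apache.lucene.queryparser.classic import QueryParser
#
#   reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED))))
#   searcher = IndexSearcher(reader)
#   query = QueryParser("text", PorterStemmerAnalyzer()).parse("barack obama")
#   for score_doc in searcher.search(query, 10).scoreDocs:
#       print(searcher.doc(score_doc.doc).get("title"), score_doc.score)
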
def whole_wiki_pages_title_raw_indexing():
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    title_abs_raw_indexdb = IndexDB()
    abs_file_name = config.PDATA_ROOT / "reverse_indexing/abs_rindexdb"

    content_indexdb = IndexDB()
    content_index_file_name = ''

    with SqliteDict(str(config.WHOLE_WIKI_DB), flag='r', encode=json.dumps, decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
            valid_page = True
            item = json.loads(value)
            # print(item)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:
                valid_page = False
                # print(whole_wiki_db[article_title])
                # This page is not valid.

            article_term_list = []
            article_poss_list = []
            title_term_list = []
            title_poss_list = []
            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)
            for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                    if p_i == 0:
                        # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # Terms in the title are not added to the abstract and article term lists.
                    else:
                        if p_i == abs_index:
                            # The terms are in the abstract.
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)
                        article_term_list.extend(sent_text)
                        article_poss_list.extend(sent_poss)

            # print("Title:", title_term_list, title_poss_list)
            title_ngram = get_ngrams(title_term_list, title_poss_list, 3,
                                     filter_fn=partial(filter_ngram, mode='any'),
                                     included_tags=POS_INCLUDED)

            abs_ngram = get_ngrams(abstract_term_list, abstract_poss_list, 3,
                                   filter_fn=partial(filter_ngram, mode='any'),
                                   included_tags=POS_INCLUDED)

            # print(article_title)
            # print(title_ngram)
            # print(abs_ngram)

            added_terms_num = 0
            for added_term in title_ngram + abs_ngram:
                title_abs_raw_indexdb.inverted_index.add(added_term, article_title)
                added_terms_num += 1

            title_abs_raw_indexdb.document_length_table.add(article_title, added_terms_num)
            # break

            # content_t_ngram = get_ngrams(title_term_list, title_poss_list, 3,
            #                              filter_fn=partial(filter_ngram, mode='any'),
            #                              included_tags=POS_INCLUDED)
            #
            # content_c_ngram = get_ngrams(abstract_term_list, abstract_poss_list, 3,
            #                              filter_fn=partial(filter_ngram, mode='any'),
            #                              included_tags=POS_INCLUDED)
            #
            # added_terms_num = 0
            # for added_term in content_t_ngram + content_c_ngram:
            #     content_indexdb.inverted_index.add(added_term, article_title)
            #     added_terms_num += 1
            #
            # content_indexdb.document_length_table.add(article_title, added_terms_num)

    title_abs_raw_indexdb.save_to_file(abs_file_name)

def whole_wiki_pages_title_raw_indexing_paragraph_level_unigram_size_limited_memory_saving():
    key_separator = '/'
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_p_level_unigram_rindexdb"

    count = 0

    # if limited_terms:
    #     limited_terms_set = load_wiki_abstract_terms(config.PRO_ROOT / "data/processed/wiki_abs_3gram_terms.txt")
    # else:
    #     limited_terms_set = []
    #
    # limited_terms_set = set(limited_terms_set)

    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []
        title_ngram = None

        assert len(article_clean_text) == len(article_poss)
        for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []

            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:
                    # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue  # Terms in the title are not added to the abstract and article term lists.
                else:  # p_i != 0
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list, title_poss_list, 1,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)

            if p_i >= 100:
                break

            paragraph_ngram = get_ngrams(paragraph_term_list, paragraph_poss_list, 1,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)

            if len(paragraph_ngram) == 0:
                continue

            added_terms_num = 0
            paragraph_key = key_separator.join((article_title, str(p_i)))

            for added_term in title_ngram + paragraph_ngram:
                # if added_term in limited_terms_set:
                #     wiki_p_level_indexdb.inverted_index.add(added_term, paragraph_key)
                #     added_terms_num += 1
                # elif ' ' not in added_term:
                hash_value_added_term = hash(added_term)
                hash_value_paragraph_key = hash(paragraph_key)
                wiki_p_level_indexdb.inverted_index.add(hash_value_added_term, hash_value_paragraph_key)
                added_terms_num += 1
                # else:
                #     pass

            hash_value_paragraph_key = hash(paragraph_key)
            wiki_p_level_indexdb.document_length_table.add(hash_value_paragraph_key, added_terms_num)

        count += 1
        # if count >= 1000:
        #     break

    wiki_p_level_indexdb.save_to_file(file_name, memory_saving=True)

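
# Note on the hashed keys used above (an editorial observation, not code from the original repo):
# Python's built-in hash() is randomized per process for str since Python 3.3, so an index that stores
# hash(term) / hash(paragraph_key) is only reusable across runs if PYTHONHASHSEED is fixed
# (e.g. `PYTHONHASHSEED=0 python build_index.py`) or a deterministic hash function is used instead.
# The key convention itself is simply:
#   paragraph_key = '/'.join(('Some Article Title', str(3)))   # -> 'Some Article Title/3'
#   key_id = hash(paragraph_key)
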
def whole_wiki_pages_title_raw_indexing_article_level_to_indexdb():
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    # wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_a_level_persistent_indexdb.db"
    index_db = IndexingDB(file_name)
    index_db.create_tables()

    count = 0
    term_title_items_buffer_list: List[Tuple[str, str, int]] = []
    title_items_buffer_list: List[Tuple[str, int]] = []

    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        article_term_title_dict: Dict[Tuple[str, str], int] = dict()
        article_title_dict: Dict[str, int] = dict()

        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []
        title_ngram = None
        article_ngram = []

        assert len(article_clean_text) == len(article_poss)
        for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []

            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:
                    # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue  # Terms in the title are not added to the abstract and article term lists.
                else:  # p_i != 0
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list, title_poss_list, 2,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)
                continue

            paragraph_ngram = get_ngrams(paragraph_term_list, paragraph_poss_list, 2,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)

            if len(paragraph_ngram) == 0:
                continue

            article_ngram.extend(paragraph_ngram)

            if p_i >= 60:
                break

        added_terms_num = 0
        for added_term in title_ngram + article_ngram:
            article_term_title_dict[(added_term, article_title)] = \
                article_term_title_dict.get((added_term, article_title), 0) + 1
            added_terms_num += 1

        article_title_dict[article_title] = added_terms_num

        count += 1
        if count >= 200:  # NOTE: debugging cap left in place; only the first 200 articles are indexed.
            break

        for (term, article_title), ovalue in article_term_title_dict.items():
            term_title_items_buffer_list.append((term, article_title, ovalue))

        for article_title, ovalue in article_title_dict.items():
            title_items_buffer_list.append((article_title, ovalue))

        if len(term_title_items_buffer_list) >= 1000:
            # Flush
            index_db.insert_many_items(term_title_items_buffer_list)
            index_db.insert_many_articles(title_items_buffer_list)
            term_title_items_buffer_list = []
            title_items_buffer_list = []

    index_db.insert_many_items(term_title_items_buffer_list)
    index_db.insert_many_articles(title_items_buffer_list)
    index_db.close()

def whole_wiki_pages_title_raw_indexing_article_level(limited_terms=True):
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_a_level_limited_gram_rindexdb"

    if limited_terms:
        limited_terms_set = load_wiki_abstract_terms(config.PRO_ROOT / "data/processed/wiki_abs_3gram_terms.txt")
    else:
        limited_terms_set = []

    limited_terms_set = set(limited_terms_set)

    count = 0

    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []
        title_ngram = None

        assert len(article_clean_text) == len(article_poss)

        # article_term_list = []
        # article_poss_list = []
        article_ngram = []

        for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []

            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:
                    # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue  # Terms in the title are not added to the abstract and article term lists.
                else:  # p_i != 0
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list, title_poss_list, 1,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)
                continue

            paragraph_ngram = get_ngrams(paragraph_term_list, paragraph_poss_list, 1,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)

            if len(paragraph_ngram) == 0:
                continue

            article_ngram.extend(paragraph_ngram)

            if p_i >= 80:
                break

        added_terms_num = 0
        for added_term in title_ngram + article_ngram:
            if added_term in limited_terms_set:
                wiki_p_level_indexdb.inverted_index.add(added_term, article_title)
                added_terms_num += 1
            elif ' ' not in added_term:
                wiki_p_level_indexdb.inverted_index.add(added_term, article_title)
                added_terms_num += 1

        wiki_p_level_indexdb.document_length_table.add(article_title, added_terms_num)

        count += 1
        # if count >= 5000:
        #     break

    wiki_p_level_indexdb.save_to_file(file_name)

def init_results_v8(data_list, gt_data_list, terms_based_resutls, g_score_dict,
                    match_filtering_k=3, term_retrieval_top_k=5, multihop_retrieval_top_k=None):
    # 2019-04-06
    # The complete v7 version of retrieval.
    ner_set = get_title_entity_set()

    # dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print("Total data length:")
    print(len(data_list))

    # We load the term-based results.
    print("Load term-based results.")
    terms_based_results_dict = dict()
    for item in terms_based_resutls:
        terms_based_results_dict[item['qid']] = item

    # Load the tf-idf score function:
    # g_score_dict = dict()
    # load_from_file(g_score_dict,
    #                config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip the keyword if it is filtered by the function above or is a stopword
        else:
            # matched_key_word is the original matched span. We need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip the keyword if it is filtered by the function above or is a stopword
        else:
            if kw in keyword_processor:
                # If the kw already exists in the keyword_processor, we update its dict to add more disamb items.
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
            else:  # If not, we add it to keyword_processor_disamb, which is set to be lower priority.
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    doc_pred_dict = {'sp_doc': dict(), 'raw_retrieval_set': dict()}
    # doc_pred_dict_p1 = {'sp_doc': dict(), 'raw_retrieval_set': dict()}

    for item in tqdm(data_list):
        question = item['question']
        qid = item['_id']

        query_terms = get_query_ngrams(question)
        valid_query_terms = [term for term in query_terms if term in g_score_dict]

        retrieved_set = RetrievedSet()

        # This method will add the keyword matching results in-place to retrieved_set.
        get_kw_matching_results(question, valid_query_terms, retrieved_set, match_filtering_k,
                                g_score_dict, keyword_processor, keyword_processor_disamb)

        # Then we add the term-based matching results.
        added_count = 0
        for score, title in sorted(terms_based_results_dict[qid]['doc_list'],
                                   key=lambda x: x[0], reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        # Add hyperlinked pages:
        # For finding hyperlinked pages we use both the keyword matching and the disambiguation group results.
        finded_keys_set = set(retrieved_set.to_id_list())
        # 3. We then add some hyperlinked titles.
        db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)

        for keyword_group in finded_keys_set:
            flatten_hyperlinks = []
            hyperlinks = wiki_db_tool.get_first_paragraph_hyperlinks(db_cursor, keyword_group)
            for hls in hyperlinks:
                flatten_hyperlinks.extend(hls)

            for hl in flatten_hyperlinks:
                potential_title = hl.href
                if potential_title in ner_set and not filter_word(potential_title) and \
                        not filter_document_id(potential_title):  # important bug fix: 'or' changed to 'and'
                    # hyperlinked_title.append(potential_title)
                    # if not filter_document_id(potential_title):
                    score = get_query_doc_score(valid_query_terms, potential_title, g_score_dict)
                    retrieved_set.add_item(retrieval_utils.RetrievedItem(potential_title, 'kwm_disamb_hlinked'))
                    retrieved_set.score_item(potential_title, score, namespace=keyword_group + '-2-hop')

        for keyword_group in finded_keys_set:
            retrieved_set.sort_and_filter(keyword_group + '-2-hop', top_k=multihop_retrieval_top_k)

        doc_pred_dict['sp_doc'][qid] = retrieved_set.to_id_list()
        doc_pred_dict['raw_retrieval_set'][qid] = retrieved_set

    if gt_data_list is not None:
        ext_hotpot_eval.eval(doc_pred_dict, gt_data_list)

    return doc_pred_dict

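
# A minimal call sketch for init_results_v8 (hypothetical call site; the term-based retrieval results
# path is a placeholder, and the score file path is taken from the commented-out loading code above):
#
#   data_list = common.load_json(config.DEV_FULLWIKI_FILE)
#   terms_based_results = common.load_jsonl(...)   # per-question {'qid': ..., 'doc_list': [(score, title), ...]}
#   g_score_dict = dict()
#   load_from_file(g_score_dict,
#                  config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
#   doc_results = init_results_v8(data_list, data_list, terms_based_results, g_score_dict,
#                                 match_filtering_k=3, term_retrieval_top_k=5, multihop_retrieval_top_k=3)
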
def inspect_oracle_answer_text(append_head=True):
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    total, error = 0, 0

    for item in dev_list:
        qid = item['_id']
        query = item['question']
        answer = item['answer']
        o_contexts = item['context']
        supporting_facts = item['supporting_facts']
        # print(query)
        # print(answer)

        supporting_doc = set([fact[0] for fact in item['supporting_facts']])
        selected_fact = []
        sentid2sent_token_dict = dict()

        for doc in supporting_doc:
            # if doc in gt_doc:
            #     continue
            text_item = wiki_db_tool.get_item_by_key(db_cursor, key=doc)
            context = wiki_db_tool.get_first_paragraph_from_clean_text_item(text_item,
                                                                            flatten_to_paragraph=False,
                                                                            skip_first=True)
            for i, sentence_token in enumerate(context):
                # sentence_text = sentence_token
                if len(sentence_token) != 0:
                    selected_fact.append([doc, i])
                    sentid2sent_token_dict[(doc, i)] = sentence_token

        # Shuffle doc ordering.
        supporting_doc = list(supporting_doc)
        random.shuffle(supporting_doc)
        # end

        shuffled_supporting_fact_list = []
        supporting_facts = sorted(supporting_facts, key=lambda x: (x[0], x[1]))
        for doc in supporting_doc:
            for fact in supporting_facts:
                if fact[0] == doc:
                    shuffled_supporting_fact_list.append(fact)

        assert len(shuffled_supporting_fact_list) == len(supporting_facts)

        # print(supporting_facts)
        # print(shuffled_supporting_fact_list)
        # print("Sup Fact.")

        cur_doc = None
        context_token_list = []
        for doc, i in shuffled_supporting_fact_list:
            if (doc, i) not in sentid2sent_token_dict:
                print(f"Potential Error: {(doc, i)} does not exist in DB.")
                continue
            # print((doc, i), sentid2sent_token_dict[(doc, i)])
            paragraph_token_list = sentid2sent_token_dict[(doc, i)]
            if cur_doc != doc and append_head and i != 0:
                context_token_list = context_token_list + doc.split(' ') + ['.'] + paragraph_token_list
            else:
                context_token_list += paragraph_token_list
            cur_doc = doc  # Track the current document so the title is only prepended once per document.

        # print(context_token_list)

        context_matcher = ContextAnswerMatcher(context_token_list, uncase=True)
        context, answer_start_list = context_matcher.concate_and_return_answer_index(answer, match_type='left')

        if len(answer_start_list) > 1:
            error += 1

        if len(answer_start_list) == 0 and answer != 'yes' and answer != 'no':
            print("Error")
            print("Query:", query)
            print("Answer:", answer)
            print("Sp fact:", shuffled_supporting_fact_list)
            print("Context:", context)
            context_matcher = ContextAnswerMatcher(context_token_list, uncase=True)
            context, answer_start_list = context_matcher.concate_and_return_answer_index(answer)

        # print(sentid2sent_token_dict)
        # for title, number in supporting_facts:
        #     print(title, number)

        total += 1

    print(error, total)