def toy_init_results():
    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print(len(dev_fullwiki_list))

    # Load rindex file
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(
        abs_rindexdb.score_db['default-tf-idf'],
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # Load rindex finished

    saved_items = []
    for item in tqdm(dev_fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = get_top_ranked_tf_idf_doc(question, abs_rindexdb, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list
        saved_items.append(saved_tfidf_item)

    common.save_jsonl(
        saved_items,
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
def experiment_test_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_test.jsonl")
    g_score_dict = dict()
    load_from_file(
        g_score_dict,
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    # For the test set there is no ground-truth data, so we pass None.
    doc_retri_pred_dict = init_results_v8(
        data_list, None, terms_based_results_list, g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(doc_retri_pred_dict,
                     "hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict, multihop_retrieval_top_k=multihop_retrieval_top_k)

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict, "hotpot_test_doc_retrieval_v8.json")
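# The length-statistics block above is repeated in several experiment functions
# in this file. A small helper like this hypothetical `print_len_stats` (not
# part of the original code) could replace the duplicated prints:
def print_len_stats(len_list, header=None):
    # Print the size distribution and summary statistics of the per-question
    # retrieved document sets.
    if header is not None:
        print(header)
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))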
def sanity_check():
    # pre_compute_abs_if_idf_scores()

    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(
        abs_rindexdb.score_db['default-tf-idf'],
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    # exit(0)
    # abs_rindexdb.pre_compute_scores()
    # save_to_file(abs_rindexdb.score_db['default-tf-idf'],
    #              config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # exit(0)

    query = "What science fantasy young adult series, told in first person, " \
            "has a set of companion books narrating the stories of enslaved worlds and alien species?"
    tokens = [t.text for t in nlp(query)]
    query_ngrams = get_ngrams(tokens, None, 3,
                              filter_fn=partial(filter_ngram, mode='any'),
                              included_tags=None)
    # print(query_ngrams)

    candidate_pages_set = set()
    valid_terms = []
    for q_ngram in query_ngrams:
        candidate_pages = abs_rindexdb.inverted_index.get_containing_document(q_ngram)
        if candidate_pages is not None:
            valid_terms.append(q_ngram)
            candidate_pages_set |= candidate_pages

    print('Animorphs' in candidate_pages_set)
    print(abs_rindexdb.get_relevant_document(['Animorphs'], valid_terms))
    doc_list = abs_rindexdb.get_relevant_document(candidate_pages_set, valid_terms, top_k=100)

    # print(candidate_pages_set)
    print(query_ngrams)
    print(len(candidate_pages_set))
    print(doc_list)
def load_tf_idf_score_to_redis_cache():
    tf_idf_score_redis = redis.StrictRedis(host='localhost', port=6379, db=0,
                                           decode_responses=True)
    redis_score_index = RedisScoreIndex(tf_idf_score_redis)

    # abs_rindexdb = IndexDB()
    # abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    # print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    # abs_rindexdb.inverted_index.build_Nt_table()

    score_db = dict()
    load_from_file(
        score_db,
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    redis_score_index.save_scored_index(score_db)
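# Quick sanity check after loading (a sketch; `check_redis_cache_size` is a
# hypothetical helper, and it assumes RedisScoreIndex writes one Redis key per
# scored term, so DBSIZE should roughly match the score file's term count):
def check_redis_cache_size():
    r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
    print("Keys in Redis cache:", r.dbsize())  # DBSIZE counts all keys in db 0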
def single_process_fever_with_dict(start=0, end=None, tag='dev'):
    task_name = 'fever'
    debug = False
    top_k = 20
    query_fieldname = 'claim'
    id_fieldname = 'id'
    debug_name = 'debug' if debug else ""

    g_score_dict = dict()
    g_score_dict = load_from_file(
        g_score_dict,
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    # Important: set the start/end range before running!
    print("Total length:", len(d_list))
    # start, end = 0, len(d_list)
    # Important end!

    print(f"Task:{task_name}, Tag:{tag}, TopK:{top_k}, Start/End:{start}/{end}")
    d_list = d_list[start:end]
    print("Data length:", len(d_list))
    if debug:
        d_list = d_list[:10]
        start, end = 0, 10
    print("Data length (post-filtering):", len(d_list))

    r_item_list = []
    incr_file = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_incr_({start},{end})_{debug_name}.jsonl"
    if incr_file.is_file():
        print("Warning: incremental save file already exists.")

    save_path: Path = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_({start},{end})_{debug_name}.jsonl"
    if save_path.is_file():
        print("Warning: save file already exists.")

    with open(incr_file, mode='w', encoding='utf-8') as out_f:
        process_func = partial(process_fever_item_with_score_dict, top_k=top_k,
                               query_field=query_fieldname, id_field=id_fieldname,
                               global_score_dict=g_score_dict)
        for item in tqdm(d_list, total=len(d_list)):
            r_item = process_func(item)
            r_item_list.append(r_item)
            # Write the processed item incrementally so partial progress
            # survives a crash. (The original wrote the raw `item` here,
            # which appears to be a bug unless process_func mutates in place.)
            out_f.write(json.dumps(r_item) + '\n')
            out_f.flush()

    print(len(r_item_list))
    common.save_jsonl(r_item_list, save_path)
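# Usage sketch (illustrative values): the start/end arguments let several
# processes shard the data and run in parallel, each writing its own
# "(start,end)"-tagged incremental and final jsonl files, e.g.:
#   single_process_fever_with_dict(start=0, end=50000, tag='train')
#   single_process_fever_with_dict(start=50000, end=None, tag='train')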
def save_wiki_abstract_terms(save_file):
    g_score_dict = dict()
    g_score_dict = load_from_file(
        g_score_dict,
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    with open(save_file, encoding='utf-8', mode='w') as out_f:
        for term in tqdm(g_score_dict.keys()):
            out_f.write(term + '\n')
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    d_tf_idf = common.load_jsonl(
        config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl")
    tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id')

    r_list = []
    ner_set = get_title_entity_set()

    g_score_dict = dict()
    load_from_file(
        g_score_dict,
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords rejected by the filters above (e.g. stopwords)
        else:
            # matched_key_word is the original matched span; we need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords rejected by the filters above (e.g. stopwords)
        else:
            if kw in keyword_processor:
                # If the kw already exists in keyword_processor, update its dict to add more disamb items.
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
            else:
                # Otherwise add it to keyword_processor_disamb, which has lower priority.
                matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    for item in tqdm(d_list):
        cur_id = str(item['id'])
        query = item['claim']
        query_terms = get_query_ngrams(query)
        valid_query_terms = [term for term in query_terms if term in g_score_dict]

        retrieved_set = RetrievedSet()
        get_kw_matching_results(query, valid_query_terms, retrieved_set,
                                match_filtering_k, g_score_dict,
                                keyword_processor, keyword_processor_disamb)

        tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list']

        added_count = 0
        for score, title in sorted(tf_idf_doc_list, key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title) \
                    and not title.startswith('List of '):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        predicted_docids = retrieved_set.to_id_list()
        # print(retrieved_set)
        # print(item['claim'], predicted_docids)

        r_item = dict()
        r_item['id'] = int(cur_id)
        r_item['claim'] = item['claim']
        r_item['predicted_docids'] = predicted_docids
        if tag != 'test':
            r_item['label'] = item['label']
        r_list.append(r_item)

    # r_list = common.load_jsonl('dev-debug.jsonl')

    # We need to modify the retrieved document ids for naming consistency.
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
    # Modification finished.

    # print(r_list[0:10])
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))
    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    common.save_jsonl(
        r_list,
        f'fever_term_based_retri_results_'
        f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl')

    mode = {'standard': False, 'check_doc_id_correct': True}
    # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)
    fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
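# Example invocation (hyper-parameter values are illustrative):
#   fever_retrieval_v0(term_retrieval_top_k=5, match_filtering_k=3, tag='dev')
# Judging from its keys, the mode dict above disables standard strict scoring
# and instead checks document-id correctness, i.e. document-level retrieval
# accuracy rather than the full FEVER score.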
def experiment_train_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5
    multihop_strict_mode = True
    debug_mode = None

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TRAIN_FILE)
    if debug_mode is not None:
        data_list = data_list[:debug_mode]

    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_train.jsonl")
    g_score_dict = dict()
    load_from_file(
        g_score_dict,
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    doc_retri_pred_dict = init_results_v8(
        data_list, data_list, terms_based_results_list, g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # common.save_json(doc_retri_pred_dict, f"hotpot_doc_retrieval_v8_before_multihop_filtering_{debug_mode}.json")
    common.save_json(doc_retri_pred_dict,
                     "hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict,
        multihop_retrieval_top_k=multihop_retrieval_top_k,
        strict_mode=multihop_strict_mode)

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    # common.save_json(new_doc_retri_pred_dict, f"hotpot_doc_retrieval_v8_{debug_mode}.json")
    common.save_json(new_doc_retri_pred_dict, "hotpot_train_doc_retrieval_v8.json")
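if __name__ == '__main__':
    # Entry-point sketch (not in the original code): uncomment one experiment
    # to run it. All of them assume the prebuilt reverse index under
    # config.PDATA_ROOT and the term-based tf-idf results under
    # config.RESULT_PATH, as used in the functions above.
    # toy_init_results()
    # experiment_test_full_wiki()
    # experiment_train_full_wiki()
    pass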