def expand_from_preext_sent_rule(self):
    if not hasattr(self, 'cursor'):
        self.cursor = fever_db.get_cursor()

    if not hasattr(self, 'preext_sent_dict'):
        d_list = read_jsonl(config.RESULT_PATH /
                            "sent_retri_nn/2018_07_17_16-34-19_r/train_scale(0.1).jsonl")
        self.preext_sent_dict = {item['id']: item for item in d_list}

    item = self.item
    # if len(item['prioritized_docids']) < 5:
    new_pdocids = []
    structured_docids_sent = {}
    sent_ids = self.preext_sent_dict[item['id']]['scored_sentids']
    for sent_id, score, probability in sent_ids:
        docid, sent_ind = sent_id.split('<SENT_LINE>')
        sent_ind = int(sent_ind)
        id_list, sent_list, sent_links = \
            fever_db.get_evidence(self.cursor, docid, sent_ind)
        sent_links = json.loads(sent_links)
        # Hyperlinks decode to flat [anchor_text, target_title] pairs;
        # keep only the target titles (second column).
        all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
        all_links = list(map(fever_db.reverse_convert_brc, all_links))
        all_links = list(map(lambda x: x.replace(' ', '_'), all_links))
        prio_docids = [(id_link, score) for id_link in all_links]
        new_pdocids.extend(prio_docids)
        structured_docids_sent.update({sent_id: prio_docids})

    item['prioritized_docids_sent'] = new_pdocids
    item['structured_docids_sent'] = structured_docids_sent
    return self
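# A minimal, self-contained sketch of the hyperlink-extraction step above,
# assuming `sent_links` decodes to a flat list of [anchor_text, target_title]
# pairs. The helper name `_extract_link_targets` and the toy input are
# hypothetical; the real code additionally applies fever_db.reverse_convert_brc
# to undo bracket encoding.
import json
import numpy as np

def _extract_link_targets(sent_links_json):
    """Keep only the target titles (second column) of the anchor/target pairs."""
    links = json.loads(sent_links_json)
    if not links:
        return []
    targets = np.array(links).reshape(-1, 2)[:, 1]
    return [t.replace(' ', '_') for t in targets]

# Example (illustrative input, not taken from the FEVER database):
#   _extract_link_targets('["the city", "New York City", "the state", "New York (state)"]')
#   -> ['New_York_City', 'New_York_(state)']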
def disambiguous_from_preext_sent_rule(self):
    if not hasattr(self, 'cursor'):
        self.cursor = fever_db.get_cursor()

    if not hasattr(self, 'preext_sent_dict'):
        d_list = read_jsonl(config.RESULT_PATH /
                            "sent_retri_nn/2018_07_17_16-34-19_r/train_sent.jsonl")
        self.preext_sent_dict = {item['id']: item for item in d_list}

    item = self.item
    if len(item['prioritized_docids']) > 60:
        # NOTE: the lookup key is left blank in the original and `sent_ids`
        # is never used; this branch appears to be unfinished.
        sent_ids = self.preext_sent_dict[item['id']]['']

    return self
def tf_idf_rank(args, top_k=5):
    dev_path = config.PRO_ROOT / \
        'results_old/doc_retri/docretri.basic.nopageview/dev.jsonl'
    cursor = get_cursor()
    d_list = read_jsonl(dev_path)
    d_list_test = d_list

    for i, item in enumerate(spcl(d_list_test)):
        all_sent = []
        all_ids = [it[0] for it in item['prioritized_docids']]
        try:
            # Build one pseudo-document per candidate doc id and fit an
            # online TF-IDF ranker over them.
            for doc_id in all_ids:
                r_list, _ = get_all_sent_by_doc_id(cursor, doc_id,
                                                   with_h_links=False)
                all_sent.append(' '.join(r_list))
            ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram,
                                          all_sent)
        except Exception as e:
            # Dump partial results before re-raising so the run can be inspected.
            if i - 1 >= 0:
                print(f'Early quit at {i-1} because of {e}')
            save_path = config.RESULT_PATH / \
                'doc_retri/docretri.tfidfrank/' \
                f'dev_quit_dump_{uuid4()}.json'
            DocRetrievalExperiment.dump_results(d_list_test[:i], save_path)
            raise e

        rank_ind, rank_score = \
            ranker.closest_docs(' '.join(item['claim_tokens']), k=100)
        id_score_dict = {docid: 0 for docid in all_ids}
        id_score_dict.update({all_ids[ri]: rs
                              for ri, rs in zip(rank_ind, rank_score)})
        item['prioritized_docids'] = [(k, v) for k, v in id_score_dict.items()]
        # Keep the top_k doc ids, sorted by descending score, then doc id.
        item['predicted_docids'] = \
            list(set([k for k, v
                      in sorted(item['prioritized_docids'],
                                key=lambda x: (-x[1], x[0]))][:top_k]))

    save_path = config.RESULT_PATH / 'doc_retri/docretri.tfidfrank/dev.json'
    DocRetrievalExperiment.dump_results(d_list_test, save_path)
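# A standalone sketch of the top-k selection used above (and in the pageview
# and aside variants below): sort (doc_id, score) pairs by descending score,
# breaking ties by doc id, then keep the first `top_k` ids. The helper name
# `_top_k_docids` and the toy scores are hypothetical.
def _top_k_docids(prioritized_docids, top_k=5):
    ranked = sorted(prioritized_docids, key=lambda x: (-x[1], x[0]))
    return list(set(k for k, _ in ranked[:top_k]))

# Example:
#   _top_k_docids([('A', 1.0), ('B', 3.5), ('C', 3.5), ('D', 0.2)], top_k=2)
#   -> ['B', 'C']  (list order is unspecified because of the set())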
def main():
    doc_exp = DocRetrievalExperimentTwoStep()
    d_list = read_jsonl(config.FEVER_DEV_JSONL)
    # d_list = read_jsonl(config.FEVER_TRAIN_JSONL)
    doc_exp.sample_answer_with_priority(d_list)
    doc_exp.print_eval(d_list)
    doc_exp.feed_sent_file(
        "../../results/"
        "sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl")
    doc_exp.find_sent_link_with_priority(d_list, predict=True)
    doc_exp.print_eval(d_list)
    # path = "../../results/doc_retri/docretri.spiral.aside/train.jsonl"
    # doc_exp.dump_results(d_list, path)

    from IPython import embed
    embed()
    import os
    os._exit(1)
def main():
    import os
    from chaonan_src._config import old_result_path
    from chaonan_src._utils.doc_utils import read_jsonl
    from chaonan_src._utils.spcl import spcl

    # pageview_path = os.path.join(config.RESULT_PATH,
    #                              'doc_retri/docretri.rawpageview/train.jsonl')
    pageview_path = config.RESULT_PATH / \
        'doc_retri/docretri.rawpageview/dev.jsonl'
    # ori_path = os.path.join(old_result_path,
    #                         'doc_retri/docretri.pageview/dev.jsonl')

    # d_list = read_jsonl(config.FEVER_DEV_JSONL)
    # item_rb_exp = ItemRuleRawPageview()
    # doc_exp = DocRetrievalExperiment(item_rb_exp)
    # doc_exp.sample_answer_with_priority(d_list)
    # doc_exp.dump_results(d_list, save_path)
    # doc_exp.print_eval(d_list)

    # d_list_ori = read_jsonl(ori_path)
    d_list = read_jsonl(pageview_path)
    # DocRetrievalExperiment.dump_results(d_list, pageview_path)
    # item_rb = ItemRuleRawPageview()

    top_k = 5
    for item in spcl(d_list):
        # Predict the top_k doc ids by pageview count (descending), then doc id.
        item['predicted_docids'] = \
            list(set([k for k, v
                      in sorted(item['docid_pageviews'],
                                key=lambda x: (-x[1], x[0]))][:top_k]))

    # DocRetrievalExperiment.dump_results(d_list, save_path)
    DocRetrievalExperiment.print_eval(d_list)

    from IPython import embed
    embed()
    os._exit(1)
def pageview_analysis():
    from chaonan_src._doc_retrieval.item_rules import ItemRuleBuilder
    from chaonan_src._utils.doc_utils import read_jsonl
    from utils.fever_db import convert_brc

    wiki_pv = WikiPageviews()
    d_list = read_jsonl(
        "../../../results/doc_retri/docretri.titlematch/dev.jsonl")

    gt_evidences, pre_evidences = [], []
    for item in d_list:
        gt_evidences.extend(
            ItemRuleBuilder.get_all_docid_in_evidence(item['evidence']))
        pre_evidences.extend([it[0] for it in item['prioritized_docids']])

    gt_evidences = set(gt_evidences)
    pre_evidences = set(pre_evidences)
    # Pageview counts for ground-truth vs. retrieved evidence doc ids.
    gt_count = [wiki_pv[convert_brc(it)] for it in gt_evidences]
    pre_count = [wiki_pv[convert_brc(it)] for it in pre_evidences]

    from IPython import embed
    embed()
    import os
    os._exit(1)
def feed_sent_score_result(self, path):
    d_list = read_jsonl(path)
    self.preext_sent_dict = {item['id']: item for item in d_list}
        if predict:
            # Union of the top_k doc ids from the original and the aside
            # priority lists.
            porg = \
                set([k for k, v
                     in sorted(item['prioritized_docids'],
                               key=lambda x: (-x[1], x[0]))][:top_k])
            paside = \
                set([k for k, v
                     in sorted(item['prioritized_docids_aside'],
                               key=lambda x: (-x[1], x[0]))][:top_k])
            item['predicted_docids'] = list(porg | paside)
            item['predicted_docids_origin'] = list(porg)
            item['predicted_docids_aside'] = list(paside)

        d_list[i] = item

    ws_second.close()


if __name__ == "__main__":
    from chaonan_src._utils.doc_utils import read_jsonl
    import config

    d_list = read_jsonl(config.FEVER_DEV_JSONL)
    d_list_test = d_list[:20]
    DocRetrievalClient.sample_answer_with_priority(d_list_test)
    DocRetrievalClient.print_eval(d_list_test)
    # DocRetrievalClient.feed_sent_file(
    #     "../../results"
    #     "/sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl")
    # DocRetrievalClient.find_sent_link_with_priority(d_list_test, predict=True)
    DocRetrievalExperiment.print_eval(d_list_test)
def eval():
    save_path = config.RESULT_PATH / \
        'doc_retri/docretri.tfidfrank/dev_toy.json'
    d_list = read_jsonl(save_path)
    DocRetrievalExperiment.print_eval(d_list)
def __init__(self, jl_path, train=False):
    self.d_list = read_jsonl(jl_path)
    self.train = train
    self.initialized = False
    self.batch_size = None