def inference_build(item, cursor, contain_first_sentence=False):
    """Build 'hidden'-label disambiguation examples for one retrieval item.

    For every retrieved doc-id containing '-LRB-' (a parenthesized Wikipedia
    title) whose tokenized title does not literally appear in the claim,
    emit one example dict with keys 'selection_id', 'query', 'text' and
    'selection_label' (always 'hidden' at inference time).

    :param item: retrieval result dict with 'prioritized_docids',
        'claim_tokens' and 'id'.
    :param cursor: fever_db database cursor.
    :param contain_first_sentence: if True, append the page's first sentence
        (sentence number 0) to the query.
    :return: list of example dicts.
    """
    doc_t_list = [it[0] for it in item['prioritized_docids']]
    # evidence_group = check_sentences.check_and_clean_evidence(item)
    t_claim = ' '.join(item['claim_tokens'])
    eid = item['id']
    b_list = []
    for doc_id in doc_t_list:
        if '-LRB-' in doc_id and common.doc_id_to_tokenized_text(doc_id) not in t_claim:
            # BUG FIX: the original rebound the `item` parameter here,
            # shadowing the input dict; use a fresh local name instead.
            example_item = dict()
            example_item['selection_id'] = str(eid) + '###' + str(doc_id)
            example = common.doc_id_to_tokenized_text(doc_id)
            description_sent = ''
            if contain_first_sentence:
                r_list, id_list = fever_db.get_all_sent_by_doc_id(
                    cursor, doc_id, with_h_links=False)
                for sent, sent_id in zip(r_list, id_list):
                    # sent_id format is '[doc_id](-.-)[line_number]'
                    if int(sent_id.split('(-.-)')[1]) == 0:
                        description_sent = sent
            example_item['query'] = example + ' ' + description_sent
            example_item['text'] = t_claim
            example_item['selection_label'] = 'hidden'
            b_list.append(example_item)
    return b_list
def _make_selection_example(eid, doc_id, t_claim, cursor, label,
                            contain_first_sentence):
    """Build one selection-example dict for (eid, doc_id) with the given label."""
    item = dict()
    item['selection_id'] = str(eid) + '###' + str(doc_id)
    example = common.doc_id_to_tokenized_text(doc_id)
    description_sent = ''
    if contain_first_sentence:
        r_list, id_list = fever_db.get_all_sent_by_doc_id(
            cursor, doc_id, with_h_links=False)
        for sent, sent_id in zip(r_list, id_list):
            # sentence number 0 is the page's lead (description) sentence
            if int(sent_id.split('(-.-)')[1]) == 0:
                description_sent = sent
    item['query'] = example + ' ' + description_sent
    item['text'] = t_claim
    item['selection_label'] = label
    return item


def make_examples(eid, positive_list, negative_list, t_claim, cursor,
                  contain_first_sentence=False):
    """Build positive and negative disambiguation training examples.

    The positive and negative paths are identical except for the label, so
    both delegate to :func:`_make_selection_example` (the original had the
    loop body copy-pasted twice).

    :param eid: claim id.
    :param positive_list: doc-ids labeled 'true'.
    :param negative_list: doc-ids labeled 'false'.
    :param t_claim: tokenized claim text.
    :param cursor: fever_db database cursor.
    :param contain_first_sentence: if True, append each page's first
        sentence to its query.
    :return: (pos_examples, neg_examples) lists of example dicts.
    """
    pos_examples = [
        _make_selection_example(eid, pos_e, t_claim, cursor, 'true',
                                contain_first_sentence)
        for pos_e in positive_list
    ]
    neg_examples = [
        _make_selection_example(eid, neg_e, t_claim, cursor, 'false',
                                contain_first_sentence)
        for neg_e in negative_list
    ]
    return pos_examples, neg_examples
def evidence_list_to_text(cursor, evidences, contain_head=True, id_tokenized=False):
    """Concatenate evidence sentences into a single text string.

    Evidence pairs are sorted by (doc_id, line_number). When a new document
    starts and contain_head is set, the document title followed by the
    '<t>' separator is inserted before its sentences — except when the
    evidence line is line 0 (the lead sentence, which already restates the
    title).

    :param cursor: fever_db database cursor.
    :param evidences: iterable of (doc_id, line_num) pairs.
    :param contain_head: prepend the page title per document if True.
    :param id_tokenized: if True the doc_id is converted via the common
        tokenized-text helper; otherwise it is detokenized and re-tokenized.
    :return: one space-joined string of titles and sentences.
    """
    pieces = []
    ordered = sorted(evidences, key=lambda x: (x[0], x[1]))
    previous_doc = 'DO NOT INCLUDE THIS FLAG'  # sentinel: never a real doc_id
    for doc_id, line_num in ordered:
        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)
        if contain_head and previous_doc != doc_id:
            previous_doc = doc_id
            if id_tokenized:
                title_tokens = common.doc_id_to_tokenized_text(doc_id)
            else:
                natural_title = fever_db.convert_brc(doc_id).replace('_', ' ')
                title_tokens = ' '.join(easy_tokenize(natural_title))
            if line_num != 0:
                pieces.append(f"{title_tokens} <t>")
            # Important change move one line below: July 16
        pieces.append(e_text)
    return ' '.join(pieces)
def enforce_disabuigation_into_retrieval_result_v2(disabuigation_r_list, r_list, prob_sh=0.5):
    """Fold disambiguation-model scores back into retrieval results, in place.

    For every parenthesized ('-LRB-') doc-id in each item, the document's
    priority is overwritten with the disambiguation model's probability
    (looked up by 'claim_id###doc_id'). Exact title matches against the
    claim are then forced to priority 5.0 and placed first, and remaining
    docs are appended in (-priority, doc_id) order when their score clears
    `prob_sh`. Mutates `r_list`; returns nothing.

    :param disabuigation_r_list: model outputs, each with 'selection_id'
        and 'prob'.
    :param r_list: retrieval items, each with 'id', 'claim_tokens' and
        'prioritized_docids'.
    :param prob_sh: minimum probability for a non-exact-match doc to be
        kept in 'predicted_docids'.
    """
    # Index model outputs by their 'claim_id###doc_id' selection key.
    scores_by_key = {entry['selection_id']: entry for entry in disabuigation_r_list}

    for item in r_list:
        claim_id = item['id']
        # Overwrite priorities of parenthesized docs with model probabilities.
        for idx, (doc_id, _priority) in enumerate(item['prioritized_docids']):
            if '-LRB-' not in doc_id:
                continue  # Only use for disamb
            key = str(claim_id) + '###' + doc_id
            if key in scores_by_key:
                item['prioritized_docids'][idx] = [doc_id, scores_by_key[key]['prob']]

        # Reset Exact match: titles literally contained in the claim win.
        claim_text = ' '.join(item['claim_tokens'])
        item['predicted_docids'] = []
        for idx, entry in enumerate(item['prioritized_docids']):
            if '-LRB-' in entry[0] and common.doc_id_to_tokenized_text(entry[0]) in claim_text:
                item['prioritized_docids'][idx] = [entry[0], 5.0]
                if entry[0] not in item['predicted_docids']:
                    item['predicted_docids'].append(entry[0])

        ranked = sorted(item['prioritized_docids'], key=lambda x: (-x[1], x[0]))
        for entry in ranked:
            if entry[0] not in item['predicted_docids'] and entry[1] >= prob_sh:
                item['predicted_docids'].append(entry[0])
def evidence_list_to_text_list(cursor, evidences, contain_head=True):
    """Convert evidence pairs to one text string per evidence sentence.

    Unlike evidence_list_to_text, this keeps a one-to-one mapping:
    len(result) == len(evidences). The page title plus '<t> ' is prefixed
    onto the first sentence of each new document, unless that sentence is
    line 0 (the lead sentence).

    :param cursor: fever_db database cursor.
    :param evidences: iterable of (doc_id, line_num) pairs.
    :param contain_head: prefix the page title per document if True.
    :return: list of strings, same length as `evidences`.
    """
    # One evidence one text and len(evidences) == len(text_list)
    text_list = []
    ordered = sorted(evidences, key=lambda x: (x[0], x[1]))
    previous_doc = 'DO NOT INCLUDE THIS FLAG'  # sentinel: never a real doc_id
    for doc_id, line_num in ordered:
        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)
        prefix = ""
        if contain_head and previous_doc != doc_id:
            previous_doc = doc_id
            title_tokens = common.doc_id_to_tokenized_text(doc_id)
            if line_num != 0:
                prefix = f"{title_tokens} <t> "
            # Important change move one line below: July 16
        text_list.append(prefix + e_text)
    assert len(evidences) == len(text_list)
    return text_list
def resample_answer_with_priority(d_list, top_k=5):
    """Re-rank retrieved documents, boosting film/album disambiguations.

    For every doc with priority exactly 1.0, its tokenized title is checked
    for the words 'film' (+0.2) or 'album' (+0.1) inside parentheses; the
    top `top_k` docs by (-priority, doc_id) become 'predicted_docids'.
    Mutates each item in `d_list` in place.

    NOTE: when a title contains both 'film' and 'album', the 'album' boost
    overwrites the 'film' boost (original base priority +0.1) — behavior
    kept from the original; confirm this is intended.

    :param d_list: list of retrieval items with 'prioritized_docids'.
    :param top_k: number of documents to keep in 'predicted_docids'.
    """
    # Removed unused locals `count` and `num_of_select` from the original.
    print("Build results file...")
    for item in tqdm(d_list):
        # Important additional rule: We can add something more here to
        # fine-select the document
        finded_keys = item['prioritized_docids']
        for i, (doc_id, priority) in enumerate(finded_keys):
            if priority == 1.0:
                doc_id_tokens = (
                    common.doc_id_to_tokenized_text(doc_id)).split(' ')
                # Hoisted: the original called this helper twice per doc.
                paren_words = get_words_inside_parenthese(doc_id_tokens)
                if 'film' in paren_words:
                    finded_keys[i] = (doc_id, priority + 0.2)
                if 'album' in paren_words:
                    finded_keys[i] = (doc_id, priority + 0.1)
        item['prioritized_docids'] = finded_keys
        item['predicted_docids'] = \
            list(set([k for k, v in sorted(item['prioritized_docids'],
                                           key=lambda x: (-x[1], x[0]))][:top_k]))
def convert_to_formatted_sent(zipped_s_id_list, evidence_set):
    """Format (sentence, sid) pairs into labeled sentence-selection items.

    Each output dict carries 'text' (the sentence, title-prefixed with
    '<t>' when the title is not already present and the line is not the
    lead sentence), 'sid' ('[doc_id]<SENT_LINE>[line_number]') and
    'selection_label' ('true'/'false' against `evidence_set`, or 'hidden'
    when `evidence_set` is None).

    NOTE(review): a later definition in this file redefines
    convert_to_formatted_sent and shadows this one at import time.

    :param zipped_s_id_list: iterable of (sentence_text, sent_id) pairs,
        sent_id formatted as '[doc_id](-.-)[line_number]'.
    :param evidence_set: set of (doc_id, line_num) gold pairs, or None.
    :return: list of sentence-item dicts.
    """
    formatted = []
    for sent_text, sid in zipped_s_id_list:
        doc_id, ln = sid.split('(-.-)')[0], int(sid.split('(-.-)')[1])
        title_tokens = common.doc_id_to_tokenized_text(doc_id)

        text = sent_text
        if ln != 0 and title_tokens.lower() not in sent_text.lower():
            text = f"{title_tokens} <t> " + sent_text

        if evidence_set is None:
            label = "hidden"
        elif (doc_id, ln) in evidence_set:
            label = "true"
        else:
            label = "false"

        formatted.append({
            'text': text,
            # sid is '[doc_id]<SENT_LINE>[line_number]'
            'sid': doc_id + SENT_LINE + str(ln),
            'selection_label': label,
        })
    return formatted
def item_resorting(d_list):
    """Order each item's 'predicted_docids', exact title matches first.

    Parenthesized doc-ids whose tokenized title literally appears in the
    claim are listed first; the remainder follow in (-priority, doc_id)
    order. Mutates items in place; returns nothing.

    NOTE(review): a later definition in this file redefines item_resorting
    (with a `top_k` parameter) and shadows this one at import time.

    :param d_list: list of retrieval items with 'claim_tokens' and
        'prioritized_docids'.
    """
    for item in d_list:
        claim_text = ' '.join(item['claim_tokens'])
        predicted = []
        # Pass 1: exact title matches against the claim go first.
        for entry in item['prioritized_docids']:
            doc_id = entry[0]
            if '-LRB-' in doc_id and common.doc_id_to_tokenized_text(doc_id) in claim_text:
                predicted.append(doc_id)
        # Pass 2: everything else, best priority first, doc_id tiebreak.
        ranked = sorted(item['prioritized_docids'], key=lambda x: (-x[1], x[0]))
        for entry in ranked:
            if entry[0] not in predicted:
                predicted.append(entry[0])
        item['predicted_docids'] = predicted
def disabuigation_training_build_v0(item, cursor, contain_first_sentence=False, only_found=True):
    """Build disambiguation training examples (v0, with negative sampling).

    Positives are gold-evidence doc-ids containing '-LRB-' whose tokenized
    title is absent from the claim. Negatives are sampled: the retrieved
    doc list is shuffled and 6-8 candidates are drawn, keeping
    parenthesized ids not in the gold set.

    :param item: retrieval item with 'prioritized_docids', 'claim_tokens',
        'id' and evidence annotations.
    :param cursor: fever_db database cursor.
    :param contain_first_sentence: forwarded to make_examples.
    :param only_found: if True, negatives are sampled only when at least
        one positive was found; if False, always sample negatives.
    :return: (pos_examples, neg_examples) from make_examples.
    """
    doc_t_list = [it[0] for it in item['prioritized_docids']]
    evidence_group = check_sentences.check_and_clean_evidence(item)
    all_true_t_list = set()
    t_claim = ' '.join(item['claim_tokens'])
    for ground_truth_evid in evidence_group:
        true_t_list = set([it[0] for it in ground_truth_evid])
        all_true_t_list = set.union(all_true_t_list, true_t_list)
    all_true_t_list = list(all_true_t_list)

    positive_list = []
    negative_list = []
    eid = item['id']
    found_pos = False
    for doc_id in all_true_t_list:
        if '-LRB-' in doc_id and common.doc_id_to_tokenized_text(
                doc_id) not in t_claim:
            positive_list.append(doc_id)
            found_pos = True

    # The original had two byte-identical sampling branches
    # (`found_pos and only_found` / `not only_found`); merged here.
    # Change this on Aug 30, 2018: num_neg was random.randint(36, 36).
    if (found_pos and only_found) or not only_found:
        random.shuffle(doc_t_list)
        num_neg = random.randint(6, 8)
        for doc_id in doc_t_list[:num_neg]:
            if '-LRB-' in doc_id and doc_id not in all_true_t_list:
                negative_list.append(doc_id)

    return make_examples(eid, positive_list, negative_list, t_claim, cursor,
                         contain_first_sentence=contain_first_sentence)
def load_keyword_dict_v1_3(in_filename, filtering=False):  # COLON cleaned
    """Load a docid -> [tokenized title] mapping from a JSONL file.

    Each line is a JSON object with at least a 'docid' field. The value
    stored for each docid is a single-element list holding the tokenized
    title text (the file's own 'keys' field is deliberately ignored).

    :param in_filename: path to the JSONL keyword file (UTF-8).
    :param filtering: if True, docids rejected by
        text_clean.filter_document_id are skipped.
    :return: dict mapping docid -> list with one tokenized-title string.
    """
    keyword_dict = dict()
    with open(in_filename, encoding='utf-8', mode='r') as in_f:
        for raw_line in tqdm(in_f):
            record = json.loads(raw_line.strip())
            docid = record['docid']
            if filtering and text_clean.filter_document_id(docid):
                continue
            # This is a list of keys (the record's own 'keys' field is
            # intentionally not used):
            keyword_dict[docid] = [common.doc_id_to_tokenized_text(docid)]
    return keyword_dict
def filter_contain_parenthese_valid(item):
    """Return True if this item is a valid parenthesized-title training case.

    True when at least one gold-evidence doc-id (a) contains '-LRB-',
    (b) was actually retrieved for this item, and (c) has a tokenized
    title that does not literally appear in the claim — i.e. the case a
    disambiguation model could learn from.

    :param item: retrieval item with 'prioritized_docids', 'claim_tokens'
        and evidence annotations.
    :return: bool.
    """
    # Set instead of list: membership is tested per gold doc-id below.
    retrieved_docids = set(it[0] for it in item['prioritized_docids'])
    evidence_group = check_sentences.check_and_clean_evidence(item)
    all_true_t_list = set()
    t_claim = ' '.join(item['claim_tokens'])
    for ground_truth_evid in evidence_group:
        true_t_list = set([it[0] for it in ground_truth_evid])
        all_true_t_list = set.union(all_true_t_list, true_t_list)
    for doc_id in all_true_t_list:
        if '-LRB-' in doc_id and doc_id in retrieved_docids \
                and common.doc_id_to_tokenized_text(doc_id) not in t_claim:
            return True
    return False
def item_resorting(d_list, top_k=None):
    """Re-sort each item's predicted docs, forcing exact title matches first.

    Parenthesized doc-ids whose tokenized title literally appears in the
    claim get their priority overwritten to 5.0 and are listed first; the
    rest follow in (-priority, doc_id) order, optionally truncated to
    `top_k`. Mutates items in place; returns nothing.

    NOTE(review): this redefinition shadows an earlier item_resorting in
    this file (which lacked the 5.0 overwrite and `top_k`).

    :param d_list: list of retrieval items with 'claim_tokens' and
        'prioritized_docids'.
    :param top_k: if given, cap 'predicted_docids' at this length.
    """
    for item in d_list:
        # Reset Exact match (the original reset this list twice in a row;
        # the redundant assignment is removed).
        t_claim = ' '.join(item['claim_tokens'])
        item['predicted_docids'] = []
        for k, it in enumerate(item['prioritized_docids']):
            if '-LRB-' in it[0] and common.doc_id_to_tokenized_text(it[0]) in t_claim:
                item['prioritized_docids'][k] = [it[0], 5.0]
                item['predicted_docids'].append(it[0])
        for it in sorted(item['prioritized_docids'], key=lambda x: (-x[1], x[0])):
            if it[0] not in item['predicted_docids']:
                item['predicted_docids'].append(it[0])
        if top_k is not None and len(item['predicted_docids']) > top_k:
            item['predicted_docids'] = item['predicted_docids'][:top_k]
def convert_to_formatted_sent(zipped_s_id_list, evidence_set, contain_head=True, id_tokenized=True):
    """Format (sentence, sid) pairs into labeled sentence-selection items.

    Each output dict carries 'text' (title-prefixed with '<t>' when
    `contain_head` is set, the line is not the lead sentence and the title
    is not already in the sentence), 'sid'
    ('[doc_id]<SENT_LINE>[line_number]') and 'selection_label'
    ('true'/'false' against `evidence_set`, or 'hidden' when it is None).

    BUG FIX: in the original, 'text'/'sid'/'selection_label' were only
    assigned inside the `contain_head` branch, so with contain_head=False
    the function appended EMPTY dicts. Field population now happens on
    every path.

    :param zipped_s_id_list: iterable of (sentence_text, sent_id) pairs,
        sent_id formatted as '[doc_id](-.-)[line_number]'.
    :param evidence_set: set of (doc_id, line_num) gold pairs, or None.
    :param contain_head: prepend the page title when appropriate.
    :param id_tokenized: if True use the common tokenized-title helper,
        otherwise detokenize and re-tokenize the doc_id.
    :return: list of sentence-item dicts.
    """
    sent_list = []
    for sent, sid in zipped_s_id_list:
        sent_item = dict()
        cur_sent = sent
        doc_id, ln = sid.split('(-.-)')[0], int(sid.split('(-.-)')[1])
        if contain_head:
            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(
                    doc_id)
            if ln != 0 and t_doc_id_natural_format.lower() not in sent.lower():
                cur_sent = f"{t_doc_id_natural_format} <t> " + sent
        sent_item['text'] = cur_sent
        sent_item['sid'] = doc_id + c_scorer.SENT_LINE + str(ln)
        # sid is '[doc_id]<SENT_LINE>[line_number]'
        if evidence_set is not None:
            if (doc_id, ln) in evidence_set:
                sent_item['selection_label'] = "true"
            else:
                sent_item['selection_label'] = "false"
        else:
            sent_item['selection_label'] = "hidden"
        sent_list.append(sent_item)
    return sent_list
def disabuigation_training_build(item, cursor, contain_first_sentence=False):
    """Build disambiguation training examples (all-negatives variant).

    Positives are gold-evidence doc-ids containing '-LRB-' whose tokenized
    title is absent from the claim; negatives are EVERY retrieved
    parenthesized doc-id not in the gold set (no sampling, unlike
    disabuigation_training_build_v0).

    :param item: retrieval item with 'prioritized_docids', 'claim_tokens',
        'id' and evidence annotations.
    :param cursor: fever_db database cursor.
    :param contain_first_sentence: forwarded to make_examples.
    :return: (pos_examples, neg_examples) from make_examples.
    """
    retrieved_docids = [it[0] for it in item['prioritized_docids']]
    evidence_group = check_sentences.check_and_clean_evidence(item)
    t_claim = ' '.join(item['claim_tokens'])
    eid = item['id']

    gold_docids = set()
    for ground_truth_evid in evidence_group:
        gold_docids = set.union(gold_docids,
                                set([it[0] for it in ground_truth_evid]))
    gold_docids = list(gold_docids)

    positive_list = [
        doc_id for doc_id in gold_docids
        if '-LRB-' in doc_id
        and common.doc_id_to_tokenized_text(doc_id) not in t_claim
    ]
    negative_list = [
        doc_id for doc_id in retrieved_docids
        if '-LRB-' in doc_id and doc_id not in gold_docids
    ]

    return make_examples(eid, positive_list, negative_list, t_claim, cursor,
                         contain_first_sentence=contain_first_sentence)