import itertools
import json
from copy import copy

import numpy as np
from tqdm import tqdm

# Project-local modules assumed importable from the surrounding repo:
# config, fever_db, common, utils.check_sentences, plus the helpers
# easy_tokenize, load_jsonl, read_jsonl, load_data, trucate_item, and
# convert_to_formatted_sent referenced below.


def evidence_list_to_text(cursor, evidences, contain_head=True,
                          id_tokenized=False):
    current_evidence_text = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:
        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            if not id_tokenized:
                doc_id_natural_format = \
                    fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = \
                    ' '.join(easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = \
                    common.doc_id_to_tokenized_text(doc_id)

            # Only prepend the title head for sentences other than line 0.
            if line_num != 0:
                current_evidence_text.append(f"{t_doc_id_natural_format} <t>")

        # Note (July 16): the sentence text is appended outside the head
        # block, so every evidence sentence is included, not just the first
        # one per document.
        current_evidence_text.append(e_text)

    return ' '.join(current_evidence_text)

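# A minimal usage sketch for evidence_list_to_text (not from the original
# repo). It assumes a local FEVER SQLite database reachable through
# fever_db.get_cursor(); the (doc_id, line_num) pairs below are hypothetical
# and must exist in that database.
def _demo_evidence_list_to_text():
    cursor = fever_db.get_cursor()
    evidences = [('Barack_Obama', 5), ('Barack_Obama', 3)]  # hypothetical
    # The "<t>"-marked title head is prepended once per document, provided
    # the document's first evidence line is not sentence 0.
    print(evidence_list_to_text(cursor, evidences,
                                contain_head=True, id_tokenized=True))
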
def convert_to_normalized_format(cursor, e_list, contain_head=True):
    r_list = []

    for evidences in e_list:
        current_evidence = []
        cur_head = 'DO NOT INCLUDE THIS FLAG'

        # Sorting by (doc_id, line_num) is important: it makes the output
        # deterministic and groups sentences by document.
        evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

        for doc_id, line_num in evidences:
            _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

            if contain_head and cur_head != doc_id:
                cur_head = doc_id
                doc_id_natural_format = \
                    fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = \
                    ' '.join(easy_tokenize(doc_id_natural_format))

                if line_num != 0:
                    current_evidence.append(f"{t_doc_id_natural_format} .")

            current_evidence.append(e_text)

        r_list.append(' '.join(current_evidence))

    return r_list

def sample_for_verifiable(cursor, e_list, contain_head=True):
    # Same as convert_to_normalized_format above, except the evidence order
    # is preserved as given (no sorting).
    r_list = []

    for evidences in e_list:
        current_evidence = []
        cur_head = 'DO NOT INCLUDE THIS FLAG'

        for doc_id, line_num in evidences:
            _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

            if contain_head and cur_head != doc_id:
                cur_head = doc_id
                doc_id_natural_format = \
                    fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = \
                    ' '.join(easy_tokenize(doc_id_natural_format))

                if line_num != 0:
                    current_evidence.append(f"{t_doc_id_natural_format} .")

            current_evidence.append(e_text)

        r_list.append(' '.join(current_evidence))

    return r_list

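# A minimal usage sketch covering the two functions above (not from the
# original repo). It assumes a FEVER SQLite database via
# fever_db.get_cursor(); the evidence group below is hypothetical.
def _demo_normalized_format():
    cursor = fever_db.get_cursor()
    e_list = [[('Barack_Obama', 5), ('Barack_Obama', 3)]]  # hypothetical
    print(convert_to_normalized_format(cursor, e_list))  # sorted evidence
    print(sample_for_verifiable(cursor, e_list))         # original order
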
def expand_from_preext_sent_rule(self):
    # Train-split variant: lazily opens a DB cursor and loads the
    # pre-extracted sentence-retrieval results on first use.
    if not hasattr(self, 'cursor'):
        self.cursor = fever_db.get_cursor()

    if not hasattr(self, 'preext_sent_dict'):
        d_list = read_jsonl(
            config.RESULT_PATH /
            "sent_retri_nn/2018_07_17_16-34-19_r/train_scale(0.1).jsonl")
        self.preext_sent_dict = {item['id']: item for item in d_list}

    item = self.item
    new_pdocids = []
    structured_docids_sent = {}
    sent_ids = self.preext_sent_dict[item['id']]['scored_sentids']

    for sent_id, score, probability in sent_ids:
        docid, sent_ind = sent_id.split('<SENT_LINE>')
        sent_ind = int(sent_ind)
        id_list, sent_list, sent_links = fever_db.get_evidence(
            self.cursor, docid, sent_ind)

        # sent_links holds [anchor_text, target] pairs; keep the targets
        # and turn them back into document ids.
        sent_links = json.loads(sent_links)
        all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
        all_links = list(map(fever_db.reverse_convert_brc, all_links))
        all_links = [link.replace(' ', '_') for link in all_links]

        prio_docids = [(id_link, score) for id_link in all_links]
        new_pdocids.extend(prio_docids)
        structured_docids_sent[sent_id] = prio_docids

    item['prioritized_docids_sent'] = new_pdocids
    item['structured_docids_sent'] = structured_docids_sent
    return self

def expand_from_preext_sent_rule(self):
    # Dev-split variant: reads 'predicted_sentids' (no scores) and extends
    # the existing 'prioritized_docids' instead of building a new
    # structured mapping.
    if not hasattr(self, 'cursor'):
        self.cursor = fever_db.get_cursor()

    if not hasattr(self, 'preext_sent_dict'):
        d_list = load_data(
            config.RESULT_PATH /
            "sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl")
        self.preext_sent_dict = {item['id']: item for item in d_list}

    item = self.item
    new_pdocids = copy(item['prioritized_docids'])
    sent_ids = self.preext_sent_dict[item['id']]['predicted_sentids']

    for sent_id in sent_ids:
        docid, sent_ind = sent_id.split('<SENT_LINE>')
        sent_ind = int(sent_ind)
        id_list, sent_list, sent_links = fever_db.get_evidence(
            self.cursor, docid, sent_ind)

        sent_links = json.loads(sent_links)
        all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
        all_links = list(map(fever_db.reverse_convert_brc, all_links))

        # Every hyperlinked document gets a fixed priority of 1.0.
        new_pdocids.extend([(id_link, 1.0) for id_link in all_links])

    item['prioritized_docids'] = new_pdocids
    return self

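# A self-contained sketch (assumed data, not from the repo) of the hyperlink
# extraction step shared by the expand_from_preext_sent_rule variants above:
# sent_links is stored as a JSON list of [anchor_text, target] pairs, and
# only the link targets are kept as candidate document ids. The real code
# additionally decodes bracket escapes with fever_db.reverse_convert_brc.
def _demo_extract_link_targets():
    raw = json.dumps([["Obama", "Barack Obama"],
                      ["president", "President of the United States"]])
    targets = np.array(json.loads(raw)).reshape(-1, 2)[:, 1]
    doc_ids = [t.replace(' ', '_') for t in targets]
    print(doc_ids)  # ['Barack_Obama', 'President_of_the_United_States']
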
def evidence_list_to_text_list(cursor, evidences, contain_head=True):
    # One text per evidence, so len(evidences) == len(returned list).
    current_evidence_text_list = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))
    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:
        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)
        cur_text = ""

        if contain_head and cur_head != doc_id:
            cur_head = doc_id
            t_doc_id_natural_format = common.doc_id_to_tokenized_text(doc_id)

            if line_num != 0:
                cur_text = f"{t_doc_id_natural_format} <t> "

        # Note (July 16): the sentence text is appended for every evidence
        # line, not only when the document head changes.
        cur_text = cur_text + e_text
        current_evidence_text_list.append(cur_text)

    assert len(evidences) == len(current_evidence_text_list)
    return current_evidence_text_list

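# A minimal usage sketch for evidence_list_to_text_list (not from the
# original repo); same DB assumption and hypothetical evidence as above.
def _demo_evidence_list_to_text_list():
    cursor = fever_db.get_cursor()
    evidences = [('Barack_Obama', 3), ('Barack_Obama', 5)]  # hypothetical
    for text in evidence_list_to_text_list(cursor, evidences,
                                           contain_head=True):
        print(text)  # one entry per evidence sentence
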
def expand_from_preext_sent_rule(self):
    # Same rule as the train-split variant above, but assumes self.cursor
    # and self.preext_sent_dict have already been initialized.
    item = self.item
    new_pdocids = []
    structured_docids_sent = {}
    sent_ids = self.preext_sent_dict[item['id']]['scored_sentids']

    for sent_id, score, probability in sent_ids:
        docid, sent_ind = sent_id.split('<SENT_LINE>')
        sent_ind = int(sent_ind)
        id_list, sent_list, sent_links = fever_db.get_evidence(
            self.cursor, docid, sent_ind)

        sent_links = json.loads(sent_links)
        all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
        all_links = list(map(fever_db.reverse_convert_brc, all_links))
        all_links = [link.replace(' ', '_') for link in all_links]

        prio_docids = [(id_link, score) for id_link in all_links]
        new_pdocids.extend(prio_docids)
        structured_docids_sent[sent_id] = prio_docids

    item['prioritized_docids_sent'] = new_pdocids
    item['structured_docids_sent'] = structured_docids_sent
    return self

def get_full_list(tokenized_data_file, additional_data_file, pred=False,
                  top_k=None):
    """
    Select every sentence from the upstream document-retrieval results and
    label the correct evidence sentences as true.

    :param tokenized_data_file: Tokenized data in the original format,
        containing the 'evidence' field.
    :param additional_data_file: Data after document retrieval. This file
        needs to contain the *"predicted_docids"* field.
    :param pred: If True, build the list for prediction and attach no
        ground-truth evidence.
    :param top_k: If given, truncate the upstream document list to the top
        k documents per item.
    :return: The full list of formatted sentence items.
    """
    cursor = fever_db.get_cursor()
    d_list = load_jsonl(tokenized_data_file)

    if not isinstance(additional_data_file, list):
        additional_d_list = load_jsonl(additional_data_file)
    else:
        additional_d_list = additional_data_file

    if top_k is not None:
        print("Upstream document number truncated to:", top_k)
        trucate_item(additional_d_list, top_k=top_k)

    additional_data_dict = dict()
    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    full_data_list = []

    for item in tqdm(d_list):
        doc_ids = additional_data_dict[item['id']]["predicted_docids"]

        if not pred:
            if item['evidence'] is not None:
                e_list = utils.check_sentences.check_and_clean_evidence(item)
                all_evidence_set = set(itertools.chain.from_iterable(
                    [evids.evidences_list for evids in e_list]))
            else:
                all_evidence_set = None

            r_list = []
            id_list = []

            if all_evidence_set is not None:
                for doc_id, ln in all_evidence_set:
                    _, text, _ = fever_db.get_evidence(cursor, doc_id, ln)
                    r_list.append(text)
                    id_list.append(doc_id + '(-.-)' + str(ln))
        else:
            # In prediction mode, do not include ground-truth evidence.
            all_evidence_set = None
            r_list = []
            id_list = []

        for doc_id in doc_ids:
            cur_r_list, cur_id_list = fever_db.get_all_sent_by_doc_id(
                cursor, doc_id, with_h_links=False)
            # Merge into the result lists, skipping duplicates.
            for i in range(len(cur_r_list)):
                if cur_id_list[i] in id_list:
                    continue
                r_list.append(cur_r_list[i])
                id_list.append(cur_id_list[i])

        assert len(id_list) == len(set(id_list))  # no duplicates
        assert len(r_list) == len(id_list)

        # Sort the (sentence, id) pairs by sentence id.
        zipped_s_id_list = sorted(zip(r_list, id_list),
                                  key=lambda x: (x[1][0], x[1][1]))

        all_sent_list = convert_to_formatted_sent(
            zipped_s_id_list, all_evidence_set,
            contain_head=True, id_tokenized=True)

        cur_id = item['id']
        for sent_item in all_sent_list:
            sent_item['selection_id'] = \
                str(cur_id) + "<##>" + str(sent_item['sid'])
            sent_item['query'] = item['claim']
            if 'label' in item.keys():
                sent_item['claim_label'] = item['label']
            full_data_list.append(sent_item)

    return full_data_list
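
# A usage sketch for get_full_list (hypothetical paths; both files must
# follow the formats described in the docstring, and only config.RESULT_PATH
# is taken from the code above -- the file names are purely illustrative).
def _demo_get_full_list():
    sent_list = get_full_list(
        config.RESULT_PATH / "tokenized/dev.jsonl",         # hypothetical
        config.RESULT_PATH / "doc_retri/dev_docids.jsonl",  # hypothetical
        pred=True, top_k=5)
    # Each item carries 'selection_id', 'query', and (for labeled data)
    # 'claim_label'.
    print(len(sent_list))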