def prediction_processing(dataset_path, predictions):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system.
    :param dataset_path:
    :param predictions:
    :return:
    """
    final_predictions = []
    jsr = JSONLineReader()

    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
        # lines = lines[:100]

        for idx, line in enumerate(lines):
            if len(line['predicted_pages']) == 0:
                line['predicted_evidence'] = []
            else:
                line['predicted_evidence'] = [[prediction[0], int(prediction[1])]
                                              for prediction in predictions[idx]]
            line['predicted_label'] = "REFUTES"
            final_predictions.append(line)

    return final_predictions
def sampling(self, datapath, num_sample=1):
    jlr = JSONLineReader()
    X = []
    count = 0
    with open(datapath, "r") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            count += 1
            pos_pairs = []
            # count1 += 1
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            neg_sents = []
            claim = line['claim']
            pos_set = set()
            for evidence_set in line['evidence']:
                pos_sent = self.get_whole_evidence(evidence_set, self.db)
                if pos_sent in pos_set:
                    continue
                pos_set.add(pos_sent)
            p_lines = []
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            pages = [page for page in line['predicted_pages'] if page is not None]
            for page in pages:
                doc_lines = self.db.get_doc_lines(page)
                p_lines.extend(self.get_valid_texts(doc_lines, page))
            for doc_line in p_lines:
                if (doc_line[1], doc_line[2]) not in evidence_set:
                    neg_sents.append(doc_line[0])
            num_sampling = num_sample
            if len(neg_sents) < num_sampling:
                num_sampling = len(neg_sents)
            # print(neg_sents)
            if num_sampling == 0:
                continue
            for pos_sent in pos_set:
                samples = random.sample(neg_sents, num_sampling)
                for sample in samples:
                    if not sample:
                        continue
                    X.append((claim, pos_sent, sample))
                    if count % 1000 == 0:
                        print("claim:{} ,evidence :{} sample:{}".format(claim, pos_sent, sample))
    return X
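# Self-contained sketch of the negative-sampling step used in sampling() above, with toy
# strings standing in for the database lookups (all values below are illustrative, not from
# the repo). For each positive evidence sentence, up to num_sample negatives are drawn from
# the candidate sentences that are not part of the gold evidence.
def _demo_negative_sampling(num_sample=1):
    import random
    claim = "Toy claim."
    pos_set = {"Gold evidence sentence."}
    neg_sents = ["Distractor one.", "Distractor two.", "Distractor three."]
    num_sampling = min(num_sample, len(neg_sents))
    X = []
    for pos_sent in pos_set:
        for sample in random.sample(neg_sents, num_sampling):
            X.append((claim, pos_sent, sample))
    return X  # list of (claim, positive sentence, negative sentence) triplets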
def train_dev_split(train_datapath, split_rate):
    with open(train_datapath, "r") as f:
        jlr = JSONLineReader()
        lines = jlr.process(f)
        random.shuffle(lines)
        dev_lines = lines[:int(len(lines) * split_rate)]
        train_lines = lines[int(len(lines) * split_rate):]
    return train_lines, dev_lines
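# Minimal usage sketch for train_dev_split; the file path and split rate are hypothetical
# (this repo's actual file names may differ). With split_rate=0.1 the shuffled lines are
# split roughly 90/10 into train and dev.
def _demo_train_dev_split():
    train_lines, dev_lines = train_dev_split("data/fever/train.jsonl", 0.1)
    print("train: {}, dev: {}".format(len(train_lines), len(dev_lines)))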
def load_words(embedding_file, train_datapath, test_path, db_filename, num_sample, sampled_path):
    words = set()

    def _insert(iterable):
        for w in iterable:
            w = Dictionary.normalize(w)
            if valid_words and w not in valid_words:
                continue
            words.add(w)

    valid_words = index_embedding_words(embedding_file)
    X_claim, X_sents, y = load_generate_samples(db_filename, train_datapath, num_sample, sampled_path)
    X_claim = set(X_claim)
    # keep the token lists in a separate variable so `words` stays bound to the vocabulary set
    for claim in X_claim:
        tokens = nltk.word_tokenize(claim)
        _insert(tokens)
    for sent in X_sents:
        tokens = simple_tokenizer(sent)
        _insert(tokens)
    with open(test_path, "r") as f:
        jlr = JSONLineReader()
        db = FeverDocDB(db_filename)
        lines = jlr.process(f)
        for line in lines:
            claim = line['claim']
            tokens = nltk.word_tokenize(claim)
            _insert(tokens)
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                doc_lines = [doc_line for doc_line in doc_lines if doc_line]
                for doc_line in doc_lines:
                    tokens = simple_tokenizer(doc_line)
                    _insert(tokens)
    return words
def test_data_4_siamese(db_filename, dataset_path):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    X_claims = []
    X_sents = []
    all_sents_id = []
    with open(dataset_path, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]
        for line in tqdm(lines):
            claims = []
            sents = []
            sents_indexes = []
            p_lines = []
            claim = line['claim']
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                claims.append(claim)
                sents.append(doc_line[0])
                sents_indexes.append((doc_line[1], doc_line[2]))
            X_claims.append(claims)
            X_sents.append(sents)
            all_sents_id.append(sents_indexes)
    # print(len(X_claims))
    # print(len(X_sents))
    # print(len(all_sents_id))
    # X_claims_indexes, X_sents_indexes = [], []
    # for idx, claims in enumerate(X_claims):
    #     claims_index, sents_index = data_transformer(claims, X_sents[idx], word_dict)
    #     X_claims_indexes.append(claims_index)
    #     X_sents_indexes.append(sents_index)
    return X_claims, X_sents, all_sents_id
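# Self-contained sketch of the doc_lines parsing used above. The sample string mimics the
# tab-separated "lines" field of a FEVER wiki page (page name and sentences are made up):
# each raw line is "<sent_id>\t<sentence>\t...", so index [1] is the sentence text, and
# empty sentences are kept as "" placeholders so the enumeration stays aligned with the
# original sentence numbers.
def _demo_parse_doc_lines():
    page = "Some_Page"
    raw = "0\tFirst sentence .\n1\tSecond sentence .\n2\t"
    doc_lines = [l.split("\t")[1] if len(l.split("\t")[1]) > 1 else ""
                 for l in raw.split("\n")]
    # -> [("First sentence .", "Some_Page", 0), ("Second sentence .", "Some_Page", 1), ("", "Some_Page", 2)]
    return list(zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))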
def test_data(db_path, dataset_path, type="ranking"):
    """
    generate dev examples to feed into the classifier
    :param db_path:
    :param dataset_path:
    :param type:
    :return:
    """
    with open(db_path) as f:
        db = json.load(f)
    jsr = JSONLineReader()

    inputs = []
    X_claim = []
    X_sents = []
    indexes = []

    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
        for line in tqdm(lines):
            valid_lines = []
            claims = []
            sents_indexes = []
            claim = line['claim']
            # for doc_line in p_lines:
            doc = line['predicted_evidence']
            # doc = line['evidence']
            for doc_line in doc:
                if not doc_line:
                    continue
                # print(doc_line[0])
                if type == "cos":
                    sents_indexes.append(doc_line)
                    valid_lines.append(get_whole_evidence([doc_line], db))
                    claims.append(claim)
                elif type == "ranking":
                    sents_indexes.append((doc_line[0], doc_line[1]))
                    valid_lines.append((claim, get_whole_evidence([doc_line], db)))
            if type == "cos":
                X_sents.append(valid_lines)
                X_claim.append(claims)
            elif type == "ranking":
                inputs.append(valid_lines)
                indexes.append(sents_indexes)
    if type == "cos":
        # only the cosine-similarity mode pairs the per-claim lists; the ranking mode has
        # already filled `inputs` and `indexes` above
        inputs = list(zip(X_claim, X_sents))
    return inputs, indexes
def test_data_loader(save_path, db_filename=None, data_path=None):
    if os.path.exists(save_path):
        with open(save_path, 'rb') as f:
            X = pickle.load(f)
        claims, list_sents, sents_indexes = zip(*X)
    else:
        with open(data_path, "rb") as f:
            jlr = JSONLineReader()
            lines = jlr.process(f)
        claims, list_sents, sents_indexes = test_processing(db_filename, lines)
        # materialise the zip before pickling; a bare zip iterator cannot be pickled
        X = list(zip(claims, list_sents, sents_indexes))
        with open(save_path, 'wb') as f:
            pickle.dump(X, f)
    return claims, list_sents, sents_indexes
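# Minimal usage sketch of the pickle-cache pattern in test_data_loader; all paths below are
# hypothetical assumptions. The first call builds the (claims, sentence lists, index lists)
# from the raw JSONL and caches them; later calls just reload the pickle.
def _demo_test_data_loader():
    claims, list_sents, sents_indexes = test_data_loader(
        "cache/dev.sentences.p",            # pickle cache, created if missing
        db_filename="data/fever/fever.db",
        data_path="data/fever/dev.jsonl")
    print(len(claims), len(list_sents), len(sents_indexes))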
def sampling(self, datapath, num_sample=1):
    jlr = JSONLineReader()
    X = []
    count = 0
    with open(datapath, "r") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            print('line: ', line)
            count += 1
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            claim = line['claim']
            print('claim: ', claim)
            pos_set = set()
            pos_set_ref = []
            for evidence_set in line['evidence']:
                for evidence_sentence in evidence_set:
                    pos_set.add(self.get_whole_evidence([evidence_sentence], self.db))
                    pos_set_ref.append(evidence_sentence)
            print('pos_set: ', pos_set)
            neg_sents = []
            for neg_evidence in line['predicted_evidence']:
                # if neg_evidence not in evidence_set:
                if neg_evidence not in pos_set_ref:
                    neg_sents.append(self.get_whole_evidence([neg_evidence], self.db))
            num_sampling = num_sample
            if len(neg_sents) < num_sampling:
                num_sampling = len(neg_sents)
            # print(neg_sents)
            if num_sampling == 0:
                continue
            for pos_sent in pos_set:
                samples = random.sample(neg_sents, num_sampling)
                for sample in samples:
                    print('sample: ', sample)
                    if not sample:
                        continue
                    X.append((claim, pos_sent, sample))
                    if count % 1000 == 0:
                        print("claim:{} ,evidence :{} sample:{}".format(claim, pos_sent, sample))
    return X
def prediction_processing(dataset_path, predictions):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system.
    :param dataset_path:
    :param predictions:
    :return:
    """
    jsr = JSONLineReader()
    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
    # delegate to the in-memory variant and return its result instead of an empty list
    final_predictions = prediction_processing_no_reload(lines, predictions)
    return final_predictions
def prediction_processing(dataset_path, predictions, db_filename):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system,
    and collect the gold and predicted evidence texts for error analysis.
    :param dataset_path:
    :param predictions:
    :param db_filename:
    :return:
    """
    final_predictions = []
    jsr = JSONLineReader()

    with open(db_filename) as f:
        db = json.load(f)

    out_error_ana = []
    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
        cnt = 0
        for line in lines:
            pos_set_ref = line['evidence']
            if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                continue
            line['predicted_evidence'] = [[prediction[0], int(prediction[1])]
                                          for prediction in predictions[cnt]]
            cnt_gold = 0
            out_error_ana.append("Claim: " + str(cnt))
            out_error_ana.append(line['claim'])
            out_error_ana.append("Gold evidence:")
            for evidence_set in line['evidence']:
                for evidence_sentence in evidence_set:
                    out_error_ana.append(get_whole_evidence([evidence_sentence], db))
                    cnt_gold += 1
            out_error_ana.append("Predicted evidence:")
            for evidence_set in line['predicted_evidence'][:cnt_gold]:
                out_error_ana.append(get_whole_evidence([evidence_set], db))
            out_error_ana.append("")
            line['predicted_label'] = "refutes"
            final_predictions.append(line)
            cnt += 1
            if cnt == len(predictions):
                break
    return final_predictions, out_error_ana
def sampling(self, datapath, num_sample=1):
    jlr = JSONLineReader()
    ret = []
    print("sampling for " + datapath)
    with open(datapath, "r") as f:
        lines = jlr.process(f)
    print(len(lines))
    with ThreadPool(processes=48) as p:
        for line in tqdm(p.imap(lambda x: self.handle(x, num_sample), lines),
                         total=len(lines)):
            if line is not None:
                ret.extend(line)
    print("Done")
    return ret
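# Self-contained sketch of the ThreadPool/imap pattern used in sampling() above, with a toy
# work function in place of self.handle (the worker count and inputs are illustrative).
# imap preserves input order and yields results lazily, which is why it combines well with
# tqdm(total=...); None results are skipped and everything else is flattened into one list.
from multiprocessing.pool import ThreadPool
from tqdm import tqdm

def _toy_handle(x):
    # stand-in for self.handle(line, num_sample): return None to skip an item
    return None if x % 3 == 0 else [x * x]

def _demo_threadpool_imap(items=None):
    items = list(items if items is not None else range(10))
    results = []
    with ThreadPool(processes=4) as pool:
        for out in tqdm(pool.imap(_toy_handle, items), total=len(items)):
            if out is not None:
                results.extend(out)
    return results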
def dev_processing(self, data_path):
    jlr = JSONLineReader()
    with open(data_path, "r") as f:
        lines = jlr.process(f)

    devs = []
    labels = []
    for line in tqdm(lines):
        dev = []
        label = []
        if line['label'].upper() == "NOT ENOUGH INFO":
            continue
        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = [page for page in line['predicted_pages'] if page is not None]
        for page, num in evidence_set:
            pages.append(page)
        pages = set(pages)
        p_lines = []
        for page in pages:
            doc_lines = self.db.get_doc_lines(page)
            p_lines.extend(self.get_valid_texts(doc_lines, page))
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            dev.append((line['claim'], doc_line[0]))
            if (doc_line[1], doc_line[2]) in evidence_set:
                label.append(1)
            else:
                label.append(0)
        if len(dev) == 0 or len(label) == 0:
            continue
        devs.append(dev)
        labels.append(label)
    return devs, labels
def cos_train(db_filepath, dataset_path):
    """
    Use the cosine similarity score to rank (claim, sentence) pairs in the dev set;
    no training data is needed.
    :param db_filepath:
    :param dataset_path:
    :return:
    """
    with open(db_filepath) as f:
        db = json.load(f)
    jlr = JSONLineReader()
    X = []
    y = []
    with open(dataset_path, "r") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            if line['label'] == "NOT ENOUGH INFO":
                continue
            # label, dev = [], []
            pos_set_ref = line['evidence']
            if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                continue
            pos_set = set()
            for evidence_set in line['evidence']:
                for evidence_sentence in evidence_set:
                    pos_set.add(get_whole_evidence([evidence_sentence], db))
            for evidence_sentence_ref in line['predicted_evidence']:
                evidence_sentence = get_whole_evidence([evidence_sentence_ref], db)
                X.append((line['claim'], evidence_sentence))
                if evidence_sentence in pos_set:
                    y.append(1)
                else:
                    y.append(0)
    return X, y
def data_processing_for_joint(self, data_path):
    from athene.rte.utils.data_reader import label_dict

    jlr = JSONLineReader()
    with open(data_path, "r") as f:
        lines = jlr.process(f)

    datas = []
    sent_labels = []
    claim_labels = []
    for line in tqdm(lines):
        data = []
        sent_label = []
        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = [page for page in line['predicted_pages'] if page is not None]
        for page, num in evidence_set:
            pages.append(page)
        pages = set(pages)
        p_lines = []
        for page in pages:
            doc_lines = self.db.get_doc_lines(page)
            p_lines.extend(self.get_valid_texts(doc_lines, page))
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            data.append((line['claim'], doc_line[0]))
            if (doc_line[1], doc_line[2]) in evidence_set:
                sent_label.append(1)
            else:
                sent_label.append(0)
        if len(data) == 0 or len(sent_label) == 0:
            continue
        datas.append(data)
        sent_labels.append(sent_label)
        # append the claim label only for kept lines so the three returned lists stay aligned
        claim_labels.append(label_dict.index(line['label']))
    return datas, sent_labels, claim_labels
def predict_processing(db_path, dataset_path):
    with open(db_path) as f:
        db = json.load(f)
    jlr = JSONLineReader()

    devs = []
    all_indexes = []
    with open(dataset_path, "rb") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            dev = []
            indexes = []
            claim = line['claim']
            ##########################
            pos_set_ref = line['evidence']
            if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                continue
            pos_set = set()
            for evidence_set in line['evidence']:
                for evidence_sentence in evidence_set:
                    pos_set.add(get_whole_evidence([evidence_sentence], db))
            for evidence_sentence_ref in line['predicted_evidence']:
                evidence_sentence = get_whole_evidence([evidence_sentence_ref], db)
                dev.append((line['claim'], evidence_sentence))
                indexes.append(evidence_sentence_ref)
            ##########################
            if len(dev) == 0:
                dev.append((claim, 'no evidence for this claim'))
                indexes.append(('empty', 0))
            devs.append(dev)
            all_indexes.append(indexes)
    return devs, all_indexes
def dev_processing(self, data_path):
    jlr = JSONLineReader()
    with open(data_path, "r") as f:
        lines = jlr.process(f)

    devs = []
    labels = []
    for line in tqdm(lines):
        # if line['label'].upper() == "NOT ENOUGH INFO":
        #     continue
        label, dev = [], []
        pos_set_ref = line['evidence']
        if len(pos_set_ref) == 0 or not pos_set_ref[0]:
            continue
        pos_set = set()
        for evidence_set in line['evidence']:
            for evidence_sentence in evidence_set:
                pos_set.add(self.get_whole_evidence([evidence_sentence], self.db))
        for evidence_sentence_ref in line['predicted_evidence']:
            evidence_sentence = self.get_whole_evidence([evidence_sentence_ref], self.db)
            dev.append((line['claim'], evidence_sentence))
            if evidence_sentence in pos_set:
                label.append(1)
            else:
                label.append(0)
        if len(dev) == 0 or len(label) == 0:
            continue
        devs.append(dev)
        labels.append(label)
    return devs, labels
def predict_processing(self, datapath):
    jlr = JSONLineReader()
    devs = []
    all_indexes = []
    with open(datapath, "rb") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            dev = []
            indexes = []
            pages = set()
            # pages = line['predicted_pages']
            pages.update(page for page in line['predicted_pages'])
            # if len(pages) == 0:
            #     pages.add("Michael_Hutchence")
            claim = line['claim']
            p_lines = []
            # split each predicted page into sentences and attach the page id
            for page in pages:
                doc_lines = self.db.get_doc_lines(page)
                if not doc_lines:
                    continue
                p_lines.extend(self.get_valid_texts(doc_lines, page))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                dev.append((claim, doc_line[0]))
                indexes.append((doc_line[1], doc_line[2]))
            # print(len(dev))
            if len(dev) == 0:
                dev.append((claim, 'no evidence for this claim'))
                indexes.append(('empty', 0))
            devs.append(dev)
            all_indexes.append(indexes)
    return devs, all_indexes
def generate_prediction_files(predictions, p_sents_indexes, data_path, final_prediction_path):
    """
    Transform the classifier predictions into the list-of-dicts form expected by the scoring
    system and write them as JSONL.
    :param predictions:
    :param p_sents_indexes:
    :param data_path:
    :param final_prediction_path:
    :return:
    """
    jlr = JSONLineReader()
    final_predictions = []
    with open(data_path, "r") as f:
        lines = jlr.process(f)
        print(len(predictions))
        print(len(p_sents_indexes))
        print(len(lines))
        assert len(predictions) == len(p_sents_indexes) == len(lines)
        for idx, line in enumerate(lines):
            line['predicted_evidence'] = []
            line['predicted_label'] = 'refutes'
            predicted_sents = predictions[idx]
            sents_indexes = p_sents_indexes[idx]
            for i in range(len(sents_indexes)):
                if predicted_sents[i] == 1:
                    line['predicted_evidence'].append([sents_indexes[i][0], sents_indexes[i][1]])
            final_predictions.append(line)
    with open(final_prediction_path, "w") as f:
        for prediction in final_predictions:
            f.write(json.dumps(prediction) + '\n')
    return final_predictions
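# Sketch of the JSONL record shape written by generate_prediction_files. The field values are
# illustrative (the claim text is made up); the placeholder label and the
# [page_id, sentence_number] evidence pairs mirror what the function above emits.
def _demo_prediction_record():
    import json
    record = {
        "claim": "Some example claim.",
        "predicted_label": "refutes",  # placeholder; a later claim-verification step assigns the real label
        "predicted_evidence": [["Some_Page", 0], ["Some_Page", 3]],
    }
    return json.dumps(record)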
def tfidf_test_processing(base_path, dbfilename, test_data_path, test_store_path,
                          pro_extract_sents_path, h_max_length, s_max_length, iword_dict):
    dev_index_path = os.path.join(
        base_path,
        "data/train_data/dev.h_{}.s_{}.tfidf.indexes.p".format(h_max_length, s_max_length))
    devs, location_indexes = dev_data_loader(test_store_path, dbfilename, test_data_path)

    # Build the claim/sentence pairs restricted to the tf-idf predicted sentences. This is done
    # unconditionally so that new_location_indexes is also defined when the cached index file
    # already exists.
    inputs = []
    new_location_indexes = []
    with open(pro_extract_sents_path, "r") as f:
        jlr = JSONLineReader()
        lines = jlr.process(f)
        for i, line in enumerate(lines):
            pro_extract_sents = []
            sent_index = []
            predict_sents = line['predicted_sentences']
            claim = line['claim']
            predict_sents_set = set([(doc_id, sent_num) for doc_id, sent_num in predict_sents])
            # print(predict_sents_set)
            for j, index in enumerate(location_indexes[i]):
                if (index[0], index[1]) in predict_sents_set:
                    # print(devs[i][j])
                    pro_extract_sents.append((claim, devs[i][j][1]))
                    sent_index.append((index[0], index[1]))
            inputs.append(pro_extract_sents)
            new_location_indexes.append(sent_index)

    if os.path.exists(dev_index_path):
        with open(dev_index_path, "rb") as f:
            devs_indexes = pickle.load(f)
    else:
        devs_indexes = test_data_indexes(inputs, iword_dict, h_max_length, s_max_length)

    return devs_indexes, new_location_indexes
def dev_processing(db_filename, datapath):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    devs = []
    all_indexes = []
    with open(datapath, "rb") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            dev = []
            indexes = []
            pages = set()
            pages.update(page[0] for page in line['predicted_pages'])
            if len(pages) == 0:
                pages.add("Michael_Hutchence")
            claim = line['claim']
            p_lines = []
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                p_lines.extend(get_valid_texts(doc_lines, page))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                dev.append((claim, doc_line[0]))
                indexes.append((doc_line[1], doc_line[2]))
            # print(len(dev))
            if len(dev) == 0:
                dev.append((claim, 'no evidence for this claim'))
                indexes.append(('empty', 0))
            devs.append(dev)
            all_indexes.append(indexes)
    return devs, all_indexes
help=("String option specifying tokenizer type to use " "(e.g. 'corenlp')")) parser.add_argument('--num-workers', type=int, default=None, help='Number of CPU processes (for tokenizing, etc)') args = parser.parse_args() doc_freqs = None if args.use_precomputed: _, metadata = utils.load_sparse_csr(args.model) doc_freqs = metadata['doc_freqs'].squeeze() db = FeverDocDB("data/fever/fever.db") jlr = JSONLineReader() formatter = FEVERGoldFormatter(set(), FEVERLabelSchema()) jlr = JSONLineReader() with open(args.in_file, "r") as f, open( "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl".format( args.split, args.max_page, args.max_sent, "precomputed" if args.use_precomputed else "not_precomputed"), "w+") as out_file: lines = jlr.process(f) #lines = tf_idf_claims_batch(lines) for line in tqdm(lines): line = tf_idf_claim(line) out_file.write(json.dumps(line) + "\n")
def prepare_ranking(db_filename, datapath, k=10, num_sample=3):
    """
    :param db_filename: path to the wiki-pages database
    :param datapath: path to a FEVER set with predicted pages and predicted sentences
    :param k: keep at most the top k predicted sentences as negative candidates
    :param num_sample: number of negative examples to sample per positive sentence
    :return: list of (claim, positive sentence, [negative sentences]) triplets
    """
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    X = []
    with open(datapath, "r") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            p_lines = []
            pos_sents = []
            neg_sents = []
            claim = line['claim']
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            sampled_sents_idx = [(id, number) for id, number in line['predicted_sentences']]
            sampled_sents_idx = [index for index in sampled_sents_idx if index not in evidence_set]
            if k:
                sampled_sents_idx = sampled_sents_idx[:k]
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                elif (doc_line[1], doc_line[2]) in sampled_sents_idx:
                    neg_sents.append(doc_line[0])
                elif (doc_line[1], doc_line[2]) in evidence_set:
                    pos_sents.append(doc_line[0])
            # print(line)
            # print(sampled_sents_idx)
            # print(neg_sents)
            if len(sampled_sents_idx) < num_sample:
                continue
            for sent in pos_sents:
                neg_samples = random.sample(neg_sents, num_sample)
                triplet = (claim, sent, neg_samples)
                X.append(triplet)
    return X
def sample_ranking_train(db_filename, datapath, k=5, num_sample=2):
    """
    :param db_filename: path to the wiki-pages database
    :param datapath: path to the FEVER train set with predicted pages
    :param k: number of predicted sentences from which to select negative examples
    :param num_sample: number of negative examples to sample
    :return: X_claim, X_sents: claim and sentence pairs
             y: 1 if the sentence is in the evidence set, else 0
    """
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    X_claim = []
    X_sents = []
    y = []
    count = 0
    with open(datapath, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]
        for line in tqdm(lines):
            num_sampling = num_sample
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            p_lines = []
            neg_sents = []
            claim = line['claim']
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            sampled_sents_idx = [(id, number) for id, number in line['predicted_sentences']]
            sampled_sents_idx = sampled_sents_idx[0:k + 5]
            sampled_sents_idx = [index for index in sampled_sents_idx if index not in evidence_set]
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                elif (doc_line[1], doc_line[2]) in sampled_sents_idx:
                    neg_sents.append(doc_line[0])
                elif (doc_line[1], doc_line[2]) in evidence_set:
                    X_claim.append(claim)
                    X_sents.append(doc_line[0])
                    y.append(1)
            if len(sampled_sents_idx) < num_sample:
                count += 1
                num_sampling = len(sampled_sents_idx)
            samples = random.sample(neg_sents, num_sampling)
            for neg_example in samples:
                X_claim.append(claim)
                X_sents.append(neg_example)
                y.append(0)
    print(count)
    return X_claim, X_sents, y
def test_data(db_path, dataset_path, type="ranking"):
    """
    generate dev examples to feed into the classifier
    :param db_path:
    :param dataset_path:
    :param type:
    :return:
    """
    db = FeverDocDB(db_path)
    jsr = JSONLineReader()

    inputs = []
    X_claim = []
    X_sents = []
    indexes = []

    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
        for line in tqdm(lines):
            p_lines = []
            valid_lines = []
            claims = []
            sents_indexes = []
            claim = line['claim']
            # X_claim.append([claim])
            predicted_pages = line['predicted_pages']
            for page in predicted_pages:
                doc_lines = db.get_doc_lines(page[0])
                if not doc_lines:
                    # print(page)
                    continue
                doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page[0]] * len(doc_lines), range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                # print(doc_line[0])
                if type == "cos":
                    sents_indexes.append((doc_line[1], doc_line[2]))
                    valid_lines.append(doc_line[0])
                    claims.append(claim)
                elif type == "ranking":
                    sents_indexes.append((doc_line[1], doc_line[2]))
                    valid_lines.append((claim, doc_line[0]))
            if type == "cos":
                X_sents.append(valid_lines)
                X_claim.append(claims)
            elif type == "ranking":
                inputs.append(valid_lines)
                indexes.append(sents_indexes)
    if type == "cos":
        # only the cosine-similarity mode pairs the per-claim lists; the ranking mode has
        # already filled `inputs` and `indexes` above
        inputs = list(zip(X_claim, X_sents))
    return inputs, indexes
def sample_4_ranking(db_path, data_path, type="train", num_neg=3, seed=55):
    """
    sample a set of negative sentences and combine them with the positive sentence to form the training data
    :param db_path:
    :param data_path:
    :param type:
    :param num_neg:
    :param seed:
    :return:
    """
    random.seed(seed)
    if type == "train":
        claims, related_pages_sents, _, y = label_sents(db_path, data_path, type="train")
        train_triplets = []
        for i, claim in tqdm(enumerate(claims)):
            neg_sents = [j for j, label in enumerate(y[i]) if label != 1]
            for idx, label in enumerate(y[i]):
                if label == 1:
                    pos_sent = related_pages_sents[i][idx]
                    samples = random.sample(neg_sents, num_neg)
                    sampled_neg_sents = []
                    for index in samples:
                        sampled_neg_sents.append(related_pages_sents[i][index])
                    triplet = (claim, pos_sent, sampled_neg_sents)
                    train_triplets.append(triplet)
        return train_triplets
    elif type == "dev" or type == "test":
        """
        For the dev or test set, use claim and sentence pairs to get scores for each pair.
        """
        with open(db_path) as f:
            db = json.load(f)
        jsr = JSONLineReader()
        with open(data_path, "r") as f:
            lines = jsr.process(f)
            dev_examples = []
            pages_sents_indexes = []
            for line in tqdm(lines):
                ##########################
                pos_set_ref = line['evidence']
                if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                    continue
                pos_set = set()
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(get_whole_evidence([evidence_sentence], db))
                for evidence_sentence_ref in line['predicted_evidence']:
                    evidence_sentence = get_whole_evidence([evidence_sentence_ref], db)
                    dev_examples.append((line['claim'], evidence_sentence))
                    pages_sents_indexes.append(evidence_sentence_ref)
                ##########################
                # p_lines = []
                # feed_tuples = []
                # sents_indexes = []
                # claim = line['claim']
                # for page in line['predicted_pages']:
                #     doc_lines = db.get_doc_lines(page)
                #     if not doc_lines:
                #         # print(page)
                #         continue
                #     doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
                #                  for doc_line in doc_lines.split("\n")]
                #     p_lines.extend(zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))
                #
                # for doc_line in p_lines:
                #     if not doc_line[0]:
                #         continue
                #     else:
                #         # print(doc_line[0])
                #         sents_indexes.append((doc_line[1], doc_line[2]))
                #         feed_tuples.append((claim, doc_line[0]))
                #
                # dev_examples.append(feed_tuples)
                # pages_sents_indexes.append(sents_indexes)
        return dev_examples, pages_sents_indexes
def label_sents(db_path, data_path, type="train"):
    """
    Label every retrieved sentence with 1 if it is in the evidence set and 0 otherwise,
    to build the training data.
    :param db_path:
    :param data_path:
    :param type:
    :return:
    """
    db = FeverDocDB(db_path)
    jsr = JSONLineReader()

    claims = []
    related_pages_sents = []
    pages_sents_indexes = []
    y = []

    with open(data_path, "r") as f:
        lines = jsr.process(f)
        count = 0
        for line in tqdm(lines):
            if line['label'] == "NOT ENOUGH INFO" and type == "train":
                continue
            p_lines = []
            valid_lines = []
            line_labels = []
            sents_indexes = []
            claim = line['claim']
            evidences = line['evidence']
            evidence_set = set()
            pages_list = []
            for evidence in evidences:
                for sent in evidence:
                    evidence_set.add((sent[2], sent[3]))
                    pages_list.append(sent[2])
            # predicted_pages = line['predicted_pages']
            predicted_pages = [page[0] for page in line['predicted_pages']]
            predicted_pages = predicted_pages + pages_list
            predicted_pages = set(predicted_pages)
            if len(predicted_pages) > 5:
                count += 1
            claims.append(claim)
            for page in predicted_pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    # print(page)
                    continue
                doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))
            for doc_line in p_lines:
                # ignore empty sentences
                if not doc_line[0]:
                    continue
                # print(doc_line[0])
                sents_indexes.append((doc_line[1], doc_line[2]))
                valid_lines.append(doc_line[0])
                is_added = False
                for sent in evidence_set:
                    if sent[0] == doc_line[1] and sent[1] == doc_line[2]:
                        line_labels.append(1)
                        is_added = True
                        break
                if not is_added:
                    line_labels.append(0)
            # print(len(p_lines))
            # print(len(line_labels))
            # print(len(valid_lines))
            assert len(line_labels) == len(valid_lines) == len(sents_indexes)
            related_pages_sents.append(valid_lines)
            pages_sents_indexes.append(sents_indexes)
            y.append(line_labels)
    print(count)
    return claims, related_pages_sents, pages_sents_indexes, y
def in_class_sampling(db_filename, datapath, num_sample=1, k=5):
    """
    :param db_filename: path to the wiki-pages database
    :param datapath: path to the FEVER train set with predicted pages and predicted sentences
    :param k: number of predicted sentences from which to select negative examples
    :param num_sample: number of negative examples to sample
    :return: X: (claim, positive sentence, negative sentence) triplets
    """
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    X = []
    count = 0
    count1 = 1
    with open(datapath, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]
        for line in tqdm(lines):
            pos_pairs = []
            count1 += 1
            num_sampling = num_sample
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            p_lines = []
            neg_sents = []
            claim = line['claim']
            for evidence_set in line['evidence']:
                pos_sent = get_whole_evidence(evidence_set, db)
                print("claim:{} pos_sent:{}".format(claim, pos_sent))
                pos_pairs.append((claim, pos_sent))
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            sampled_sents_idx = [(id, number) for id, number in line['predicted_sentences']]
            sampled_sents_idx = sampled_sents_idx[0:k + 5]
            sampled_sents_idx = [index for index in sampled_sents_idx if index not in evidence_set]
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                p_lines.extend(get_valid_texts(doc_lines, page))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                elif (doc_line[1], doc_line[2]) in sampled_sents_idx:
                    neg_sents.append(doc_line[0])
                # elif (doc_line[1], doc_line[2]) in evidence_set:
                #     if count1 % 10000 == 0:
                #         print("page_id:{},sent_num:{}".format(doc_line[1], doc_line[2]))
                #         print("evidence_set:{}".format(evidence_set))
                #     pos_pairs.append((claim, doc_line[0]))
            if len(sampled_sents_idx) < num_sample:
                num_sampling = len(neg_sents)
            if num_sampling == 0:
                count += 1
                continue
            for pair in pos_pairs:
                samples = random.sample(neg_sents, num_sampling)
                for sample in samples:
                    X.append((pair[0], pair[1], sample))
                    if count1 % 10000 == 0:
                        print("claim:{},pos:{},neg:{}".format(claim, pair[1], sample))
    print(count)
    return X