def load_feature_by_data_set(data_set_path: str, feature_path: str,
                             max_sent_num: int):
    from common.dataset.reader import JSONLineReader
    import pickle
    import os

    # Load the pre-computed sentence features and the mapping from
    # sentence key to row index in the feature matrix.
    with open(os.path.join(feature_path, 'feature.p'), 'rb') as f:
        features = pickle.load(f)
    with open(os.path.join(feature_path, 'data_idx_map.p'), 'rb') as f:
        data_idx_map = pickle.load(f)

    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)

    feature_dim = features.shape[1]
    padding = np.zeros([feature_dim], np.float32)
    claim_features = []
    evidence_features = []
    for line in lines:
        _id = line['id']
        key = _concat_sent(CLAIM, _id)
        claim_features.append(features[data_idx_map[key]])
        evidence_per_claim_features = []
        for sent in line['predicted_evidence']:
            page, line_num = sent[-2], sent[-1]
            key = _concat_sent(page, line_num)
            evidence_per_claim_features.append(features[data_idx_map[key]])
        # Truncate or zero-pad the evidence features to exactly max_sent_num rows.
        if len(evidence_per_claim_features) > max_sent_num:
            evidence_features.append(evidence_per_claim_features[:max_sent_num])
        else:
            for _ in range(max_sent_num - len(evidence_per_claim_features)):
                evidence_per_claim_features.append(padding)
            evidence_features.append(evidence_per_claim_features)
    return np.asarray(claim_features, np.float32), np.asarray(evidence_features, np.float32)
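

# Hedged usage sketch for load_feature_by_data_set (not part of the original
# module): the paths below are placeholders, and the shapes follow from the
# padding/truncation logic above.
def _example_load_feature_by_data_set():
    claim_feats, evidence_feats = load_feature_by_data_set(
        "data/fever/dev.jsonl",  # hypothetical dataset path
        "data/features",         # hypothetical directory holding feature.p / data_idx_map.p
        max_sent_num=5)
    # claim_feats: (num_claims, feature_dim), float32
    # evidence_feats: (num_claims, 5, feature_dim), float32, zero-padded
    return claim_feats.shape, evidence_feats.shape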
def number_feature(data_set_path: str, db_path: str, max_sent_num: int):
    from common.dataset.reader import JSONLineReader

    db = FeverDocDB(db_path)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    num_feat = np.zeros([len(lines), max_sent_num, 3], dtype=np.int32)
    for i, line in enumerate(lines):
        claim_text = line['claim']
        claim_tokens = tokenize(claim_text)
        # Collect all numbers mentioned in the claim.
        all_nums = set()
        for token in claim_tokens:
            if is_token_numeric(token):
                all_nums.add(float(token))
        for j, evidence in enumerate(line['predicted_evidence']):
            if j >= max_sent_num:
                break
            page, line_num = evidence[-2], evidence[-1]
            all_evidence_nums = []
            evidence_text = evidence_num_to_text(db, page, line_num)
            evidence_tokens = tokenize(evidence_text)
            for token in evidence_tokens:
                if is_token_numeric(token):
                    all_evidence_nums.append(float(token))
            # Three flags per evidence sentence: it contains a number, it contains
            # a number also found in the claim, it contains a number not in the claim.
            has_num = len(all_evidence_nums) > 0
            has_identical_num = any(n in all_nums for n in all_evidence_nums)
            has_different_num = any(n not in all_nums for n in all_evidence_nums)
            num_feat[i, j, 0], num_feat[i, j, 1], num_feat[i, j, 2] = \
                _interprete_num_result(has_num, has_identical_num, has_different_num)
    return num_feat
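

# Hedged usage sketch for number_feature (not from the original code): the
# dataset and database paths are placeholders; the shape follows from the
# allocation above.
def _example_number_feature():
    feats = number_feature("data/fever/dev.jsonl", "data/fever/fever.db", max_sent_num=5)
    # feats: (num_claims, 5, 3) int32 array; per evidence sentence the three values
    # come from _interprete_num_result(has_num, has_identical_num, has_different_num).
    return feats.shape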
def generate_submission(_predictions, _ids, test_set_path, submission_path):
    """
    Generate submission file for shared task: http://fever.ai/task.html
    :param _ids:
    :param _predictions:
    :param test_set_path:
    :param submission_path:
    :return:
    """
    from common.dataset.reader import JSONLineReader
    from tqdm import tqdm
    import json

    _predictions_with_id = list(zip(_ids, _predictions))
    jlr = JSONLineReader()
    json_lines = jlr.read(test_set_path)
    os.makedirs(os.path.dirname(os.path.abspath(submission_path)), exist_ok=True)
    with open(submission_path, 'w') as f:
        for line in tqdm(json_lines):
            for i, evidence in enumerate(line['predicted_evidence']):
                line['predicted_evidence'][i][0] = normalize(evidence[0])
            _id = line['id']
            # Fall back to label index 2 if no prediction is found for this claim id.
            _pred_label = prediction_2_label(2)
            for _pid, _plabel in _predictions_with_id:
                if _pid == _id:
                    _pred_label = prediction_2_label(_plabel)
                    break
            obj = {
                "id": _id,
                "predicted_label": _pred_label,
                "predicted_evidence": line['predicted_evidence']
            }
            f.write(json.dumps(obj))
            f.write('\n')
def main(db_file, k_wiki, in_file, out_file, add_claim=True, parallel=True):
    # tfidf_path = "data/index/fever-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz"
    method = Doc_Retrieval(database_path=db_file, add_claim=add_claim, k_wiki_results=k_wiki)
    processed = dict()
    path = os.getcwd()
    jlr = JSONLineReader()
    lines = jlr.read(os.path.join(path, in_file))

    # Resume from a previous run if a progress file exists.
    if os.path.isfile(os.path.join(path, in_file + ".progress")):
        with open(os.path.join(path, in_file + ".progress"), 'rb') as f_progress:
            import pickle
            progress = pickle.load(f_progress)
            print(os.path.join(path, in_file + ".progress") + " exists. Load it as progress file.")
    else:
        progress = dict()

    try:
        with ThreadPool(processes=4 if parallel else None) as p:
            for line in tqdm(
                    get_map_function(parallel, p)(
                        lambda l: process_line_with_progress(method, l, progress), lines),
                    total=len(lines)):
                processed[line['id']] = line
                progress[line['id']] = line
                # time.sleep(0.5)
        with open(os.path.join(path, out_file), "w+") as f2:
            for line in lines:
                f2.write(json.dumps(processed[line['id']]) + "\n")
    finally:
        # Always persist progress so an interrupted run can be resumed.
        with open(os.path.join(path, in_file + ".progress"), 'wb') as f_progress:
            import pickle
            pickle.dump(progress, f_progress, pickle.HIGHEST_PROTOCOL)
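

# Hedged invocation sketch for main (not part of the original script): the file
# names and the k_wiki value are placeholders for illustration only; in_file and
# out_file are resolved relative to os.getcwd() as in the function above.
def _example_doc_retrieval_run():
    main("data/fever/fever.db",        # hypothetical FEVER SQLite database
         7,                            # hypothetical number of wiki results per claim
         "data/fever/dev.jsonl",       # hypothetical input file
         "data/fever/dev.wiki.jsonl",  # hypothetical output file
         add_claim=True, parallel=True)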
def generate_submission(_predictions, test_set_path, submission_path):
    """
    Generate submission file for shared task: http://fever.ai/task.html
    :param _predictions:
    :param test_set_path:
    :param submission_path:
    :return:
    """
    jlr = JSONLineReader()
    json_lines = jlr.read(test_set_path)
    with open(submission_path, 'w') as f:
        # _predictions is assumed to be aligned line-by-line with the test set.
        for _prediction, line in tqdm(zip(_predictions, json_lines)):
            for i, evidence in enumerate(line['predicted_evidence']):
                line['predicted_evidence'][i][0] = normalize(evidence[0])
            obj = {
                "id": line['id'],
                "predicted_evidence": line['predicted_evidence'],
                "predicted_label": prediction_2_label(_prediction)
            }
            f.write(json.dumps(obj))
            f.write('\n')
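

# Illustrative sketch of one line of the submission file written above
# (the values are invented; only the key names and structure come from the code):
def _example_submission_line():
    import json
    obj = {
        "id": 137334,                                      # hypothetical claim id
        "predicted_evidence": [["Colin_Kaepernick", 6]],   # [page, line_number] pairs
        "predicted_label": "SUPPORTS",
    }
    return json.dumps(obj)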
                                                        max_evidence]
    _line['scores'] = _line['scores'][:args.max_evidence]
    return _line


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='/path/to/input/file')
    parser.add_argument('output', help='/path/to/output/file')
    parser.add_argument('--max_evidence', help='max num of evidences', type=int, default=5)
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("replace_noise_dataset")
    random.seed(55)
    jlr = JSONLineReader()
    lines = jlr.read(args.input)
    counter = 0
    with open(args.output, 'w') as f:
        for i, line in tqdm(enumerate(lines)):
            # Fill in gold evidence for verifiable claims whose gold sentences
            # were not retrieved by the sentence selection step.
            if line['label'] != 'NOT ENOUGH INFO' and not is_gold_evidence_predicted(line):
                counter += 1
                logger.info("line " + str(i + 1) + " should be filled")
                line = random_fill_gold_evidence(line)
            f.write(json.dumps(line) + '\n')
    logger.info(str(counter) + " samples filled with gold evidence")
import argparse
import json
import os

from sklearn.model_selection import train_test_split

from common.dataset.reader import JSONLineReader

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', help='/path/to/jsonl/file', required=True)
    parser.add_argument('--out', help='/path/to/output/folder', default="data/rte_cor")
    parser.add_argument('--split-rate', type=float, help='split rate for test set', default=0.2)
    args = parser.parse_args()

    jlr = JSONLineReader()
    all_data = jlr.read(args.file)
    # Hold out a test split with a fixed random seed for reproducibility.
    train, test = train_test_split(all_data, test_size=args.split_rate, random_state=55)

    os.makedirs(args.out, exist_ok=True)
    train_path = os.path.join(args.out, "train.jsonl")
    test_path = os.path.join(args.out, "test.jsonl")
    with open(train_path, "w") as train_file:
        for item in train:
            train_file.write(json.dumps(item) + "\n")
    with open(test_path, "w") as test_file:
        for item in test:
            test_file.write(json.dumps(item) + "\n")
'''
--file '/home/hanselowski/workspace/athene-fever/snopes-fever/data/snopes-data/snopes.claim.jsonl'
--out '/home/hanselowski/workspace/athene-fever/snopes-fever/data/snopes-data/data-out'
parser = argparse.ArgumentParser()
parser.add_argument('db_path', type=str, help='/path/to/fever.db')
args = parser.parse_args()

jlr = JSONLineReader()
docdb = FeverDocDB(args.db_path)

# Candidate pages for sampling: non-empty documents with informative titles.
idx = docdb.get_non_empty_doc_ids()
idx = list(filter(lambda item: not uninformative(item), tqdm(idx)))

r = SimpleRandom.get_instance()

with open("data/fever/test.ns.rand.jsonl", "w+") as f:
    for line in jlr.read("data/fever-data/test.jsonl"):
        if line["label"] == "NOT ENOUGH INFO":
            # Replace the empty evidence of NEI claims with a randomly sampled page.
            for evidence_group in line['evidence']:
                for evidence in evidence_group:
                    evidence[2] = idx[r.next_rand(0, len(idx))]
                    evidence[3] = -1
        f.write(json.dumps(line) + "\n")

with open("data/fever/dev.ns.rand.jsonl", "w+") as f:
    for line in jlr.read("data/fever-data/dev.jsonl"):
        if line["label"] == "NOT ENOUGH INFO":
            for evidence_group in line['evidence']:
                for evidence in evidence_group:
                    evidence[2] = idx[r.next_rand(0, len(idx))]
                    evidence[3] = -1
        f.write(json.dumps(line) + "\n")
import argparse
import json
import os

from common.dataset.reader import JSONLineReader

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--submission', help='/path/to/submission/file', required=True)
    parser.add_argument('--data', help='/path/to/data/file', required=True)
    parser.add_argument('--output', help='/path/to/output/file', required=True)
    args = parser.parse_args()

    jlr = JSONLineReader()
    submission_lines = jlr.read(args.submission)
    data_lines = jlr.read(args.data)
    assert len(submission_lines) == len(data_lines), \
        "lengths of submission and data set are different!"

    # Index submission lines by claim id, then re-emit them in data-set order.
    submission_dict = {}
    for line in submission_lines:
        submission_dict[line['id']] = line
    assert len(submission_dict) == len(submission_lines), \
        "lines in submission are not unique!"

    sorted_lines = []
    for d in data_lines:
        sorted_lines.append(submission_dict[d['id']])
    assert len(sorted_lines) == len(data_lines), \
        "some claims from data set are missing in submission!"

    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    with open(args.output, 'w') as f:
        for line in sorted_lines:
            f.write(json.dumps(line) + '\n')
    if len(gold_sents) > 0:
        logger.warn(str(len(gold_sents)) + " gold sentences cannot be filled into prediction")
    return selected_sents


if __name__ == '__main__':
    LogHelper.setup()
    logger = LogHelper.get_logger('fill_gold_sentences')
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help='/path/to/input/file', required=True)
    parser.add_argument('--output', help='/path/to/output/file', required=True)
    parser.add_argument('--max-sent', type=int, help='Maximal number of sentences per claim', default=10)
    args = parser.parse_args()

    jlr = JSONLineReader()
    data = jlr.read(args.input)
    with open(args.output, "w+") as output_file:
        # Use a distinct loop variable so it does not shadow the full data list.
        for line in tqdm(data):
            if line['verifiable'] != 'NOT VERIFIABLE':
                pred_sents = line['predicted_sentences']
                gold_evidences = line['evidence']
                gold_sents = _sents_from_evidences(gold_evidences)
                filled_pred_sents = _fill_pred_sents_with_gold(pred_sents, gold_sents, args.max_sent)
                line['predicted_sentences'] = filled_pred_sents
            output_file.write(json.dumps(line) + "\n")