    def nextblock(self):
        self.block = (self.block + 1) if self.block is not None else 0
        if self.file is not None:
            self.file.close()
        self.file = open(
            os.path.join(
                "data", "fever", "wiki",
                "wiki-{0}.jsonl".format(str.zfill(str(self.block), 3))), "w+")

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()


if __name__ == "__main__":
    blocks = int(sys.argv[1])
    LogHelper.setup()
    logger = LogHelper.get_logger("convert")
    blk = Corpus("page", os.path.join("data", "fever"), blocks,
                 lambda x: (x, read_words(x)))
    with BlockWriter(os.path.join("data", "fever", "wiki"), 50000) as f:
        for page, body in tqdm(blk):
            f.write(
                json.dumps({
                    "id": page,
                    "text": " ".join(body[1]),
                    "lines": body[0]
                }))
    neg_indices = reversed(neg_indices)
    for i in neg_indices:
        sent = selected_sents[i]
        selected_sents[i] = _replace_sent_with_str(sent, gold_sents.pop())
        if len(gold_sents) == 0:
            return selected_sents
    if len(gold_sents) > 0:
        logger.warn(
            str(len(gold_sents)) +
            " gold sentences cannot be filled into prediction")
    return selected_sents


if __name__ == '__main__':
    LogHelper.setup()
    logger = LogHelper.get_logger('fill_gold_sentences')
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help='/path/to/input/file', required=True)
    parser.add_argument('--output', help='/path/to/output/file', required=True)
    parser.add_argument('--max-sent',
                        type=int,
                        help='Maximal number of sentences per claim',
                        default=10)
    args = parser.parse_args()
    jlr = JSONLineReader()
    data = jlr.read(args.input)
    with open(args.output, "w+") as output_file:
        # use a distinct loop variable so the full data set is not shadowed
        for record in tqdm(data):
            if record['verifiable'] != 'NOT VERIFIABLE':
                pred_sents = record['predicted_sentences']
                gold_evidences = record['evidence']
                        required=True)
    parser.add_argument('-o',
                        '--output',
                        help='/path/to/output/file',
                        required=True)
    parser.add_argument('-ip',
                        '--input-pickle',
                        help='/path/to/input/pickle/file')
    parser.add_argument('-op',
                        '--output-pickle',
                        help='/path/to/output/pickle/file',
                        required=True)
    parser.add_argument('-db', '--db', help='/path/to/db/file', required=True)
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("generate_paths")
    db = FeverDocDB(args.db)
    jlr = JSONLineReader()
    lines = jlr.read(args.input)
    if args.input_pickle is not None:
        with open(args.input_pickle, 'rb') as f:
            per_claim_dict = pickle.load(f)
    else:
        logger.warn("no pickle file loaded!")
        per_claim_dict = dict()
    with open(args.output, 'w') as f, \
            ThreadPoolExecutor(max_workers=8) as executor:
        future_map = dict()
        for line in tqdm(lines):
            _id = line['id']
            claim_text = line['claim']
from common.util.log_helper import LogHelper
from retrieval.fever_doc_db import FeverDocDB
from rte.parikh.reader import FEVERReader
import argparse
import numpy as np
from rte.riedel.data import FEVERGoldFormatter, FEVERLabelSchema
# from scripts.retrieval.sentence.process_tfidf import XTermFrequencyFeatureFunction
from retrieval.process_tfidf import XTermFrequencyFeatureFunction
from rte.tmp.nei_rte_model import NeiRteModel

LogHelper.setup()
logger = LogHelper.get_logger(__name__)  # pylint: disable=invalid-name


def tf_idf_sim(claim, lines):
    test = []
    for line in lines:
        test.append({"claim": claim, "text": line})
    return tf.lookup(test).reshape(-1).tolist()


def eval_model(db: FeverDocDB, args) -> Model:
    # archive = load_archive(args.archive_file, cuda_device=args.cuda_device, overrides=args.overrides)
    # config = archive.config
    # ds_params = config["dataset_reader"]
    #
def embed_claims(claims: List,
                 db: Union[str, FeverDocDB],
                 fasttext_model: Union[str, FastText],
                 glove_path: str = None,
                 vocab_dict: Dict[str, int] = None,
                 glove_embeddings=None,
                 predicted: bool = True,
                 threshold_b_sent_num=None,
                 threshold_b_sent_size=50,
                 threshold_h_sent_size=50,
                 is_snopes=False):
    assert (vocab_dict is not None and glove_embeddings is not None) \
        or glove_path is not None, \
        "Either vocab_dict and glove_embeddings, or glove_path should be not None"
    if vocab_dict is None or glove_embeddings is None:
        vocab, glove_embeddings = load_whole_glove(glove_path)
        vocab_dict = vocab_map(vocab)
    logger = LogHelper.get_logger("embed_data_set_given_vocab")
    logger.debug("number of claims: {}".format(len(claims)))
    datas, labels = read_data_set_from_lines(claims,
                                             db,
                                             predicted,
                                             is_snopes=is_snopes)
    logger.debug("number of claims / evidence sets: {} / {}".format(
        len(datas["h"]), len(datas["b"])))
    heads_ft_embeddings, fasttext_model = single_sentence_set_2_fasttext_embedded(
        datas['h'], fasttext_model)
    logger.debug("Finished sentence to FastText embeddings for claims")
    heads_ids = single_sentence_set_2_ids_given_vocab(datas['h'], vocab_dict)
    logger.debug("Finished sentence to IDs for claims")
    bodies_ft_embeddings, fasttext_model = multi_sentence_set_2_fasttext_embedded(
        datas['b'], fasttext_model)
    logger.debug("Finished sentence to FastText embeddings for evidences")
    bodies_ids = multi_sentence_set_2_ids_given_vocab(datas['b'], vocab_dict)
    logger.debug("Finished sentence to IDs for evidences")
    h_ft_np = fasttext_padding_for_single_sentence_set_given_size(
        heads_ft_embeddings, threshold_h_sent_size)
    logger.debug(
        "Finished padding FastText embeddings for claims. Shape of h_ft_np: {}"
        .format(str(h_ft_np.shape)))
    b_ft_np = fasttext_padding_for_multi_sentences_set(bodies_ft_embeddings,
                                                       threshold_b_sent_num,
                                                       threshold_b_sent_size)
    logger.debug(
        "Finished padding FastText embeddings for evidences. Shape of b_ft_np: {}"
        .format(str(b_ft_np.shape)))
    h_np, h_sent_sizes = ids_padding_for_single_sentence_set_given_size(
        heads_ids, threshold_h_sent_size)
    logger.debug("Finished padding claims")
    b_np, b_sizes, b_sent_sizes = ids_padding_for_multi_sentences_set(
        bodies_ids, threshold_b_sent_num, threshold_b_sent_size)
    logger.debug("Finished padding evidences")
    processed_data_set = {
        'data': {
            'h_np': h_np,
            'b_np': b_np,
            'h_ft_np': h_ft_np,
            'b_ft_np': b_ft_np,
            'h_sent_sizes': h_sent_sizes,
            'b_sent_sizes': b_sent_sizes,
            'b_sizes': b_sizes
        }
    }
    return processed_data_set, fasttext_model, vocab_dict, glove_embeddings, \
        threshold_b_sent_num, threshold_b_sent_size
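# Illustrative shape contract (added example; the concrete numbers are
# hypothetical, not taken from the original file). For N claims with
# threshold_b_sent_num=5, threshold_b_sent_size=50, threshold_h_sent_size=50
# and FastText dimension d, the padded arrays would plausibly be:
#   h_ft_np: (N, 50, d)    b_ft_np: (N, 5, 50, d)
#   h_np:    (N, 50)       b_np:    (N, 5, 50)
# with h_sent_sizes / b_sent_sizes holding the pre-padding token counts and
# b_sizes the number of evidence sentences per claim.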
    with open(fever_data_file, 'w') as f:
        for row in claim_dict:
            f.write(json.dumps(row) + '\n')
    page_dict = dict()
    for url_pages in snippet_dict.values():
        for page in url_pages:
            page_dict[page['id']] = {
                'lines': page['lines'],
                'stance': page['stance']
            }
    with open(fever_page_file, 'w') as f:
        json.dump(page_dict, f, indent=4)


if __name__ == '__main__':
    import argparse
    from common.util.log_helper import LogHelper
    parser = argparse.ArgumentParser()
    parser.add_argument('--snopes', help='/path/to/snopes/file', required=True)
    parser.add_argument('--out-page',
                        help='/path/to/page/output/file',
                        required=True)
    parser.add_argument('--out-claim',
                        help='/path/to/claim/output/file',
                        required=True)
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger('snopes')
    snopes_file_2_fever_data_set(args.snopes, args.out_page, args.out_claim)
    return os.path.exists(os.path.join("models", "{0}.model".format(mname)))


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


if __name__ == "__main__":
    LogHelper.setup()
    logger = LogHelper.get_logger(__name__)
    parser = argparse.ArgumentParser()
    parser.add_argument('db', type=str, help='db file path')
    parser.add_argument('test', type=str, help='test file path')
    parser.add_argument("--model", type=str, help="model name")
    parser.add_argument("--sentence", type=str2bool, default=False)
    parser.add_argument("--log", type=str, default=None)
    args = parser.parse_args()
    logger.info("Loading DB {0}".format(args.db))
    db = FeverDocDB(args.db)
    mname = args.model
    logger.info("Model name is {0}".format(mname))
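# Why str2bool exists (added note): argparse's bare type=bool treats any
# non-empty string, including "false", as True, so a converter like the one
# above is the usual workaround. Example behaviour:
#   str2bool("yes") -> True     str2bool("0") -> False
#   str2bool("maybe") raises argparse.ArgumentTypeError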
def main(mode, config, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + mode)
    logger.info("model: " + mode + ", config: " + str(config))
    if hasattr(Config, 'use_inter_evidence_comparison'):
        use_inter_evidence_comparison = Config.use_inter_evidence_comparison
    else:
        use_inter_evidence_comparison = False
    if hasattr(Config, 'use_claim_evidences_comparison'):
        use_claim_evidences_comparison = Config.use_claim_evidences_comparison
    else:
        use_claim_evidences_comparison = False
    if hasattr(Config, 'use_extra_features'):
        use_extra_features = Config.use_extra_features
    else:
        use_extra_features = False
    if hasattr(Config, 'use_numeric_feature'):
        use_numeric_feature = Config.use_numeric_feature
    else:
        use_numeric_feature = False
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_end_2_end_hyper_param))
    logger.info("use_inter_evidence_comparison: " +
                str(use_inter_evidence_comparison))
    logger.info("use_extra_features: " + str(use_extra_features))
    logger.info("use_numeric_feature: " + str(use_numeric_feature))
    logger.info("use_claim_evidences_comparison: " +
                str(use_claim_evidences_comparison))
    if mode == 'train':
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)
            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            if use_extra_features:
                assert hasattr(
                    Config, 'feature_path'
                ), "Config should have feature_path if Config.use_feature is True"
                training_claim_features, training_evidence_features = load_feature_by_data_set(
                    Config.training_set_file, Config.feature_path,
                    Config.max_sentences)
                valid_claim_features, valid_evidence_features = load_feature_by_data_set(
                    Config.dev_set_file, Config.feature_path,
                    Config.max_sentences)
                training_set['data']['h_feats'] = training_claim_features
                training_set['data']['b_feats'] = training_evidence_features
                valid_set['data']['h_feats'] = valid_claim_features
                valid_set['data']['b_feats'] = valid_evidence_features
            if use_numeric_feature:
                training_num_feat = number_feature(Config.training_set_file,
                                                   Config.db_path,
                                                   Config.max_sentences)
                valid_num_feat = number_feature(Config.dev_set_file,
                                                Config.db_path,
                                                Config.max_sentences)
                training_set['data']['num_feat'] = training_num_feat
                valid_set['data']['num_feat'] = valid_num_feat
            if use_inter_evidence_comparison:
                training_concat_sent_indices, training_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices'] = training_concat_sent_indices
                training_set['data'][
                    'b_concat_sizes'] = training_concat_sent_sizes
                valid_concat_sent_indices, valid_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices'] = valid_concat_sent_indices
                valid_set['data']['b_concat_sizes'] = valid_concat_sent_sizes
            if use_claim_evidences_comparison:
                training_all_evidences_indices, training_all_evidences_sizes = generate_concat_indices_for_claim(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices_for_h'] = training_all_evidences_indices
                training_set['data'][
                    'b_concat_sizes_for_h'] = training_all_evidences_sizes
                valid_all_evidences_indices, valid_all_evidences_sizes = generate_concat_indices_for_claim(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices_for_h'] = valid_all_evidences_indices
                valid_set['data'][
                    'b_concat_sizes_for_h'] = valid_all_evidences_sizes
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    elif mode == 'test':
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        if use_extra_features:
            assert hasattr(
                Config, 'feature_path'
            ), "Config should have feature_path if Config.use_feature is True"
            test_claim_features, test_evidence_features = load_feature_by_data_set(
                Config.test_set_file, Config.feature_path,
                Config.max_sentences)
            test_set['data']['h_feats'] = test_claim_features
            test_set['data']['b_feats'] = test_evidence_features
        if use_numeric_feature:
            test_num_feat = number_feature(Config.test_set_file,
                                           Config.db_path,
                                           Config.max_sentences)
            test_set['data']['num_feat'] = test_num_feat
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if use_inter_evidence_comparison:
            test_concat_sent_indices, test_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data']['b_concat_indices'] = test_concat_sent_indices
            test_set['data']['b_concat_sizes'] = test_concat_sent_sizes
        if use_claim_evidences_comparison:
            test_all_evidences_indices, test_all_evidences_sizes = generate_concat_indices_for_claim(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data'][
                'b_concat_indices_for_h'] = test_all_evidences_indices
            test_set['data']['b_concat_sizes_for_h'] = test_all_evidences_sizes
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    else:
        logger.error("Invalid argument --mode: " + mode +
                     " Argument --mode should be either 'train' or 'test'")
    return estimator
"predicted_label": prediction_2_label(_prediction) } f.write(json.dumps(obj)) f.write('\n') if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--mode', help='\'train\' or \'test\' or \'stub\'', required=True) parser.add_argument('--config', help='/path/to/config/file, in JSON format') args = parser.parse_args() LogHelper.setup() logger = LogHelper.get_logger( os.path.splitext(os.path.basename(__file__))[0] + "_" + args.mode) logger.info("parameters:\n" + str(vars(args))) if args.config is not None: Config.load_config(args.config) # loading FastText takes long time, so better pickle the loaded FastText model if os.path.splitext(Config.fasttext_path)[1] == '.p': with open(Config.fasttext_path, "rb") as ft_file: fasttext_model = pickle.load(ft_file) else: fasttext_model = Config.fasttext_path if args.mode == 'train': # # training mode training_set, fasttext_model, vocab, embeddings, _, _ = embed_data_set_with_glove_and_fasttext( Config.training_set_file, Config.db_path,
import argparse
import json
import os
import sqlite3
from hashlib import md5
import unicodedata as ud

from common.util.log_helper import LogHelper

LogHelper.setup()
logger = LogHelper.get_logger("FEVERcs Normalize")
BUFFER_SIZE, tmp_file = 100000, ""


def is_fever_db(file):
    try:
        with sqlite3.connect(file) as db:
            db.cursor().execute(
                "SELECT id, text, lines FROM documents LIMIT 1")
            return True
    except sqlite3.DatabaseError:
        return False


def normalize_sqlite(args):
    logger.info('Reading into a temporary database...')
    with sqlite3.connect(
            args.source_file) as source, sqlite3.connect(tmp_file) as target:
        c_source, c_target = source.cursor(), target.cursor()
        c_target.execute(
            "CREATE TABLE documents (id PRIMARY KEY, text, lines);")
def vocab_map(vocab):
    voc_dict = {}
    for i, v in enumerate(vocab):
        voc_dict[v] = i + 2
    voc_dict['PAD'] = 0
    voc_dict['UNK'] = 1
    return voc_dict


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('db', help='/path/to/db/file')
    parser.add_argument('output', help='/path/to/output/pickle/file')
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("generate_vocab_all_wiki")
    db = FeverDocDB(args.db)
    vocab = set()
    for doc in tqdm(db.get_doc_ids()):
        lines = db.get_doc_lines(doc)
        lines = lines.split("\n")
        for line in lines:
            segments = line.split("\t")
            if len(segments) < 2:
                continue
            line = segments[1]
            if line.strip() == "":
                continue
            tokens = set(token.lower() for token in tokenize(clean_text(line)))
            vocab.update(tokens)
    logger.info("total size of vocab: " + str(len(vocab)))
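# Usage sketch for vocab_map (added example): word ids are shifted by two so
# that 0 and 1 stay reserved for padding and unknown tokens:
#   vocab_map(["the", "cat"]) == {"the": 2, "cat": 3, "PAD": 0, "UNK": 1}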
""" logger.info("Starting document retrieval for training set...") document_retrieval_main(Config.db_path, Config.document_k_wiki, Config.raw_training_set, Config.training_doc_file, Config.document_add_claim, Config.document_parallel) logger.info("Finished document retrieval for training set.") logger.info("Starting document retrieval for dev set...") document_retrieval_main(Config.db_path, Config.document_k_wiki, Config.raw_dev_set, Config.dev_doc_file, Config.document_add_claim, Config.document_parallel) logger.info("Finished document retrieval for dev set.") logger.info("Starting document retrieval for test set...") document_retrieval_main(Config.db_path, Config.document_k_wiki, Config.raw_test_set, Config.test_doc_file, Config.document_add_claim, Config.document_parallel) """ document_retrieval_main(args.database, 7, args.infile, args.outfile, args.path_wiki_titles, True, True) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--infile') parser.add_argument('--outfile') parser.add_argument('--database') parser.add_argument('--path_wiki_titles') args = parser.parse_args() LogHelper.setup() logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0]) logger.info("=========================== Subtask 1. Document Retrieval ==========================================") print (args.database, args.infile, args.outfile) document_retrieval(logger)
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM_MTL arguments: " + str(Config.esim_mtl_hyper_param))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                X_dict, y_dict = pickle.load(f)
        else:
            training_set_claim_valid, training_set_evidence_eval, vocab, embeddings, _, _ = \
                embed_data_set_with_evidence_label(
                    Config.training_set_file,
                    Config.db_path,
                    glove_path=Config.glove_path,
                    threshold_b_sent_num=Config.max_sentences,
                    threshold_b_sent_size=Config.max_sentence_size,
                    threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = training_set_claim_valid['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set_claim_valid['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set_claim_valid['data']['h_sizes'] = h_sizes
            training_set_claim_valid['data']['h_np'] = np.expand_dims(
                training_set_claim_valid['data']['h_np'], 1)
            h_sent_sizes = training_set_evidence_eval['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set_evidence_eval['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set_evidence_eval['data']['h_sizes'] = h_sizes
            training_set_evidence_eval['data']['h_np'] = np.expand_dims(
                training_set_evidence_eval['data']['h_np'], 1)
            valid_set_claim_valid, valid_set_evidence_eval, _, _, _, _ = \
                embed_data_set_with_evidence_label(
                    Config.dev_set_file,
                    Config.db_path,
                    vocab_dict=vocab,
                    glove_embeddings=embeddings,
                    threshold_b_sent_num=Config.max_sentences,
                    threshold_b_sent_size=Config.max_sentence_size,
                    threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = valid_set_claim_valid['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set_claim_valid['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            valid_set_claim_valid['data']['h_sizes'] = h_sizes
            valid_set_claim_valid['data']['h_np'] = np.expand_dims(
                valid_set_claim_valid['data']['h_np'], 1)
            h_sent_sizes = valid_set_evidence_eval['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set_evidence_eval['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            valid_set_evidence_eval['data']['h_sizes'] = h_sizes
            valid_set_evidence_eval['data']['h_np'] = np.expand_dims(
                valid_set_evidence_eval['data']['h_np'], 1)
            X_dict = {
                'claim': {
                    'train': training_set_claim_valid['data'],
                    'valid': valid_set_claim_valid['data'],
                },
                'evidence': {
                    'train': training_set_evidence_eval['data'],
                    'valid': valid_set_evidence_eval['data']
                },
                'embedding': embeddings
            }
            y_dict = {
                'claim': {
                    'train': training_set_claim_valid['label'],
                    'valid': valid_set_claim_valid['label']
                },
                'evidence': {
                    'train': training_set_evidence_eval['label'],
                    'valid': valid_set_evidence_eval['label']
                }
            }
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_dict),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_dict)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _, _ = embed_data_set_with_evidence_label(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("BERT sentence embedding arguments: " +
                str(Config.bert_sent_hyper_parameter))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid,
                 Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_data_set_from_jsonl(
                Config.training_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(
                Config.dev_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            X_train['b_sizes'] = get_num_sents_of_bodies(X_train['b'])
            X_valid['b_sizes'] = get_num_sents_of_bodies(X_valid['b'])
            b_train = X_train['b']
            b_encoded_train = encode_multi_sentence_set_with_bert(
                b_train,
                Config.max_sentences,
                port=Config.bert_port,
                port_out=Config.bert_port_out)
            X_train['b'] = b_encoded_train
            logger.debug("b_encoded_train.shape: " +
                         str(b_encoded_train.shape))
            h_train = X_train['h']
            h_encoded_train = encode_single_sentence_set_with_bert(
                h_train, port=Config.bert_port, port_out=Config.bert_port_out)
            X_train['h'] = h_encoded_train
            logger.debug("h_encoded_train.shape: " +
                         str(h_encoded_train.shape))
            b_valid = X_valid['b']
            b_encoded_valid = encode_multi_sentence_set_with_bert(
                b_valid,
                Config.max_sentences,
                port=Config.bert_port,
                port_out=Config.bert_port_out)
            X_valid['b'] = b_encoded_valid
            logger.debug("b_encoded_valid.shape: " +
                         str(b_encoded_valid.shape))
            h_valid = X_valid['h']
            h_encoded_valid = encode_single_sentence_set_with_bert(
                h_valid, port=Config.bert_port, port_out=Config.bert_port_out)
            X_valid['h'] = h_encoded_valid
            logger.debug("h_encoded_valid.shape: " +
                         str(h_encoded_valid.shape))
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(
                        (X_train, Y_labels_train, X_valid, Y_labels_valid),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(
            Config.test_set_file,
            Config.db_path,
            num_sentences=Config.max_sentences,
            is_snopes=is_snopes)
        X_test['b_sizes'] = get_num_sents_of_bodies(X_test['b'])
        b_test = X_test['b']
        b_encoded_test = encode_multi_sentence_set_with_bert(
            b_test,
            Config.max_sentences,
            port=Config.bert_port,
            port_out=Config.bert_port_out)
        X_test['b'] = b_encoded_test
        logger.debug("b_encoded_test.shape: " + str(b_encoded_test.shape))
        h_test = X_test['h']
        h_encoded_test = encode_single_sentence_set_with_bert(
            h_test, port=Config.bert_port, port_out=Config.bert_port_out)
        X_test['h'] = h_encoded_test
        logger.debug("h_encoded_test.shape: " + str(h_encoded_test.shape))
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file,
                            Config.submission_file)
        if Y_labels_test is not None:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
import argparse
import json

from tqdm import tqdm

from common.dataset.reader import JSONLineReader
from common.util.log_helper import LogHelper

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='/path/to/input/file')
    parser.add_argument('output', help='/path/to/output/file')
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("separate_scores")
    jlr = JSONLineReader()
    lines = jlr.read(args.input)
    with open(args.output, 'w') as f:
        for obj in tqdm(lines):
            predicted_evidence = obj['predicted_evidence']
            new_predicted_evidence = []
            scores = []
            for evidence in predicted_evidence:
                new_predicted_evidence.append(evidence[0])
                scores.append(evidence[1])
            obj['predicted_evidence'] = new_predicted_evidence
            obj['scores'] = scores
            f.write(json.dumps(obj) + '\n')
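# Illustrative record (added example; the page name and score are
# hypothetical). The script splits [evidence, score] pairs into two parallel
# lists:
#   in:  {"predicted_evidence": [[["Some_Page", 3], 0.91]], ...}
#   out: {"predicted_evidence": [["Some_Page", 3]], "scores": [0.91], ...}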
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        training_set = embed_data_set_with_bert(
            Config.training_set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            is_snopes=is_snopes,
            port=Config.bert_port,
            port_out=Config.bert_port_out)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_bert_np'] = np.expand_dims(
            training_set['data']['h_bert_np'], 1)
        valid_set = embed_data_set_with_bert(
            Config.dev_set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            is_snopes=is_snopes,
            port=Config.bert_port,
            port_out=Config.bert_port_out)
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_bert_np'] = np.expand_dims(
            valid_set['data']['h_bert_np'], 1)
        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label']
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        test_set = embed_data_set_with_bert(
            Config.test_set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            is_snopes=is_snopes,
            port=Config.bert_port,
            port_out=Config.bert_port_out)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_bert_np'] = np.expand_dims(
            test_set['data']['h_bert_np'], 1)
        x_dict = {'X_test': test_set['data']}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)
    trainer.train()
    # Now tar up results
    archive_model(serialization_dir)
    return model


if __name__ == "__main__":
    LogHelper.setup()
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)
    parser = argparse.ArgumentParser()
    parser.add_argument('db', type=str, help='/path/to/saved/db.db')
    parser.add_argument(
        'param_path',
        type=str,
        help='path to parameter file describing the model to be trained')
    parser.add_argument("logdir", type=str)
    parser.add_argument("--filtering", type=str, default=None)
    parser.add_argument("--cuda-device", type=int, default=None,
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    # loading FastText takes a long time, so it is better to pickle the loaded FastText model
    if os.path.splitext(Config.fasttext_path)[1] == '.p':
        with open(Config.fasttext_path, "rb") as ft_file:
            fasttext_model = pickle.load(ft_file)
    else:
        fasttext_model = Config.fasttext_path
    if mode == RTERunPhase.train:
        # training mode
        training_set, fasttext_model, vocab, embeddings = \
            embed_data_set_with_glove_and_fasttext_claim_only(
                Config.training_set_file,
                fasttext_model,
                glove_path=Config.glove_path,
                threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_np'] = np.expand_dims(
            training_set['data']['h_np'], 1)
        training_set['data']['h_ft_np'] = np.expand_dims(
            training_set['data']['h_ft_np'], 1)
        valid_set, _, _, _ = embed_data_set_with_glove_and_fasttext_claim_only(
            Config.dev_set_file,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_np'] = np.expand_dims(valid_set['data']['h_np'],
                                                   1)
        valid_set['data']['h_ft_np'] = np.expand_dims(
            valid_set['data']['h_ft_np'], 1)
        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label'],
            'embedding': embeddings
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode (mode is an RTERunPhase, so a string comparison with
        # 'test' would never match; fall through to the test branch instead)
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _ = embed_data_set_with_glove_and_fasttext_claim_only(
            Config.test_set_file,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['h_ft_np'] = np.expand_dims(
            test_set['data']['h_ft_np'], 1)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
def main(mode, config, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + mode)
    logger.info("model: " + mode + ", config: " + str(config))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    # loading FastText takes a long time, so it is better to pickle the loaded FastText model
    if os.path.splitext(Config.fasttext_path)[1] == '.p':
        with open(Config.fasttext_path, "rb") as ft_file:
            fasttext_model = pickle.load(ft_file)
    else:
        fasttext_model = Config.fasttext_path
    if mode == 'train':
        # training mode
        training_set, fasttext_model, vocab, embeddings, _, _ = embed_data_set_with_glove_and_fasttext(
            Config.training_set_file,
            Config.db_path,
            fasttext_model,
            glove_path=Config.glove_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_np'] = np.expand_dims(
            training_set['data']['h_np'], 1)
        training_set['data']['h_ft_np'] = np.expand_dims(
            training_set['data']['h_ft_np'], 1)
        valid_set, _, _, _, _, _ = embed_data_set_with_glove_and_fasttext(
            Config.dev_set_file,
            Config.db_path,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_np'] = np.expand_dims(valid_set['data']['h_np'],
                                                   1)
        valid_set['data']['h_ft_np'] = np.expand_dims(
            valid_set['data']['h_ft_np'], 1)
        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label'],
            'embedding': embeddings
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    elif mode == 'test':
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _, _ = embed_data_set_with_glove_and_fasttext(
            Config.test_set_file,
            Config.db_path,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['h_ft_np'] = np.expand_dims(
            test_set['data']['h_ft_np'], 1)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    else:
        logger.error("Invalid argument --mode: " + mode +
                     " Argument --mode should be either 'train' or 'test'")
def fever_app(caller):
    global db, tokenizer, text_encoder, encoder, X_train, M_train, X, M, Y_train, Y, params, sess, \
        n_batch_train, db_file, drqa_index, max_page, max_sent, encoder_path, bpe_path, n_ctx, \
        n_batch, model_file
    # the original listed eval_logits twice; eval_mgpu_lm_losses is the name
    # actually assigned below
    global n_vocab, n_special, n_y, max_len, clf_token, eval_lm_losses, eval_clf_losses, \
        eval_mgpu_clf_losses, eval_logits, eval_mgpu_logits, eval_mgpu_lm_losses

    LogHelper.setup()
    logger = LogHelper.get_logger("papelo")

    logger.info("Load config")
    config = json.load(
        open(os.getenv("CONFIG_FILE", "configs/config-docker.json")))
    globals().update(config)
    print(globals())

    logger.info("Set Seeds")
    random.seed(42)
    np.random.seed(42)
    tf.set_random_seed(42)

    logger.info("Load FEVER DB")
    db = FeverDocDB(db_file)
    retrieval = TopNDocsTopNSents(db, max_page, max_sent, True, False,
                                  drqa_index)

    logger.info("Init word tokenizer")
    tokenizer = SimpleWordSplitter()

    # Prepare text encoder
    logger.info("Load BPE Text Encoder")
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    n_y = 3
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_batch_train = n_batch

    logger.info("Create TF Placeholders")
    X_train = tf.placeholder(tf.int32, [n_batch, 1, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [n_batch, 1, n_ctx])
    X = tf.placeholder(tf.int32, [None, 1, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 1, n_ctx])
    Y_train = tf.placeholder(tf.int32, [n_batch])
    Y = tf.placeholder(tf.int32, [None])

    logger.info("Model Setup")
    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y,
                                                         train=False,
                                                         reuse=None)
    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(
        X_train, M_train, Y_train)

    logger.info("Create TF Session")
    params = find_trainable_variables('model')
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=float(
        os.getenv("TF_GPU_MEMORY_FRACTION", "0.5")))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            gpu_options=gpu_options))
    sess.run(tf.global_variables_initializer())
    sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(model_file))])

    logger.info("Ready")

    def predict(instances):
        predictions = []
        for instance in tqdm(instances):
            sents = retrieval.get_sentences_for_claim(instance["claim"])
            found_evidence = resolve_evidence(sents)
            instance["tokenized_claim"] = " ".join(
                map(lambda x: x.text,
                    tokenizer.split_words(instance["claim"])))
            sub_instances = make_instances(instance, found_evidence)
            sub_predictions = predict_sub_instances(text_encoder,
                                                    sub_instances)
            refute_evidence = [
                i for i, x in enumerate(sub_predictions) if x == 2
            ]
            support_evidence = [
                i for i, x in enumerate(sub_predictions) if x == 0
            ]
            if len(support_evidence):
                predicted_label = "SUPPORTS"
                predicted_evidence = [[
                    found_evidence[i]["title"],
                    found_evidence[i]["line_number"]
                ] for i in support_evidence]
            elif len(refute_evidence):
                predicted_label = "REFUTES"
                predicted_evidence = [[
                    found_evidence[i]["title"],
                    found_evidence[i]["line_number"]
                ] for i in refute_evidence]
            else:
                predicted_label = "NOT ENOUGH INFO"
                predicted_evidence = []
            predictions.append({
                "predicted_label": predicted_label,
                "predicted_evidence": predicted_evidence
            })
        return predictions

    return caller(predict)
    def __init__(self, file):
        self.pages = []
        self.file = file
        self.logger = LogHelper.get_logger(__name__)
        self.logger.info("Indexing Pages")
LogHelper.setup()
logger = LogHelper.get_logger(__name__)


class TopNDocsTopNSents(RetrievalMethod):
    class RankArgs:
        def __init__(self):
            self.ngram = 2
            self.hash_size = int(math.pow(2, 24))
            self.tokenizer = "simple"
            self.num_workers = None

    def __init__(self, db, n_docs, n_sents, model):
        super().__init__(db)
        self.n_docs = n_docs
        self.n_sents = n_sents
        self.ranker = retriever.get_class('tfidf')(tfidf_path=model)
        self.onlineranker_args = self.RankArgs()

    def get_docs_for_claim(self, claim_text):
        doc_names, doc_scores = self.ranker.closest_docs(
            claim_text, self.n_docs)
        return zip(doc_names, doc_scores)

    def tf_idf_sim(self, claim, lines, freqs=None):
        tfidf = OnlineTfidfDocRanker(self.onlineranker_args,
                                     [line["sentence"] for line in lines],
                                     freqs)
        line_ids, scores = tfidf.closest_docs(claim, self.n_sents)
        ret_lines = []
        for idx, line in enumerate(line_ids):
            ret_lines.append(lines[line])
            ret_lines[-1]["score"] = scores[idx]
        return ret_lines

    def get_sentences_given_claim(self, page, logger, line_no):
        lines = self.db.get_doc_lines(page)
        lines = [
            line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
            for line in lines.split("\n")
        ]
        sent = lines[line_no]
        return sent

    def get_sentences_for_claim(self, claim_text, include_text=False):
        # given a claim, get a bunch of documents that might be relevant for it
        pages = self.get_docs_for_claim(claim_text)
        sorted_p = list(sorted(pages, reverse=True, key=lambda elem: elem[1]))
        pages = [p[0] for p in sorted_p[:self.n_docs]]
        p_lines = []
        for page in pages:
            # query the db and get the list of sentences in a given wikipedia page
            lines = self.db.get_doc_lines(page)
            lines = [
                line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
                for line in lines.split("\n")
            ]
            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))
        lines = []
        for p_line in p_lines:
            lines.append({
                "sentence": p_line[0],
                "page": p_line[1],
                "line_on_page": p_line[2]
            })
        scores = self.tf_idf_sim(claim_text, lines)
        if include_text:
            return scores
        return [(s["page"], s["line_on_page"]) for s in scores]
def read_data_set_from_lines(lines: List,
                             db: Union[str, FeverDocDB],
                             predicted: bool = True,
                             num_sentences=None,
                             is_snopes=False):
    logger = LogHelper.get_logger("read_data_set_from_jsonl")
    if not is_snopes:
        if type(db) is str:
            db = FeverDocDB(db)
    else:
        with open(db) as f:
            db = json.load(f)
    claims = []
    evidences = []
    paths = []
    labels = []
    for line in tqdm(lines):
        json_obj = line
        if predicted:
            evidences_texts = []
            if 'predicted_evidence' in json_obj:
                _evidences = json_obj['predicted_evidence']
            elif 'predicted_sentences' in json_obj:
                _evidences = json_obj['predicted_sentences']
            else:
                _evidences = []
            if len(_evidences) > 0:
                for sent in _evidences:
                    page, line_num = sent[-2], sent[-1]
                    page_title = page.replace("_", " ")
                    evidences_texts.append(
                        # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                        clean_text(
                            evidence_num_to_text(db, page, line_num,
                                                 is_snopes)))
        else:
            evidences_texts = set()
            _evidences = json_obj['evidence']
            for evidence in _evidences:
                for sent in evidence:
                    page, line_num = sent[-2], sent[-1]
                    page_title = page.replace("_", " ")
                    evidences_texts.add(
                        # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                        clean_text(
                            evidence_num_to_text(db, page, line_num,
                                                 is_snopes)))
            evidences_texts = list(evidences_texts)
        if len(evidences_texts) == 0:
            evidences_texts = [""]
        if num_sentences is not None:
            if len(evidences_texts) > num_sentences:
                evidences_texts = evidences_texts[:num_sentences]
        claims.append(clean_text(json_obj['claim']))
        if 'label' in json_obj:
            labels.append(label_dict.index(json_obj['label']))
        evidences.append(evidences_texts)
        if 'paths' in json_obj:
            paths_from_sent_to_claim = [
                1.0 if p else 0.0 for p in json_obj['paths']
            ]
            if num_sentences is not None and num_sentences > len(
                    paths_from_sent_to_claim):
                paths_from_sent_to_claim += [0.0] * (
                    num_sentences - len(paths_from_sent_to_claim))
            paths.append(paths_from_sent_to_claim)
    datas = {'h': claims, 'b': evidences}
    if paths:
        datas['paths'] = paths
    return datas, labels
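# Illustrative return value (added example; the claim and evidence strings
# are hypothetical). For one claim with two predicted evidence sentences:
#   datas  == {'h': ["some claim text"],
#              'b': [["evidence sentence 1", "evidence sentence 2"]]}
#   labels == [label_dict.index("SUPPORTS")]
# If the input objects carry 'paths', datas also holds a 0/1-padded
# 'paths' list per claim, padded to num_sentences.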
#!/usr/bin/env python3
# Adapted from https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/build_db.py
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""A script to build the tf-idf document matrices for retrieval."""
import os

from drqascripts.retriever.build_tfidf import *

from common.util.log_helper import LogHelper

if __name__ == '__main__':
    LogHelper.setup()
    logger = LogHelper.get_logger("DrQA Build TFIDF")
    LogHelper.get_logger("DRQA")
    logger.info("Build TF-IDF matrix")
    parser = argparse.ArgumentParser()
    parser.add_argument('db_path',
                        type=str,
                        default=None,
                        help='Path to sqlite db holding document texts')
    parser.add_argument('out_dir',
                        type=str,
                        default=None,
                        help='Directory for saving output files')
    parser.add_argument('--ngram',
                        type=int,
import argparse
from functools import partial

from utils.data_reader import read_data_set_from_jsonl
from common.util.log_helper import LogHelper
from deep_models.USE_Attention_finetune import USEAttention, weight_matrix_2


def max_len_body(x):
    lengths = [len(sents) for sents in x['b']]
    print(max(lengths), min(lengths), sum(lengths) / len(lengths))
    return max(lengths)


if __name__ == '__main__':
    LogHelper.setup()
    logger = LogHelper.get_logger('train')
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', help='/path/to/training/set', required=True)
    parser.add_argument('--valid', help='/path/to/validation/set',
                        required=True)
    parser.add_argument('--save', help='/path/to/save/model')
    parser.add_argument('--db', help='/path/to/data/base', required=True)
    parser.add_argument('--max-sent',
                        type=int,
                        help='Maximal number of sentences per claim',
                        default=5)
    args = parser.parse_args()
    num_sentences = args.max_sent
    X_train, Y_labels_train = read_data_set_from_jsonl(
        args.train, args.db, num_sentences=num_sentences)
    X_valid, Y_labels_valid = read_data_set_from_jsonl(
        args.valid, args.db, num_sentences=num_sentences)
    parser.add_argument('--save-data',
                        help='/path/to/save/data',
                        default="data/rte/train/")
    parser.add_argument('--load-data', help='/path/to/load/data/file')
    parser.add_argument('--db', help='/path/to/data/base', required=True)
    parser.add_argument('--max-sent',
                        type=int,
                        help='Maximal number of sentences per claim',
                        default=5)
    parser.add_argument('--embed', help='/path/to/embedding')
    parser.add_argument('--save-result',
                        help='/path/to/save/result',
                        default="data/rte/result/")
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger(args.mode)
    if args.mode == 'train':
        assert args.train is not None or args.load_data is not None, \
            "--train training set or --load-data should be provided in train mode"
        assert args.embed is not None, "--embed should be provided in train mode"
        # training mode
        if args.load_data:
            # load pre-processed training data
            with open(args.load_data, "rb") as file:
                param = pickle.load(file)
        else:
            # process training JSONL file
            paths = [args.train, args.valid]
            dataset_list, vocab, embeddings, b_max_sent_num, b_max_sent_size = embed_data_sets_with_glove(
                paths, args.db, args.embed, threshold_b_sent_num=args.max_sent)
            vocab = vocab_map(vocab)
            param = {