def fasttext_padding_for_multi_sentences_set(fasttext_embeddings, max_bodies_size=None, max_bodies_sent_size=None):
    logger = LogHelper.get_logger("fasttext_padding_for_multi_sentences_set")
    b_sizes = np.asarray([len(sents) for sents in fasttext_embeddings])
    bodies_sent_sizes_ = [[len(sent) for sent in sents] for sents in fasttext_embeddings]
    if max_bodies_size is None:
        max_bodies_size = max(b_sizes)
    if max_bodies_sent_size is None:
        max_bodies_sent_size = max(map(max, bodies_sent_sizes_))

    def padded_text_ids(_list, num_doc, max_num_sent, max_num_words):
        doc_np = np.zeros([num_doc, max_num_sent, max_num_words, dim_fasttext], dtype=np.float32)
        for i, doc in enumerate(_list):
            for j, sent in enumerate(doc):
                if j >= max_num_sent:
                    break
                for k, word in enumerate(sent):
                    if k >= max_num_words:
                        break
                    doc_np[i, j, k] = word
        return doc_np

    ft_np = padded_text_ids(fasttext_embeddings, len(fasttext_embeddings), max_bodies_size, max_bodies_sent_size)
    return ft_np

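# Hedged usage sketch (added for illustration, not part of the original module): exercises
# fasttext_padding_for_multi_sentences_set above on a toy nested list of word vectors.
# Assumes `np` and the module-level constant `dim_fasttext` are available, as in the function itself.
def _demo_fasttext_padding_for_multi_sentences_set():
    # two documents: the first has sentences of 3 and 5 words, the second a single 2-word sentence
    toy_bodies = [
        [[np.ones(dim_fasttext, np.float32)] * 3, [np.ones(dim_fasttext, np.float32)] * 5],
        [[np.ones(dim_fasttext, np.float32)] * 2],
    ]
    padded = fasttext_padding_for_multi_sentences_set(toy_bodies)
    # padded has shape (num_docs, max_sentences, max_words, dim_fasttext) == (2, 2, 5, dim_fasttext)
    return padded.shape
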
def __init__(self, name=None, ckpt_path=None, trainable=False, lstm_layers=2, num_neurons=[128, 128, 32],
             pos_weight=None, optimizer='adam', learning_rate=0.001, batch_size=128, activation='relu',
             initializer='he', num_epoch=100, dropout_rate=None, max_check_without_progress=10, show_progress=1,
             tensorboard_logdir=None, random_state=None, vocab_size=None, n_outputs=3, device=None, embedding=None):
    self.ckpt_path = ckpt_path
    self.trainable = trainable
    self.lstm_layers = lstm_layers
    self.num_neurons = num_neurons
    self.optimizer = optimizer
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.activation = activation
    self.num_epoch = num_epoch
    self.initializer = initializer
    self.dropout_rate = dropout_rate
    self.max_check_without_progress = max_check_without_progress
    self.show_progress = show_progress
    self.random_state = random_state
    self.tensorboard_logdir = tensorboard_logdir
    self.vocab_size = vocab_size
    self.n_outputs = n_outputs
    self.pos_weight = pos_weight
    self.name = name
    self.device = device
    self.embedding = embedding
    self._graph = None
    self._classes = None
    self._session = None
    self._initializer = None
    self._optimizer = None
    self._activation = None
    self.logger = LogHelper.get_logger(self.name)

def __init__(self, num_neurons=[256, 32], optimizer='adam', learning_rate=0.0001, batch_size=128,
             activation='relu', initializer='he', num_epoch=100, batch_norm_momentum=None, dropout_rate=None,
             n_outputs=3, max_check_without_progress=10, show_progress=1, ckpt_path=None, tensorboard_logdir=None,
             random_state=None, l2_lambda=0, max_sentences=5, attention=cosine_similarity, pos_weight=None,
             embedding_size=dim_bert, max_gpu_memory=0.5):
    self.num_neurons = num_neurons
    self.optimizer = optimizer
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.activation = activation
    self.num_epoch = num_epoch
    self.initializer = initializer
    self.batch_norm_momentum = batch_norm_momentum
    self.dropout_rate = dropout_rate
    self.max_checks_without_progress = max_check_without_progress
    self.show_progress = show_progress
    self.random_state = random_state
    self.tensorboard_logdir = tensorboard_logdir
    self.l2_lambda = l2_lambda
    self.max_sentences = max_sentences
    self.attention = attention
    self.n_outputs = n_outputs
    self.pos_weight = pos_weight
    self.embedding_size = embedding_size
    self.ckpt_path = ckpt_path
    self.max_gpu_memory = max_gpu_memory
    self._session = None
    self._activation = None
    self._initializer = None
    self._optimizer = None
    self._graph = None
    self.logger = LogHelper.get_logger(self.__class__.__name__)

def __init__(self, h_max_length=20, b_max_length=200, trainable=False, lstm_layers=1, mlp_layers=0,
             num_neurons=[128, 128, 32], share_parameters=True, average_pooling=False,
             optimizer=tf.train.AdamOptimizer, learning_rate=0.001, batch_size=128, activation=tf.nn.relu,
             initializer=he_init, num_epoch=10, batch_norm_momentum=None, dropout_rate=None,
             max_check_without_progress=20, show_progress=10, tensorboard_logdir=None, random_state=None,
             embedding=None, l2_lambda=0.001, vocab_size=None):
    self.logger = LogHelper.get_logger(self.__class__.__name__)
    self.h_max_length = h_max_length
    self.b_max_length = b_max_length
    self.trainable = trainable
    self.lstm_layers = lstm_layers
    self.mlp_layers = mlp_layers
    self.num_neurons = num_neurons
    self.share_parameters = share_parameters
    self.average_pooling = average_pooling
    self.optimizer = optimizer
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.activation = activation
    self.num_epoch = num_epoch
    self.initializer = initializer
    self.batch_norm_momentum = batch_norm_momentum
    self.dropout_rate = dropout_rate
    self.max_checks_without_progress = max_check_without_progress
    self.show_progress = show_progress
    self.randome_state = random_state
    self.tensorboard_logdir = tensorboard_logdir
    self.embedding = embedding
    self.embed_size = len(embedding[0]) if embedding is not None else 0
    self.l2_lambda = l2_lambda
    self.logger.debug(vocab_size)
    self.vocab_size = vocab_size
    # self.share_embeddings = share_embeddings
    # assert self.lstm_layers + self.mlp_layers == len(self.num_neurons)
    # if self.embedding is None and self.vocab_size is None:
    #     raise Exception("Either embedding or vocab_size must be set!")
    self._session = None

def __init__(self, name, path, blocks, preprocessing=None):
    self.logger = LogHelper.get_logger(Corpus.__name__)
    self.name = name
    self.path = path
    self.blocks = blocks
    self.active_block_iter = None
    self.active_block = None
    self.active_block_number = None
    self.preprocessing = preprocessing

def main(args=NullArgs()):
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0])
    args.mode = Mode.PREDICT
    if args.config is not None:
        Config.load_config(args.config)
    if args.out_file is not None:
        Config.relative_path_submission = args.out_file
    if args.in_file is not None:
        Config.relative_path_test_file = args.in_file
    if args.database is not None:
        Config.relative_path_db = args.database
        print("relative_path_db " + Config.relative_path_db)
        print("raw_test_set " + Config.raw_test_set())
    if os.path.exists(Config.test_doc_file):
        os.remove(Config.test_doc_file)
    if os.path.exists(Config.test_set_file):
        os.remove(Config.test_set_file)
    if args.mode in {Mode.PIPELINE, Mode.PREDICT, Mode.PREDICT_ALL_DATASETS}:
        logger.info(
            "=========================== Sub-task 1. Document Retrieval ==========================================")
        document_retrieval(logger, args.mode)
    if args.mode in {
            Mode.PIPELINE_NO_DOC_RETR, Mode.PIPELINE, Mode.PREDICT, Mode.PREDICT_NO_DOC_RETR,
            Mode.PREDICT_ALL_DATASETS, Mode.PREDICT_NO_DOC_RETR_ALL_DATASETS
    }:
        logger.info(
            "=========================== Sub-task 2. Sentence Retrieval ==========================================")
        sentence_retrieval_ensemble(logger, args.mode)
        logger.info(
            "=========================== Sub-task 3. Claim Validation ============================================")
        rte(logger, args, args.mode)

def __init__(self, name, patience=8):
    self.patience = patience
    self.best_model = None
    self.best_score = None
    self.best_epoch = 0
    self.epoch = 0
    self.name = name
    self.logger = LogHelper.get_logger(EarlyStopping.__name__)

def __init__(self, model_name, features=list(), label_name="label", base_path="features"):
    self.feature_functions = features
    self.vocabs = dict()
    self.label_name = label_name
    self.base_path = base_path
    self.logger = LogHelper.get_logger(Features.__name__)
    self.mname = model_name

def __init__(self, doc_db, lim_unigram=5000, naming=None, gold=True):
    super().__init__()
    self.doc_db = doc_db
    self.lim_unigram = lim_unigram
    self.naming = naming
    self.logger = LogHelper.get_logger(self.get_name())
    self.logger.info("Term Frequency Feature Function with top {0} unigrams".format(lim_unigram))
    if gold:
        self.ename = "evidence"
    else:
        self.ename = "predicted"

def __init__(self, h_max_length=50, b_max_length=50, trainable=False, lstm_layers=1, num_neurons=[128, 128, 32],
             optimizer='adam', learning_rate=0.001, batch_size=128, activation='relu', initializer='he',
             num_epoch=100, dropout_rate=None, max_check_without_progress=10, show_progress=1,
             tensorboard_logdir=None, random_state=None, l2_lambda=0.01, n_outputs=3, pos_weight=None, n_sents=5,
             ckpt_path=None, max_gpu_memory=0.5):
    self.n_outputs = n_outputs
    self.pos_weight = pos_weight
    self.n_sents = n_sents
    self.ckpt_path = ckpt_path
    self.activation = activation
    self.initializer = initializer
    self.optimizer = optimizer
    self.lstm_layers = lstm_layers
    self.num_neurons = num_neurons
    self.max_check_without_progress = max_check_without_progress
    self.show_progress = show_progress
    self.random_state = random_state
    self.tensorboard_logdir = tensorboard_logdir
    self.num_epoch = num_epoch
    self.l2_lambda = l2_lambda
    self.h_max_length = h_max_length
    self.b_max_length = b_max_length
    self.trainable = trainable
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.dropout_rate = dropout_rate
    self.max_gpu_memory = max_gpu_memory
    self.elmo = None
    self._graph = None
    self._classes = None
    self._session = None
    self._initializer = None
    self._optimizer = None
    self._activation = None
    self.logger = LogHelper.get_logger(self.__class__.__name__)

def loadGloVe(filename, heads, bodies):
    logger = LogHelper.get_logger("text_processing")
    is_gz = os.path.splitext(filename)[1] == '.gz'

    # Getting embedding dimension
    def _get_dim(_file):
        line = _file.readline()
        return len(line.strip().split(' ')) - 1

    if is_gz:
        with gzip.open(filename, 'rt') as file0:
            emb_dim = _get_dim(file0)
    else:
        with open(filename, 'r', encoding='utf-8') as file0:
            emb_dim = _get_dim(file0)
    dataset_token_set = get_token_set(heads, bodies)
    logger.info("Finished tokenization")
    # First row of embedding matrix is 0 for zero padding
    vocab = ['[PAD]']
    embd = [[0.0] * emb_dim]
    vocab.append('UNK')
    embd.append([1.0] * emb_dim)

    def _read_glove_file(_vocab, _embd, _token_set, _file):
        for line in _file:
            items = line.replace('\r', '').replace('\n', '').split(' ')
            if len(items) < 10:
                logger.debug("exceptional line: {}".format(items))
                continue
            word = items[0]
            if word in _token_set:
                _vocab.append(word)
                vect = [float(i) for i in items[1:]]
                _embd.append(vect)
        return _vocab, _embd

    # Reading embedding matrix
    if is_gz:
        with gzip.open(filename, 'rt') as file:
            vocab, embd = _read_glove_file(vocab, embd, dataset_token_set, file)
    else:
        with open(filename, 'r', encoding='utf-8') as file:
            vocab, embd = _read_glove_file(vocab, embd, dataset_token_set, file)
    logger.info('Loaded GloVe!')
    return vocab, embd

def single_sentence_set_2_ids_given_vocab(texts, vocab_dict):
    logger = LogHelper.get_logger("single_sentence_set_2_ids_given_vocab")
    doc_ids = []
    out_of_vocab_counts = 0
    for sent in texts:
        tokens = tokenize(sent)
        word_ids = []
        for token in tokens:
            if token.lower() in vocab_dict:
                word_ids.append(vocab_dict[token.lower()])
            else:
                out_of_vocab_counts += 1
                word_ids.append(vocab_dict['UNK'])
        doc_ids.append(word_ids)
    logger.debug("{} times out of vocab".format(str(out_of_vocab_counts)))
    return doc_ids

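# Hedged usage sketch (illustrative only): maps two claims to word-id lists with
# single_sentence_set_2_ids_given_vocab above. The tiny vocab_dict here is made up; a real one
# comes from vocab_map(load_whole_glove(...)) and must contain the 'UNK' entry.
def _demo_single_sentence_set_2_ids():
    toy_vocab = {'[PAD]': 0, 'UNK': 1, 'the': 2, 'sky': 3, 'is': 4, 'blue': 5}
    claims = ["The sky is blue", "The sky is falling"]
    ids = single_sentence_set_2_ids_given_vocab(claims, toy_vocab)
    # with a typical word tokenizer this yields [[2, 3, 4, 5], [2, 3, 4, 1]];
    # 'falling' is out of vocabulary, so it maps to the 'UNK' id
    return ids
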
def single_sentence_set_2_fasttext_embedded(sents: List[str], fasttext_model: Union[str, FastText]):
    logger = LogHelper.get_logger("single_sentence_set_2_fasttext_embedded")
    if type(fasttext_model) == str:
        fasttext_model = FastText.load_fasttext_format(fasttext_model)
    fasttext_embeddings = []
    for sent in sents:
        tokens = tokenize(sent)
        sent_embeddings = []
        for token in tokens:
            try:
                sent_embeddings.append(fasttext_model[token.lower()])
            except KeyError:
                sent_embeddings.append(np.ones([dim_fasttext], np.float32))
        fasttext_embeddings.append(sent_embeddings)
    return fasttext_embeddings, fasttext_model

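# Hedged usage sketch (illustrative only): embeds a couple of sentences with
# single_sentence_set_2_fasttext_embedded above. The .bin path is a placeholder for a fastText
# model in Facebook's binary format; the loaded model is returned so it can be reused for the
# evidence side without reloading.
def _demo_fasttext_embedding(model_path="wiki.en.bin"):
    sents = ["The sky is blue", "Water is wet"]
    embeddings, model = single_sentence_set_2_fasttext_embedded(sents, model_path)
    # embeddings[i][j] is the dim_fasttext vector of the j-th token of sentence i
    return embeddings, model
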
def __init__(self, h_max_length=20, b_max_length=200, trainable=False, lstm_layers=2, mlp_layers=1,
             num_neurons=[128, 128, 32], share_parameters=True, average_pooling=False,
             optimizer=tf.train.AdamOptimizer, learning_rate=0.001, batch_size=128, activation=tf.nn.relu,
             initializer=he_init, num_epoch=100, batch_norm_momentum=None, dropout_rate=None,
             max_check_without_progress=10, show_progress=10, tensorboard_logdir=None, random_state=None,
             embedding=None, l2_lambda=0.01, vocab_size=None, n_outputs=3, pos_weight=None):
    LSTM.__init__(self, h_max_length, b_max_length, trainable, lstm_layers, mlp_layers, num_neurons,
                  share_parameters, average_pooling, optimizer, learning_rate, batch_size, activation,
                  initializer, num_epoch, batch_norm_momentum, dropout_rate, max_check_without_progress,
                  show_progress, tensorboard_logdir, random_state, embedding, l2_lambda, vocab_size)
    self.mlp_layers = len(num_neurons) - 2
    self.vocab_size = vocab_size
    self.embedding_size = 300 + dim_fasttext
    self.n_outputs = n_outputs
    self.pos_weight = pos_weight
    self._graph = None
    self._classes = None
    self._session = None
    self.logger = LogHelper.get_logger(self.__class__.__name__)
    if self.embedding is None and self.vocab_size is None:
        raise Exception("Either embedding or vocab_size must be set!")

def __init__(self, optimizer=tf.train.AdamOptimizer, h_max_length=40, s_max_length=40, learning_rate=0.0001,
             num_sents=10, batch_size=128, activation=tf.nn.relu, initializer=he_init, num_epoch=100,
             dropout_rate=None, embedding=None, word_dict=None, max_check_without_progress=5, show_progress=1,
             tensorboard_logdir=None, random_state=None, l2_lambda=0, trainable=False, n_outputs=3):
    # self.mlp_layers = mlp_layers
    # self.num_neurons = num_neurons
    self.optimizer = optimizer
    self.h_max_length = h_max_length
    self.s_max_length = s_max_length
    self.num_sents = num_sents
    self.learning_rate = learning_rate
    self.embedding = embedding
    self.word_dict = word_dict
    self.batch_size = batch_size
    self.activation = activation
    self.num_epoch = num_epoch
    self.initializer = initializer
    self.dropout_rate = dropout_rate
    self.max_checks_without_progress = max_check_without_progress
    self.show_progress = show_progress
    self.randome_state = random_state
    self.tensorboard_logdir = tensorboard_logdir
    self.l2_lambda = l2_lambda
    self.trainable = trainable
    self.n_outputs = n_outputs
    self._session = None
    self.logger = LogHelper.get_logger(self.__class__.__name__)

def eval_da(dataset_to_work_on, args, operation, mithun_logger):
    LogHelper.setup()
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)
    params = Params.from_file(args.param_path, args.overrides)
    uofa_params = params.pop('uofa_params', {})
    path_to_saved_db = uofa_params.pop("path_to_saved_db")
    db = FeverDocDB(path_to_saved_db)
    # Pop the shared configuration entries before logging them, so they are defined for both datasets.
    fever_dataset_details = uofa_params.pop('fever_dataset_details', {})
    dev_partition_details = fever_dataset_details.pop('dev_partition_details', {})
    name_of_trained_model_to_use = dev_partition_details.pop('name_of_trained_model_to_use', {})
    path_to_pyproc_annotated_data_folder = dev_partition_details.pop('path_to_pyproc_annotated_data_folder', {})
    debug_mode = uofa_params.pop('debug_mode', {})
    path_to_trained_models_folder = uofa_params.pop('path_to_trained_models_folder', {})
    mithun_logger.info("inside main function going to call eval on " + str(dataset_to_work_on))
    mithun_logger.info("path_to_pyproc_annotated_data_folder " + str(path_to_pyproc_annotated_data_folder))
    mithun_logger.info("value of name_of_trained_model_to_use: " + str(name_of_trained_model_to_use))
    mithun_logger.info("value of dataset_to_work_on: " + str(dataset_to_work_on))
    if dataset_to_work_on == "fnc":
        # Reuse the annotated-data path popped above as the FNC data path.
        path_to_fnc_annotated_data = path_to_pyproc_annotated_data_folder
        # cuda_device is assumed to be provided by the surrounding module (e.g. read from args).
        eval_model_fnc_data(db, args, mithun_logger, name_of_trained_model_to_use,
                            path_to_trained_models_folder, cuda_device, operation,
                            path_to_fnc_annotated_data)
    elif dataset_to_work_on == "fever":
        eval_model(db, args, mithun_logger, path_to_trained_models_folder, name_of_trained_model_to_use)

def load_whole_glove(glove_file):
    logger = LogHelper.get_logger("load_whole_glove")
    is_gz = os.path.splitext(glove_file)[1] == '.gz'

    # Getting embedding dimension
    def _get_dim(_file):
        line = _file.readline()
        return len(line.strip().split(' ')) - 1

    if is_gz:
        with gzip.open(glove_file, 'rt') as file0:
            emb_dim = _get_dim(file0)
    else:
        with open(glove_file, 'r', encoding='utf-8') as file0:
            emb_dim = _get_dim(file0)
    # First row of embedding matrix is 0 for zero padding
    vocab = ['[PAD]']
    embed = [[0.0] * emb_dim]
    vocab.append('UNK')
    embed.append([1.0] * emb_dim)

    def _read_glove_file(_vocab, _embed, _file):
        for line in _file:
            items = line.replace('\r', '').replace('\n', '').split(' ')
            if len(items) < 10:
                logger.debug("exceptional line: {}".format(line))
                continue
            word = items[0]
            _vocab.append(word)
            vec = [float(i) for i in items[1:]]
            _embed.append(vec)
        return _vocab, _embed

    # Reading embedding matrix
    if is_gz:
        with gzip.open(glove_file, 'rt') as file:
            vocab, embed = _read_glove_file(vocab, embed, file)
    else:
        with open(glove_file, 'r', encoding='utf-8') as file:
            vocab, embed = _read_glove_file(vocab, embed, file)
    logger.info('Loaded GloVe!')
    return vocab, embed

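# Hedged usage sketch (illustrative only): loads GloVe vectors with load_whole_glove above and
# builds a word-to-index lookup. The path is a placeholder, and vocab_map is the companion helper
# used elsewhere in this codebase (assumed available here).
def _demo_load_whole_glove(glove_path="glove.6B.300d.txt.gz"):
    vocab, embed = load_whole_glove(glove_path)
    vocab_dict = vocab_map(vocab)              # word -> row index into the embedding matrix
    embeddings = np.asarray(embed, np.float32)
    # row 0 is the all-zero [PAD] vector, row 1 the all-one UNK vector
    return vocab_dict, embeddings
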
def __init__(self, optimizer='adam', h_max_length=50, s_max_length=50, learning_rate=0.0001, batch_size=128,
             activation='relu', initializer='he', num_epoch=100, dropout_rate=None, max_check_without_progress=10,
             model_store_dir=None, random_state=None, trainable=False, share_rnn=False, vocab_size=None,
             embedding_size=300, filter_width=3, num_filters=300, s_max_num_sents=5, sent_encoding_size=None,
             n_outputs=3, pos_weight=None, name=None, show_progress=1, ckpt_path=None):
    self.optimizer = optimizer
    self.h_max_length = h_max_length
    self.s_max_length = s_max_length
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.activation = activation
    self.num_epoch = num_epoch
    self.initializer = initializer
    self.dropout_rate = dropout_rate
    self.max_checks_without_progress = max_check_without_progress
    self.random_state = random_state
    self.model_store_dir = model_store_dir
    self.trainable = trainable
    self.share_rnn = share_rnn
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.filter_width = filter_width
    self.num_filters = num_filters
    self.s_max_num_sents = s_max_num_sents
    self.sent_encoding_size = sent_encoding_size if sent_encoding_size is not None else embedding_size
    self.n_outputs = n_outputs
    self.pos_weight = pos_weight
    self.name = name
    self.show_progress = show_progress
    self.max_check_without_progress = max_check_without_progress
    self.ckpt_path = ckpt_path
    self._session = None
    self.embedding = None
    self._graph = None
    self._classes = None
    self._initializer = None
    self._optimizer = None
    self._activation = None
    self.logger = LogHelper.get_logger(self.__class__.__name__)

def fasttext_padding_for_single_sentence_set_given_size(fasttext_embeddings, max_sent_size=None):
    logger = LogHelper.get_logger("fasttext_padding_for_single_sentence_set_given_size")
    sent_sizes_ = np.asarray([len(sent) for sent in fasttext_embeddings])
    if max_sent_size is None:
        max_sent_size = sent_sizes_.max()

    def padded_text_ids(_list, num_doc, max_num_words):
        doc_np = np.zeros([num_doc, max_num_words, dim_fasttext], dtype=np.float32)
        for i, doc in enumerate(_list):
            for k, word in enumerate(doc):
                if k >= max_num_words:
                    break
                doc_np[i, k] = word
        return doc_np

    ft_np = padded_text_ids(fasttext_embeddings, len(fasttext_embeddings), max_sent_size)
    return ft_np

def embed_data_set_with_glove_2(data_set_path: str,
                                db: Union[str, FeverDocDB],
                                glove_path: str = None,
                                vocab_dict: Dict[str, int] = None,
                                glove_embeddings=None,
                                predicted: bool = True,
                                threshold_b_sent_num=None,
                                threshold_b_sent_size=50,
                                threshold_h_sent_size=50):
    if vocab_dict is None or glove_embeddings is None:
        vocab, glove_embeddings = load_whole_glove(glove_path)
        vocab_dict = vocab_map(vocab)
    logger = LogHelper.get_logger("embed_data_set_given_vocab")
    datas, labels = read_data_set_from_jsonl(data_set_path, db, predicted)
    heads_ids = single_sentence_set_2_ids_given_vocab(datas['h'], vocab_dict)
    logger.debug("Finished sentence to IDs for claims")
    bodies_ids = multi_sentence_set_2_ids_given_vocab(datas['b'], vocab_dict)
    logger.debug("Finished sentence to IDs for evidences")
    h_np, h_sent_sizes = ids_padding_for_single_sentence_set_given_size(heads_ids, threshold_h_sent_size)
    logger.debug("Finished padding claims")
    b_np, b_sizes, b_sent_sizes = ids_padding_for_multi_sentences_set(bodies_ids, threshold_b_sent_num,
                                                                      threshold_b_sent_size)
    logger.debug("Finished padding evidences")
    processed_data_set = {
        'data': {
            'h_np': h_np,
            'b_np': b_np,
            'h_sent_sizes': h_sent_sizes,
            'b_sent_sizes': b_sent_sizes,
            'b_sizes': b_sizes
        },
        'id': datas['id']
    }
    if 'paths' in datas:
        padded_paths_np = pad_paths(datas['paths'], threshold_b_sent_num)
        processed_data_set['data']['paths'] = padded_paths_np
    if labels is not None and len(labels) == len(processed_data_set['id']):
        processed_data_set['label'] = labels
    return processed_data_set, vocab_dict, glove_embeddings, threshold_b_sent_num, threshold_b_sent_size

def __init__(self, lstm_layers=1, mlp_layers=2, num_neurons=[128, 128, 32], optimizer=tf.train.AdamOptimizer,
             learning_rate=0.0001, batch_size=128, activation=tf.nn.relu, initializer=he_init, num_epoch=100,
             batch_norm_momentum=None, dropout_rate=None, max_check_without_progress=10, show_progress=1,
             tensorboard_logdir=None, random_state=None, l2_lambda=0):
    self.lstm_layers = lstm_layers
    self.mlp_layers = mlp_layers
    self.num_neurons = num_neurons
    self.optimizer = optimizer
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.activation = activation
    self.num_epoch = num_epoch
    self.initializer = initializer
    self.batch_norm_momentum = batch_norm_momentum
    self.dropout_rate = dropout_rate
    self.max_checks_without_progress = max_check_without_progress
    self.show_progress = show_progress
    self.randome_state = random_state
    self.tensorboard_logdir = tensorboard_logdir
    self.l2_lambda = l2_lambda
    self._session = None
    self.logger = LogHelper.get_logger(self.__class__.__name__)

def evidence_num_to_text(db: Union[Dict, FeverDocDB], page_id: str, line: int, is_snopes: bool = False):
    assert isinstance(db, Dict) or not is_snopes, "db should be dictionary for Snopes data"
    assert isinstance(db, FeverDocDB) or is_snopes, "db should be fever DB for fever data"
    logger = LogHelper.get_logger("evidence_num_to_text")
    if is_snopes:
        return evidence_num_to_text_snopes(db, page_id, line)
    lines = db.get_doc_lines(page_id)
    if lines is None:
        return ""
    if line > -1:
        return lines.split("\n")[line].split("\t")[1]
    else:
        non_empty_lines = [
            _line.split("\t")[1] for _line in lines.split("\n")
            if len(_line.split("\t")) > 1 and len(_line.split("\t")[1].strip())
        ]
        return non_empty_lines[SimpleRandom.get_instance().next_rand(0, len(non_empty_lines) - 1)]

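# Hedged usage sketch (illustrative only): resolves a (page, line) evidence pointer to its sentence
# text with evidence_num_to_text above. The database path and page id are placeholders; FeverDocDB
# comes from this codebase.
def _demo_evidence_num_to_text(db_path="data/fever/fever.db"):
    db = FeverDocDB(db_path)
    # third sentence (0-based line 2) of the wiki page "Barack_Obama"
    sentence = evidence_num_to_text(db, "Barack_Obama", 2)
    # passing line=-1 instead returns a randomly chosen non-empty line of the page
    return sentence
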
trainer_params = params.pop("trainer") if cuda_device is not None: trainer_params["cuda_device"] = cuda_device trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) trainer.train() # Now tar up results archive_model(serialization_dir) return model if __name__ == "__main__": LogHelper.setup() LogHelper.get_logger("allennlp.training.trainer") LogHelper.get_logger(__name__) parser = argparse.ArgumentParser() parser.add_argument('db', type=str, help='/path/to/saved/db.db') parser.add_argument( 'param_path', type=str, help='path to parameter file describing the model to be trained') parser.add_argument("logdir", type=str) parser.add_argument("--filtering", type=str, default=None) parser.add_argument("--cuda-device", type=int,
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    logger.info("this script is only for FEVER dataset")
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(training_set['data']['h_np'], 1)
            training_set['data']['scores'] = load_scores(Config.training_set_file, Config.max_sentences)
            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(valid_set['data']['h_np'], 1)
            valid_set['data']['scores'] = load_scores(Config.dev_set_file, Config.max_sentences)
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train), f, protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['scores'] = load_scores(Config.test_set_file, Config.max_sentences)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file, Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator

import os
import random
from collections import defaultdict

import pymysql.cursors

from common.dataset.corpus import Corpus
from common.util.log_helper import LogHelper


def preprocess(p):
    return p.replace(" ", "_").replace("(", "-LRB-").replace(")", "-RRB-").replace(":", "-COLON-").split("#")[0]


lut = dict()
LogHelper.setup()
pages = Corpus("page", "data/fever", 50, lambda x: x)
for page, doc in pages:
    lut[page] = doc

claim_evidence = defaultdict(lambda: [])

# Connect to the database
connection = pymysql.connect(host=os.getenv("DB_HOST", "localhost"),
                             user=os.getenv("DB_USER", "root"),
                             password=os.getenv("DB_PASS", ""),
                             db=os.getenv("DB_SCHEMA", "fever"),
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

    _line['predicted_evidence'] = _line['predicted_evidence'][:args.max_evidence]
    _line['scores'] = _line['scores'][:args.max_evidence]
    return _line


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='/path/to/input/file')
    parser.add_argument('output', help='/path/to/output/file')
    parser.add_argument('--max_evidence', help='max num of evidences', type=int, default=5)
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("replace_noise_dataset")
    random.seed(55)
    jlr = JSONLineReader()
    lines = jlr.read(args.input)
    counter = 0
    with open(args.output, 'w') as f:
        for i, line in tqdm(enumerate(lines)):
            if not line['label'] == 'NOT ENOUGH INFO' and not is_gold_evidence_predicted(line):
                counter += 1
                logger.info("line " + str(i + 1) + " should be filled")
                line = random_fill_gold_evidence(line)
            f.write(json.dumps(line) + '\n')
    logger.info(str(counter) + " samples filled with gold evidence")

# LICENSE file in the root directory of this source tree.
"""A script to read in and store documents in a sqlite database."""

import argparse
import importlib.util
import json
import os
import sqlite3
from multiprocessing import Pool as ProcessPool

from drqa.retriever import utils
from tqdm import tqdm

from common.util.log_helper import LogHelper

LogHelper.setup()
logger = LogHelper.get_logger("DrQA BuildDB")

# ------------------------------------------------------------------------------
# Preprocessing Function.
# ------------------------------------------------------------------------------

PREPROCESS_FN = None


def init(filename):
    global PREPROCESS_FN
    if filename:
        PREPROCESS_FN = import_module(filename).preprocess

def embed_data_set_with_glove_and_fasttext(data_set_path: str,
                                           db: Union[str, FeverDocDB],
                                           fasttext_model: Union[str, FastText],
                                           glove_path: str = None,
                                           vocab_dict: Dict[str, int] = None,
                                           glove_embeddings=None,
                                           predicted: bool = True,
                                           threshold_b_sent_num=None,
                                           threshold_b_sent_size=50,
                                           threshold_h_sent_size=50,
                                           is_snopes=False):
    assert vocab_dict is not None and glove_embeddings is not None or glove_path is not None, \
        "Either vocab_dict and glove_embeddings, or glove_path should be not None"
    if vocab_dict is None or glove_embeddings is None:
        vocab, glove_embeddings = load_whole_glove(glove_path)
        vocab_dict = vocab_map(vocab)
    logger = LogHelper.get_logger("embed_data_set_given_vocab")
    datas, labels = read_data_set_from_jsonl(data_set_path, db, predicted, is_snopes=is_snopes)
    heads_ft_embeddings, fasttext_model = single_sentence_set_2_fasttext_embedded(datas['h'], fasttext_model)
    logger.debug("Finished sentence to FastText embeddings for claims")
    heads_ids = single_sentence_set_2_ids_given_vocab(datas['h'], vocab_dict)
    logger.debug("Finished sentence to IDs for claims")
    bodies_ft_embeddings, fasttext_model = multi_sentence_set_2_fasttext_embedded(datas['b'], fasttext_model)
    logger.debug("Finished sentence to FastText embeddings for evidences")
    bodies_ids = multi_sentence_set_2_ids_given_vocab(datas['b'], vocab_dict)
    logger.debug("Finished sentence to IDs for evidences")
    h_ft_np = fasttext_padding_for_single_sentence_set_given_size(heads_ft_embeddings, threshold_h_sent_size)
    logger.debug("Finished padding FastText embeddings for claims. Shape of h_ft_np: {}".format(str(h_ft_np.shape)))
    b_ft_np = fasttext_padding_for_multi_sentences_set(bodies_ft_embeddings, threshold_b_sent_num,
                                                       threshold_b_sent_size)
    logger.debug(
        "Finished padding FastText embeddings for evidences. Shape of b_ft_np: {}".format(str(b_ft_np.shape)))
    h_np, h_sent_sizes = ids_padding_for_single_sentence_set_given_size(heads_ids, threshold_h_sent_size)
    logger.debug("Finished padding claims")
    b_np, b_sizes, b_sent_sizes = ids_padding_for_multi_sentences_set(bodies_ids, threshold_b_sent_num,
                                                                      threshold_b_sent_size)
    logger.debug("Finished padding evidences")
    processed_data_set = {
        'data': {
            'h_np': h_np,
            'b_np': b_np,
            'h_ft_np': h_ft_np,
            'b_ft_np': b_ft_np,
            'h_sent_sizes': h_sent_sizes,
            'b_sent_sizes': b_sent_sizes,
            'b_sizes': b_sizes
        },
        'id': datas['id']
    }
    if labels is not None and len(labels) == len(processed_data_set['id']):
        processed_data_set['label'] = labels
    return processed_data_set, fasttext_model, vocab_dict, glove_embeddings, threshold_b_sent_num, threshold_b_sent_size

def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid, Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_data_set_from_jsonl(Config.training_set_file,
                                                               Config.db_path,
                                                               num_sentences=Config.max_sentences,
                                                               is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(Config.dev_set_file,
                                                               Config.db_path,
                                                               num_sentences=Config.max_sentences,
                                                               is_snopes=is_snopes)
            b_train = X_train['b']
            X_train['b_sizes'] = get_num_sents_of_bodies(b_train)
            for i, sample in enumerate(b_train):
                if len(sample) < Config.max_sentences:
                    for _ in range(Config.max_sentences - len(sample)):
                        sample.append(" ")
                b_train[i] = np.asarray(sample)
            b_train = np.asarray(b_train)
            X_train['b'] = b_train
            logger.debug("b_train.shape: " + str(b_train.shape))
            b_valid = X_valid['b']
            X_valid['b_sizes'] = get_num_sents_of_bodies(b_valid)
            for i, sample in enumerate(b_valid):
                if len(sample) < Config.max_sentences:
                    for _ in range(Config.max_sentences - len(sample)):
                        sample.append(" ")
                b_valid[i] = np.asarray(sample)
            b_valid = np.asarray(b_valid)
            X_valid['b'] = b_valid
            logger.debug("b_valid.shape: " + str(b_valid.shape))
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_train, Y_labels_train, X_valid, Y_labels_valid),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(Config.test_set_file,
                                                         Config.db_path,
                                                         num_sentences=Config.max_sentences,
                                                         is_snopes=is_snopes)
        b_test = X_test['b']
        X_test['b_sizes'] = get_num_sents_of_bodies(b_test)
        for i, sample in enumerate(b_test):
            if len(sample) < Config.max_sentences:
                for _ in range(Config.max_sentences - len(sample)):
                    sample.append(" ")
            b_test[i] = np.asarray(sample)
        b_test = np.asarray(b_test)
        X_test['b'] = b_test
        logger.debug("b_test.shape: " + str(b_test.shape))
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file, Config.submission_file)
        if Y_labels_test:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator

def read_data_set_from_jsonl(file_path: str, db: Union[str, FeverDocDB], predicted: bool = True, num_sentences=None,
                             is_snopes=False):
    logger = LogHelper.get_logger("read_data_set_from_jsonl")
    if not is_snopes:
        if type(db) is str:
            db = FeverDocDB(db)
    else:
        with open(db) as f:
            db = json.load(f)
    with open(file_path, 'r') as f:
        claims = []
        evidences = []
        paths = []
        labels = []
        ids = []
        for line in tqdm(f):
            json_obj = json.loads(line)
            if predicted:
                evidences_texts = []
                if 'predicted_evidence' in json_obj:
                    _evidences = json_obj['predicted_evidence']
                elif 'predicted_sentences' in json_obj:
                    _evidences = json_obj['predicted_sentences']
                else:
                    _evidences = []
                if len(_evidences) > 0:
                    for sent in _evidences:
                        page, line_num = sent[-2], sent[-1]
                        page_title = page.replace("_", " ")
                        # evidences_texts.append(page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                        evidences_texts.append(clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
            else:
                evidences_texts = set()
                _evidences = json_obj['evidence']
                for evidence in _evidences:
                    for sent in evidence:
                        page, line_num = sent[-2], sent[-1]
                        page_title = page.replace("_", " ")
                        # evidences_texts.add(page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                        evidences_texts.add(clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                evidences_texts = list(evidences_texts)
            if len(evidences_texts) == 0:
                continue
            if num_sentences is not None:
                if len(evidences_texts) > num_sentences:
                    evidences_texts = evidences_texts[:num_sentences]
            claims.append(clean_text(json_obj['claim']))
            if 'label' in json_obj:
                labels.append(label_dict.index(json_obj['label']))
            evidences.append(evidences_texts)
            if 'paths' in json_obj:
                paths_from_sent_to_claim = [1.0 if p else 0.0 for p in json_obj['paths']]
                if num_sentences is not None and num_sentences > len(paths_from_sent_to_claim):
                    paths_from_sent_to_claim += [0.0] * (num_sentences - len(paths_from_sent_to_claim))
                paths.append(paths_from_sent_to_claim)
            ids.append(json_obj['id'])
    datas = {'h': claims, 'b': evidences, 'id': ids}
    if paths:
        datas['paths'] = paths
    return datas, labels

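# Hedged usage sketch (illustrative only): reads a predicted-evidence JSONL split with
# read_data_set_from_jsonl above. The file and database paths are placeholders.
def _demo_read_data_set_from_jsonl(jsonl_path="data/dev.sentences.p5.s5.jsonl",
                                   db_path="data/fever/fever.db"):
    datas, labels = read_data_set_from_jsonl(jsonl_path, db_path, predicted=True, num_sentences=5)
    # datas['h']: claim texts, datas['b']: up to 5 evidence sentences per claim, datas['id']: claim ids
    # labels: indices into label_dict, parallel to datas['id'] when the split is labelled
    return datas, labels
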