def fasttext_padding_for_multi_sentences_set(fasttext_embeddings,
                                             max_bodies_size=None,
                                             max_bodies_sent_size=None):
    logger = LogHelper.get_logger("fasttext_padding_for_multi_sentences_set")
    b_sizes = np.asarray([len(sents) for sents in fasttext_embeddings])
    bodies_sent_sizes_ = [[len(sent) for sent in sents]
                          for sents in fasttext_embeddings]
    if max_bodies_size is None:
        max_bodies_size = max(b_sizes)
    if max_bodies_sent_size is None:
        max_bodies_sent_size = max(map(max, bodies_sent_sizes_))

    def padded_text_ids(_list, num_doc, max_num_sent, max_num_words):
        doc_np = np.zeros([num_doc, max_num_sent, max_num_words, dim_fasttext],
                          dtype=np.float32)
        for i, doc in enumerate(_list):
            for j, sent in enumerate(doc):
                if j >= max_num_sent:
                    break
                for k, word in enumerate(sent):
                    if k >= max_num_words:
                        break
                    doc_np[i, j, k] = word
        return doc_np

    ft_np = padded_text_ids(fasttext_embeddings, len(fasttext_embeddings),
                            max_bodies_size, max_bodies_sent_size)
    return ft_np
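# A minimal usage sketch with hypothetical data: two "bodies" with 2 and 1 sentences,
# each token already mapped to a dim_fasttext-dimensional fastText vector. Shorter
# sentences and bodies are zero-padded up to the maxima found in the batch.
example_bodies = [
    [[np.zeros(dim_fasttext, np.float32)] * 3,   # first body, sentence of 3 tokens
     [np.zeros(dim_fasttext, np.float32)] * 5],  # first body, sentence of 5 tokens
    [[np.zeros(dim_fasttext, np.float32)] * 2],  # second body, one 2-token sentence
]
example_padded = fasttext_padding_for_multi_sentences_set(example_bodies)
# example_padded.shape == (2, 2, 5, dim_fasttext)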
Example #2
    def __init__(self, name=None, ckpt_path=None, trainable=False, lstm_layers=2, num_neurons=[128, 128, 32],
                 pos_weight=None, optimizer='adam', learning_rate=0.001, batch_size=128,
                 activation='relu', initializer='he', num_epoch=100, dropout_rate=None,
                 max_check_without_progress=10, show_progress=1, tensorboard_logdir=None, random_state=None,
                 vocab_size=None, n_outputs=3, device=None, embedding=None):

        self.ckpt_path = ckpt_path
        self.trainable = trainable
        self.lstm_layers = lstm_layers
        self.num_neurons = num_neurons
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.num_epoch = num_epoch
        self.initializer = initializer
        self.dropout_rate = dropout_rate
        self.max_check_without_progress = max_check_without_progress
        self.show_progress = show_progress
        self.random_state = random_state
        self.tensorboard_logdir = tensorboard_logdir
        self.vocab_size = vocab_size
        self.n_outputs = n_outputs
        self.pos_weight = pos_weight
        self.name = name
        self.device = device
        self.embedding = embedding
        self._graph = None
        self._classes = None
        self._session = None
        self._initializer = None
        self._optimizer = None
        self._activation = None
        self.logger = LogHelper.get_logger(self.name)
Example #3
    def __init__(self, num_neurons=[256, 32], optimizer='adam', learning_rate=0.0001,
                 batch_size=128, activation='relu', initializer='he', num_epoch=100, batch_norm_momentum=None,
                 dropout_rate=None, n_outputs=3, max_check_without_progress=10, show_progress=1, ckpt_path=None,
                 tensorboard_logdir=None, random_state=None, l2_lambda=0, max_sentences=5, attention=cosine_similarity,
                 pos_weight=None, embedding_size=dim_bert, max_gpu_memory=0.5):

        self.num_neurons = num_neurons
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.num_epoch = num_epoch
        self.initializer = initializer
        self.batch_norm_momentum = batch_norm_momentum
        self.dropout_rate = dropout_rate
        self.max_checks_without_progress = max_check_without_progress
        self.show_progress = show_progress
        self.random_state = random_state
        self.tensorboard_logdir = tensorboard_logdir
        self.l2_lambda = l2_lambda
        self.max_sentences = max_sentences
        self.attention = attention
        self.n_outputs = n_outputs
        self.pos_weight = pos_weight
        self.embedding_size = embedding_size
        self.ckpt_path = ckpt_path
        self.max_gpu_memory = max_gpu_memory
        self._session = None
        self._activation = None
        self._initializer = None
        self._optimizer = None
        self._graph = None
        self.logger = LogHelper.get_logger(self.__class__.__name__)
Example #4
    def __init__(self,
                 h_max_length=20,
                 b_max_length=200,
                 trainable=False,
                 lstm_layers=1,
                 mlp_layers=0,
                 num_neurons=[128, 128, 32],
                 share_parameters=True,
                 average_pooling=False,
                 optimizer=tf.train.AdamOptimizer,
                 learning_rate=0.001,
                 batch_size=128,
                 activation=tf.nn.relu,
                 initializer=he_init,
                 num_epoch=10,
                 batch_norm_momentum=None,
                 dropout_rate=None,
                 max_check_without_progress=20,
                 show_progress=10,
                 tensorboard_logdir=None,
                 random_state=None,
                 embedding=None,
                 l2_lambda=0.001,
                 vocab_size=None):
        self.logger = LogHelper.get_logger(self.__class__.__name__)
        self.h_max_length = h_max_length
        self.b_max_length = b_max_length
        self.trainable = trainable
        self.lstm_layers = lstm_layers
        self.mlp_layers = mlp_layers
        self.num_neurons = num_neurons
        self.share_parameters = share_parameters
        self.average_pooling = average_pooling
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.num_epoch = num_epoch
        self.initializer = initializer
        self.batch_norm_momentum = batch_norm_momentum
        self.dropout_rate = dropout_rate
        self.max_checks_without_progress = max_check_without_progress
        self.show_progress = show_progress
        self.random_state = random_state
        self.tensorboard_logdir = tensorboard_logdir
        self.embedding = embedding
        self.embed_size = len(embedding[0]) if embedding is not None else 0
        self.l2_lambda = l2_lambda
        self.logger.debug(vocab_size)
        self.vocab_size = vocab_size
        # self.share_embeddings = share_embeddings

        # assert self.lstm_layers + self.mlp_layers == len(self.num_neurons)

        # if self.embedding is None and self.vocab_size is None:
        #     raise Exception("Either embedding or vocab_size must be set!")

        self._session = None
Example #5
    def __init__(self, name, path, blocks, preprocessing=None):
        self.logger = LogHelper.get_logger(Corpus.__name__)
        self.name = name
        self.path = path
        self.blocks = blocks
        self.active_block_iter = None
        self.active_block = None
        self.active_block_number = None
        self.preprocessing = preprocessing
def main(args=NullArgs()):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0])
    args.mode = Mode.PREDICT
    if args.config is not None:
        Config.load_config(args.config)

    if args.out_file is not None:
        Config.relative_path_submission = args.out_file

    if args.in_file is not None:
        Config.relative_path_test_file = args.in_file

    if args.database is not None:
        Config.relative_path_db = args.database

    print("relative_path_db " + Config.relative_path_db)
    print("raw_test_set " + Config.raw_test_set())

    if os.path.exists(Config.test_doc_file):
        os.remove(Config.test_doc_file)
    if os.path.exists(Config.test_set_file):
        os.remove(Config.test_set_file)

    if args.mode in {Mode.PIPELINE, Mode.PREDICT, Mode.PREDICT_ALL_DATASETS}:
        logger.info(
            "=========================== Sub-task 1. Document Retrieval =========================================="
        )
        document_retrieval(logger, args.mode)
    if args.mode in {
            Mode.PIPELINE_NO_DOC_RETR, Mode.PIPELINE, Mode.PREDICT,
            Mode.PREDICT_NO_DOC_RETR, Mode.PREDICT_ALL_DATASETS,
            Mode.PREDICT_NO_DOC_RETR_ALL_DATASETS
    }:
        logger.info(
            "=========================== Sub-task 2. Sentence Retrieval =========================================="
        )
        sentence_retrieval_ensemble(logger, args.mode)
    logger.info(
        "=========================== Sub-task 3. Claim Validation ============================================"
    )
    rte(logger, args, args.mode)
Example #7
    def __init__(self, name, patience=8):
        self.patience = patience
        self.best_model = None
        self.best_score = None

        self.best_epoch = 0
        self.epoch = 0

        self.name = name
        self.logger = LogHelper.get_logger(EarlyStopping.__name__)
Example #8
    def __init__(self,
                 model_name,
                 features=list(),
                 label_name="label",
                 base_path="features"):
        self.feature_functions = features
        self.vocabs = dict()
        self.label_name = label_name
        self.base_path = base_path
        self.logger = LogHelper.get_logger(Features.__name__)
        self.mname = model_name
    def __init__(self, doc_db, lim_unigram=5000, naming=None, gold=True):
        super().__init__()
        self.doc_db = doc_db
        self.lim_unigram = lim_unigram
        self.naming = naming
        self.logger = LogHelper.get_logger(self.get_name())
        self.logger.info("Term Frequency Feature Function with top {0} unigrams".format(lim_unigram))
        if gold:
            self.ename = "evidence"
        else:
            self.ename = "predicted"
Example #10
    def __init__(self,
                 h_max_length=50,
                 b_max_length=50,
                 trainable=False,
                 lstm_layers=1,
                 num_neurons=[128, 128, 32],
                 optimizer='adam',
                 learning_rate=0.001,
                 batch_size=128,
                 activation='relu',
                 initializer='he',
                 num_epoch=100,
                 dropout_rate=None,
                 max_check_without_progress=10,
                 show_progress=1,
                 tensorboard_logdir=None,
                 random_state=None,
                 l2_lambda=0.01,
                 n_outputs=3,
                 pos_weight=None,
                 n_sents=5,
                 ckpt_path=None,
                 max_gpu_memory=0.5):

        self.n_outputs = n_outputs
        self.pos_weight = pos_weight
        self.n_sents = n_sents
        self.ckpt_path = ckpt_path
        self.activation = activation
        self.initializer = initializer
        self.optimizer = optimizer
        self.lstm_layers = lstm_layers
        self.num_neurons = num_neurons
        self.max_check_without_progress = max_check_without_progress
        self.show_progress = show_progress
        self.random_state = random_state
        self.tensorboard_logdir = tensorboard_logdir
        self.num_epoch = num_epoch
        self.l2_lambda = l2_lambda
        self.h_max_length = h_max_length
        self.b_max_length = b_max_length
        self.trainable = trainable
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.dropout_rate = dropout_rate
        self.max_gpu_memory = max_gpu_memory
        self.elmo = None
        self._graph = None
        self._classes = None
        self._session = None
        self._initializer = None
        self._optimizer = None
        self._activation = None
        self.logger = LogHelper.get_logger(self.__class__.__name__)
def loadGloVe(filename, heads, bodies):
    logger = LogHelper.get_logger("text_processing")
    is_gz = os.path.splitext(filename)[1] == '.gz'

    # Getting embedding dimension
    def _get_dim(_file):
        line = _file.readline()
        return len(line.strip().split(' ')) - 1

    if is_gz:
        with gzip.open(filename, 'rt') as file0:
            emb_dim = _get_dim(file0)
    else:
        with open(filename, 'r', encoding='utf-8') as file0:
            emb_dim = _get_dim(file0)

    dataset_token_set = get_token_set(heads, bodies)
    logger.info("Finished tokenization")
    # First row of embedding matrix is 0 for zero padding
    vocab = ['[PAD]']
    embd = [[0.0] * emb_dim]
    vocab.append('UNK')
    embd.append([1.0] * emb_dim)

    def _read_glove_file(_vocab, _embd, _token_set, _file):
        for line in _file:
            items = line.replace('\r', '').replace('\n', '').split(' ')
            if len(items) < 10:
                print(items)
                continue
            word = items[0]
            if word in _token_set:
                _vocab.append(word)
                vect = [float(i) for i in items[1:]]
                _embd.append(vect)
        return _vocab, _embd

    # Reading embedding matrix
    if is_gz:
        with gzip.open(filename, 'rt') as file:
            vocab, embd = _read_glove_file(vocab, embd, dataset_token_set,
                                           file)
    else:
        with open(filename, 'r', encoding='utf-8') as file:
            vocab, embd = _read_glove_file(vocab, embd, dataset_token_set,
                                           file)
    logger.info('Loaded GloVe!')
    return vocab, embd
def single_sentence_set_2_ids_given_vocab(texts, vocab_dict):
    logger = LogHelper.get_logger("single_sentence_set_2_ids_given_vocab")
    doc_ids = []
    out_of_vocab_counts = 0
    for sent in texts:
        tokens = tokenize(sent)
        word_ids = []
        for token in tokens:
            if token.lower() in vocab_dict:
                word_ids.append(vocab_dict[token.lower()])
            else:
                out_of_vocab_counts += 1
                word_ids.append(vocab_dict['UNK'])
        doc_ids.append(word_ids)
    logger.debug("{} times out of vocab".format(str(out_of_vocab_counts)))
    return doc_ids
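# A minimal usage sketch with a toy vocabulary (hypothetical values). It assumes the
# module-level tokenize() yields plain word tokens and that 'UNK' is present, as the
# GloVe loading helpers in this listing guarantee.
toy_vocab = {'[PAD]': 0, 'UNK': 1, 'the': 2, 'cat': 3, 'sat': 4}
toy_ids = single_sentence_set_2_ids_given_vocab(["The cat sat", "The dog sat"],
                                                toy_vocab)
# toy_ids == [[2, 3, 4], [2, 1, 4]]  ("dog" is out of vocabulary and maps to UNK)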
def single_sentence_set_2_fasttext_embedded(sents: List[str],
                                            fasttext_model: Union[str,
                                                                  FastText]):
    logger = LogHelper.get_logger("single_sentence_set_2_fasttext_embedded")
    if type(fasttext_model) == str:
        fasttext_model = FastText.load_fasttext_format(fasttext_model)
    fasttext_embeddings = []
    for sent in sents:
        tokens = tokenize(sent)
        sent_embeddings = []
        for token in tokens:
            try:
                sent_embeddings.append(fasttext_model[token.lower()])
            except KeyError:
                sent_embeddings.append(np.ones([dim_fasttext], np.float32))
        fasttext_embeddings.append(sent_embeddings)
    return fasttext_embeddings, fasttext_model
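# Hypothetical usage: the fastText binary path is an assumption. Passing a path loads
# the model once; the loaded model is returned so later calls can reuse it directly.
claims = ["The cat sat on the mat .", "Another claim ."]
claim_embeddings, ft_model = single_sentence_set_2_fasttext_embedded(
    claims, "models/wiki.en.bin")
# claim_embeddings[0] is a list of per-token vectors of length dim_fasttext; tokens
# missing from the model fall back to an all-ones vector (see the KeyError branch).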
Example #14
    def __init__(self,
                 h_max_length=20,
                 b_max_length=200,
                 trainable=False,
                 lstm_layers=2,
                 mlp_layers=1,
                 num_neurons=[128, 128, 32],
                 share_parameters=True,
                 average_pooling=False,
                 optimizer=tf.train.AdamOptimizer,
                 learning_rate=0.001,
                 batch_size=128,
                 activation=tf.nn.relu,
                 initializer=he_init,
                 num_epoch=100,
                 batch_norm_momentum=None,
                 dropout_rate=None,
                 max_check_without_progress=10,
                 show_progress=10,
                 tensorboard_logdir=None,
                 random_state=None,
                 embedding=None,
                 l2_lambda=0.01,
                 vocab_size=None,
                 n_outputs=3,
                 pos_weight=None):
        LSTM.__init__(self, h_max_length, b_max_length, trainable, lstm_layers,
                      mlp_layers, num_neurons, share_parameters,
                      average_pooling, optimizer, learning_rate, batch_size,
                      activation, initializer, num_epoch, batch_norm_momentum,
                      dropout_rate, max_check_without_progress, show_progress,
                      tensorboard_logdir, random_state, embedding, l2_lambda,
                      vocab_size)

        self.mlp_layers = len(num_neurons) - 2
        self.vocab_size = vocab_size
        self.embedding_size = 300 + dim_fasttext
        self.n_outputs = n_outputs
        self.pos_weight = pos_weight
        self._graph = None
        self._classes = None
        self._session = None
        self.logger = LogHelper.get_logger(self.__class__.__name__)
        if self.embedding is None and self.vocab_size is None:
            raise Exception("Either embedding or vocab_size must be set!")
    def __init__(self,
                 optimizer=tf.train.AdamOptimizer,
                 h_max_length=40,
                 s_max_length=40,
                 learning_rate=0.0001,
                 num_sents=10,
                 batch_size=128,
                 activation=tf.nn.relu,
                 initializer=he_init,
                 num_epoch=100,
                 dropout_rate=None,
                 embedding=None,
                 word_dict=None,
                 max_check_without_progress=5,
                 show_progress=1,
                 tensorboard_logdir=None,
                 random_state=None,
                 l2_lambda=0,
                 trainable=False,
                 n_outputs=3):

        # self.mlp_layers = mlp_layers
        # self.num_neurons = num_neurons
        self.optimizer = optimizer
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.num_sents = num_sents
        self.learning_rate = learning_rate
        self.embedding = embedding
        self.word_dict = word_dict
        self.batch_size = batch_size
        self.activation = activation
        self.num_epoch = num_epoch
        self.initializer = initializer
        self.dropout_rate = dropout_rate
        self.max_checks_without_progress = max_check_without_progress
        self.show_progress = show_progress
        self.random_state = random_state
        self.tensorboard_logdir = tensorboard_logdir
        self.l2_lambda = l2_lambda
        self.trainable = trainable
        self.n_outputs = n_outputs
        self._session = None
        self.logger = LogHelper.get_logger(self.__class__.__name__)
Example #16
def eval_da(dataset_to_work_on, args, operation, mithun_logger):
    LogHelper.setup()
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)

    params = Params.from_file(args.param_path, args.overrides)
    uofa_params = params.pop('uofa_params', {})
    path_to_saved_db = uofa_params.pop("path_to_saved_db")
    db = FeverDocDB(path_to_saved_db)

    mithun_logger.info("inside main function going to call eval on " +
                       str(dataset_to_work_on))
    mithun_logger.info("path_to_pyproc_annotated_data_folder " +
                       str(path_to_pyproc_annotated_data_folder))
    mithun_logger.info("value of name_of_trained_model_to_use: " +
                       str(name_of_trained_model_to_use))
    mithun_logger.info("value of dataset_to_work_on: " +
                       str(dataset_to_work_on))

    if (dataset_to_work_on == "fnc"):
        fever_dataset_details = uofa_params.pop('fever_dataset_details', {})
        dev_partition_details = fever_dataset_details.pop(
            'dev_partition_details', {})
        name_of_trained_model_to_use = dev_partition_details.pop(
            'name_of_trained_model_to_use', {})
        path_to_pyproc_annotated_data_folder = dev_partition_details.pop(
            'path_to_pyproc_annotated_data_folder', {})
        debug_mode = uofa_params.pop('debug_mode', {})
        path_to_trained_models_folder = uofa_params.pop(
            'path_to_trained_models_folder', {})
        path_to_fnc_annotated_data = dev_partition_details.pop(
            'path_to_pyproc_annotated_data_folder', {})
        eval_model_fnc_data(db, args, mithun_logger,
                            name_of_trained_model_to_use,
                            path_to_trained_models_folder, cuda_device,
                            operation, path_to_fnc_annotated_data)

    elif (dataset_to_work_on == "fever"):
        fever_dataset_details = uofa_params.pop('fever_dataset_details', {})
        dev_partition_details = fever_dataset_details.pop(
            'dev_partition_details', {})
        name_of_trained_model_to_use = dev_partition_details.pop(
            'name_of_trained_model_to_use', {})
        path_to_pyproc_annotated_data_folder = dev_partition_details.pop(
            'path_to_pyproc_annotated_data_folder', {})
        debug_mode = uofa_params.pop('debug_mode', {})
        path_to_trained_models_folder = uofa_params.pop(
            'path_to_trained_models_folder', {})

        eval_model(db, args, mithun_logger, path_to_trained_models_folder,
                   name_of_trained_model_to_use)
def load_whole_glove(glove_file):
    logger = LogHelper.get_logger("load_whole_glove")
    is_gz = os.path.splitext(glove_file)[1] == '.gz'

    # Getting embedding dimension
    def _get_dim(_file):
        line = _file.readline()
        return len(line.strip().split(' ')) - 1

    if is_gz:
        with gzip.open(glove_file, 'rt') as file0:
            emb_dim = _get_dim(file0)
    else:
        with open(glove_file, 'r', encoding='utf-8') as file0:
            emb_dim = _get_dim(file0)

    # First row of embedding matrix is 0 for zero padding
    vocab = ['[PAD]']
    embed = [[0.0] * emb_dim]
    vocab.append('UNK')
    embed.append([1.0] * emb_dim)

    def _read_glove_file(_vocab, _embed, _file):
        for line in _file:
            items = line.replace('\r', '').replace('\n', '').split(' ')
            if len(items) < 10:
                logger.debug("exceptional line: {}".format(line))
                continue
            word = items[0]
            _vocab.append(word)
            vec = [float(i) for i in items[1:]]
            _embed.append(vec)
        return _vocab, _embed

    # Reading embedding matrix
    if is_gz:
        with gzip.open(glove_file, 'rt') as file:
            vocab, embed = _read_glove_file(vocab, embed, file)
    else:
        with open(glove_file, 'r', encoding='utf-8') as file:
            vocab, embed = _read_glove_file(vocab, embed, file)
    logger.info('Loaded GloVe!')
    return vocab, embed
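# Hypothetical usage: the GloVe file path is an assumption. vocab_map (used elsewhere
# in this listing) is assumed to turn the vocab list into a word -> row-index dict.
glove_vocab, glove_embed = load_whole_glove("data/glove/glove.6B.300d.txt.gz")
glove_vocab_dict = vocab_map(glove_vocab)
embedding_matrix = np.asarray(glove_embed, dtype=np.float32)
# embedding_matrix[0] is the all-zero [PAD] row and embedding_matrix[1] the UNK row.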
Example #18
    def __init__(self, optimizer='adam', h_max_length=50, s_max_length=50, learning_rate=0.0001,
                 batch_size=128, activation='relu', initializer='he', num_epoch=100, dropout_rate=None,
                 max_check_without_progress=10, model_store_dir=None, random_state=None, trainable=False,
                 share_rnn=False, vocab_size=None, embedding_size=300, filter_width=3, num_filters=300,
                 s_max_num_sents=5, sent_encoding_size=None, n_outputs=3, pos_weight=None, name=None, show_progress=1,
                 ckpt_path=None):

        self.optimizer = optimizer
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.num_epoch = num_epoch
        self.initializer = initializer
        self.dropout_rate = dropout_rate
        self.max_checks_without_progress = max_check_without_progress
        self.random_state = random_state
        self.model_store_dir = model_store_dir
        self.trainable = trainable
        self.share_rnn = share_rnn
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.filter_width = filter_width
        self.num_filters = num_filters
        self.s_max_num_sents = s_max_num_sents
        self.sent_encoding_size = sent_encoding_size if sent_encoding_size is not None else embedding_size
        self.n_outputs = n_outputs
        self.pos_weight = pos_weight
        self.name = name
        self.show_progress = show_progress
        self.max_check_without_progress = max_check_without_progress
        self.ckpt_path = ckpt_path

        self._session = None
        self.embedding = None
        self._graph = None
        self._classes = None
        self._initializer = None
        self._optimizer = None
        self._activation = None
        self.logger = LogHelper.get_logger(self.__class__.__name__)
def fasttext_padding_for_single_sentence_set_given_size(
        fasttext_embeddings, max_sent_size=None):
    logger = LogHelper.get_logger(
        "fasttext_padding_for_single_sentence_set_given_size")
    sent_sizes_ = np.asarray([len(sent) for sent in fasttext_embeddings])
    if max_sent_size is None:
        max_sent_size = sent_sizes_.max()

    def padded_text_ids(_list, num_doc, max_num_words):
        doc_np = np.zeros([num_doc, max_num_words, dim_fasttext],
                          dtype=np.float32)
        for i, doc in enumerate(_list):
            for k, word in enumerate(doc):
                if k >= max_num_words:
                    break
                doc_np[i, k] = word
        return doc_np

    ft_np = padded_text_ids(fasttext_embeddings, len(fasttext_embeddings),
                            max_sent_size)
    return ft_np
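# A minimal usage sketch with hypothetical data: pad two sentences of fastText vectors
# to a common length of 6 tokens; dim_fasttext is the module-level embedding dimension.
example_sents = [[np.zeros(dim_fasttext, np.float32)] * 4,
                 [np.zeros(dim_fasttext, np.float32)] * 2]
example_sent_np = fasttext_padding_for_single_sentence_set_given_size(example_sents,
                                                                      max_sent_size=6)
# example_sent_np.shape == (2, 6, dim_fasttext)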
def embed_data_set_with_glove_2(data_set_path: str,
                                db: Union[str, FeverDocDB],
                                glove_path: str = None,
                                vocab_dict: Dict[str, int] = None,
                                glove_embeddings=None,
                                predicted: bool = True,
                                threshold_b_sent_num=None,
                                threshold_b_sent_size=50,
                                threshold_h_sent_size=50):
    if vocab_dict is None or glove_embeddings is None:
        vocab, glove_embeddings = load_whole_glove(glove_path)
        vocab_dict = vocab_map(vocab)
    logger = LogHelper.get_logger("embed_data_set_given_vocab")
    datas, labels = read_data_set_from_jsonl(data_set_path, db, predicted)
    heads_ids = single_sentence_set_2_ids_given_vocab(datas['h'], vocab_dict)
    logger.debug("Finished sentence to IDs for claims")
    bodies_ids = multi_sentence_set_2_ids_given_vocab(datas['b'], vocab_dict)
    logger.debug("Finished sentence to IDs for evidences")
    h_np, h_sent_sizes = ids_padding_for_single_sentence_set_given_size(
        heads_ids, threshold_h_sent_size)
    logger.debug("Finished padding claims")
    b_np, b_sizes, b_sent_sizes = ids_padding_for_multi_sentences_set(
        bodies_ids, threshold_b_sent_num, threshold_b_sent_size)
    logger.debug("Finished padding evidences")
    processed_data_set = {
        'data': {
            'h_np': h_np,
            'b_np': b_np,
            'h_sent_sizes': h_sent_sizes,
            'b_sent_sizes': b_sent_sizes,
            'b_sizes': b_sizes
        },
        'id': datas['id']
    }
    if 'paths' in datas:
        padded_paths_np = pad_paths(datas['paths'], threshold_b_sent_num)
        processed_data_set['data']['paths'] = padded_paths_np
    if labels is not None and len(labels) == len(processed_data_set['id']):
        processed_data_set['label'] = labels
    return processed_data_set, vocab_dict, glove_embeddings, threshold_b_sent_num, threshold_b_sent_size
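# Hypothetical usage: both file paths and the thresholds are assumptions rather than
# values from this repository's Config. The returned vocab_dict and glove_embeddings
# can be passed back in for the dev/test splits so GloVe is only loaded once.
train_set, vocab_dict, glove_embeddings, _, _ = embed_data_set_with_glove_2(
    "data/train.sentences.p5.s5.jsonl",
    "data/fever/fever.db",
    glove_path="data/glove/glove.6B.300d.txt.gz",
    threshold_b_sent_num=5,
    threshold_b_sent_size=50,
    threshold_h_sent_size=50)
# train_set['data'] holds the padded id arrays (h_np, b_np) plus the sentence sizes.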
Example #21
    def __init__(self,
                 lstm_layers=1,
                 mlp_layers=2,
                 num_neurons=[128, 128, 32],
                 optimizer=tf.train.AdamOptimizer,
                 learning_rate=0.0001,
                 batch_size=128,
                 activation=tf.nn.relu,
                 initializer=he_init,
                 num_epoch=100,
                 batch_norm_momentum=None,
                 dropout_rate=None,
                 max_check_without_progress=10,
                 show_progress=1,
                 tensorboard_logdir=None,
                 random_state=None,
                 l2_lambda=0):

        self.lstm_layers = lstm_layers
        self.mlp_layers = mlp_layers
        self.num_neurons = num_neurons
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.num_epoch = num_epoch
        self.initializer = initializer
        self.batch_norm_momentum = batch_norm_momentum
        self.dropout_rate = dropout_rate
        self.max_checks_without_progress = max_check_without_progress
        self.show_progress = show_progress
        self.random_state = random_state
        self.tensorboard_logdir = tensorboard_logdir
        self.l2_lambda = l2_lambda
        self._session = None
        self.logger = LogHelper.get_logger(self.__class__.__name__)
def evidence_num_to_text(db: Union[Dict, FeverDocDB],
                         page_id: str,
                         line: int,
                         is_snopes: bool = False):
    assert isinstance(
        db, Dict) or not is_snopes, "db should be dictionary for Snopes data"
    assert isinstance(
        db, FeverDocDB) or is_snopes, "db should be fever DB for fever data"
    logger = LogHelper.get_logger("evidence_num_to_text")
    if is_snopes:
        return evidence_num_to_text_snopes(db, page_id, line)
    lines = db.get_doc_lines(page_id)
    if lines is None:
        return ""
    if line > -1:
        return lines.split("\n")[line].split("\t")[1]
    else:
        non_empty_lines = [
            line.split("\t")[1] for line in lines.split("\n")
            if len(line.split("\t")) > 1 and len(line.split("\t")[1].strip())
        ]
        return non_empty_lines[SimpleRandom.get_instance().next_rand(
            0,
            len(non_empty_lines) - 1)]
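# Hypothetical usage against a FEVER SQLite DB (path and page id are assumptions):
fever_db = FeverDocDB("data/fever/fever.db")
first_sentence = evidence_num_to_text(fever_db, "Barack_Obama", 0)
random_sentence = evidence_num_to_text(fever_db, "Barack_Obama", -1)  # random non-empty line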
Example #23
    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model


if __name__ == "__main__":
    LogHelper.setup()
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)

    parser = argparse.ArgumentParser()
    parser.add_argument('db', type=str, help='/path/to/saved/db.db')
    parser.add_argument(
        'param_path',
        type=str,
        help='path to parameter file describing the model to be trained')

    parser.add_argument("logdir", type=str)

    parser.add_argument("--filtering", type=str, default=None)
    parser.add_argument("--cuda-device",
                        type=int,
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    logger.info("this script is only for FEVER dataset")
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)
            training_set['data']['scores'] = load_scores(
                Config.training_set_file, Config.max_sentences)

            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            valid_set['data']['scores'] = load_scores(Config.dev_set_file,
                                                      Config.max_sentences)

            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['scores'] = load_scores(Config.test_set_file,
                                                 Config.max_sentences)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
Example #25
import os
import random
from collections import defaultdict

import pymysql
import pymysql.cursors

from common.dataset.corpus import Corpus
from common.util.log_helper import LogHelper


def preprocess(p):
    return (p.replace(" ", "_")
             .replace("(", "-LRB-")
             .replace(")", "-RRB-")
             .replace(":", "-COLON-")
             .split("#")[0])

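# A quick sanity check of the normalisation above (hypothetical page titles):
assert preprocess("Barack Obama (politician)") == "Barack_Obama_-LRB-politician-RRB-"
assert preprocess("List of things#Section") == "List_of_things"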

lut = dict()

LogHelper.setup()

pages = Corpus("page", "data/fever", 50, lambda x: x)
for page, doc in pages:
    lut[page] = doc

claim_evidence = defaultdict(lambda: [])

# Connect to the database
connection = pymysql.connect(host=os.getenv("DB_HOST", "localhost"),
                             user=os.getenv("DB_USER", "root"),
                             password=os.getenv("DB_PASS", ""),
                             db=os.getenv("DB_SCHEMA", "fever"),
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
    _line['predicted_evidence'] = _line['predicted_evidence'][:args.max_evidence]
    _line['scores'] = _line['scores'][:args.max_evidence]
    return _line


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='/path/to/input/file')
    parser.add_argument('output', help='/path/to/output/file')
    parser.add_argument('--max_evidence',
                        help='max num of evidences',
                        type=int,
                        default=5)
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("replace_noise_dataset")
    random.seed(55)
    jlr = JSONLineReader()
    lines = jlr.read(args.input)
    counter = 0
    with open(args.output, 'w') as f:
        for i, line in tqdm(enumerate(lines)):
            if line['label'] != 'NOT ENOUGH INFO' and not is_gold_evidence_predicted(line):
                counter += 1
                logger.info("line " + str(i + 1) + " should be filled")
                line = random_fill_gold_evidence(line)
            f.write(json.dumps(line) + '\n')
    logger.info(str(counter) + " samples filled with gold evidence")
# LICENSE file in the root directory of this source tree.
"""A script to read in and store documents in a sqlite database."""

import argparse
import importlib.util
import json
import os
import sqlite3
from multiprocessing import Pool as ProcessPool

from drqa.retriever import utils
from tqdm import tqdm

from common.util.log_helper import LogHelper

LogHelper.setup()
logger = LogHelper.get_logger("DrQA BuildDB")

# ------------------------------------------------------------------------------
# Preprocessing Function.
# ------------------------------------------------------------------------------

PREPROCESS_FN = None


def init(filename):
    global PREPROCESS_FN
    if filename:
        PREPROCESS_FN = import_module(filename).preprocess
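# The init() hook above relies on an import_module helper that is not part of this
# excerpt. A minimal sketch, assuming the standard importlib.util file-loading pattern:
def import_module(filename):
    """Load a Python module from an explicit file path (hypothetical helper)."""
    spec = importlib.util.spec_from_file_location("doc_filter", filename)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module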

def embed_data_set_with_glove_and_fasttext(data_set_path: str,
                                           db: Union[str, FeverDocDB],
                                           fasttext_model: Union[str,
                                                                 FastText],
                                           glove_path: str = None,
                                           vocab_dict: Dict[str, int] = None,
                                           glove_embeddings=None,
                                           predicted: bool = True,
                                           threshold_b_sent_num=None,
                                           threshold_b_sent_size=50,
                                           threshold_h_sent_size=50,
                                           is_snopes=False):
    assert vocab_dict is not None and glove_embeddings is not None or glove_path is not None, "Either vocab_dict and glove_embeddings, or glove_path should be not None"
    if vocab_dict is None or glove_embeddings is None:
        vocab, glove_embeddings = load_whole_glove(glove_path)
        vocab_dict = vocab_map(vocab)
    logger = LogHelper.get_logger("embed_data_set_given_vocab")
    datas, labels = read_data_set_from_jsonl(data_set_path,
                                             db,
                                             predicted,
                                             is_snopes=is_snopes)
    heads_ft_embeddings, fasttext_model = single_sentence_set_2_fasttext_embedded(
        datas['h'], fasttext_model)
    logger.debug("Finished sentence to FastText embeddings for claims")
    heads_ids = single_sentence_set_2_ids_given_vocab(datas['h'], vocab_dict)
    logger.debug("Finished sentence to IDs for claims")
    bodies_ft_embeddings, fasttext_model = multi_sentence_set_2_fasttext_embedded(
        datas['b'], fasttext_model)
    logger.debug("Finished sentence to FastText embeddings for evidences")
    bodies_ids = multi_sentence_set_2_ids_given_vocab(datas['b'], vocab_dict)
    logger.debug("Finished sentence to IDs for evidences")
    h_ft_np = fasttext_padding_for_single_sentence_set_given_size(
        heads_ft_embeddings, threshold_h_sent_size)
    logger.debug(
        "Finished padding FastText embeddings for claims. Shape of h_ft_np: {}"
        .format(str(h_ft_np.shape)))
    b_ft_np = fasttext_padding_for_multi_sentences_set(bodies_ft_embeddings,
                                                       threshold_b_sent_num,
                                                       threshold_b_sent_size)
    logger.debug(
        "Finished padding FastText embeddings for evidences. Shape of b_ft_np: {}"
        .format(str(b_ft_np.shape)))
    h_np, h_sent_sizes = ids_padding_for_single_sentence_set_given_size(
        heads_ids, threshold_h_sent_size)
    logger.debug("Finished padding claims")
    b_np, b_sizes, b_sent_sizes = ids_padding_for_multi_sentences_set(
        bodies_ids, threshold_b_sent_num, threshold_b_sent_size)
    logger.debug("Finished padding evidences")
    processed_data_set = {
        'data': {
            'h_np': h_np,
            'b_np': b_np,
            'h_ft_np': h_ft_np,
            'b_ft_np': b_ft_np,
            'h_sent_sizes': h_sent_sizes,
            'b_sent_sizes': b_sent_sizes,
            'b_sizes': b_sizes
        },
        'id': datas['id']
    }
    if labels is not None and len(labels) == len(processed_data_set['id']):
        processed_data_set['label'] = labels
    return processed_data_set, fasttext_model, vocab_dict, glove_embeddings, threshold_b_sent_num, threshold_b_sent_size
Example #29
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid,
                 Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_data_set_from_jsonl(
                Config.training_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(
                Config.dev_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            b_train = X_train['b']
            X_train['b_sizes'] = get_num_sents_of_bodies(b_train)
            for i, sample in enumerate(b_train):
                if len(sample) < Config.max_sentences:
                    for _ in range(Config.max_sentences - len(sample)):
                        sample.append(" ")
                b_train[i] = np.asarray(sample)
            b_train = np.asarray(b_train)
            X_train['b'] = b_train
            logger.debug("b_train.shape: " + str(b_train.shape))
            b_valid = X_valid['b']
            X_valid['b_sizes'] = get_num_sents_of_bodies(b_valid)
            for i, sample in enumerate(b_valid):
                if len(sample) < Config.max_sentences:
                    for _ in range(Config.max_sentences - len(sample)):
                        sample.append(" ")
                b_valid[i] = np.asarray(sample)
            b_valid = np.asarray(b_valid)
            X_valid['b'] = b_valid
            logger.debug("b_valid.shape: " + str(b_valid.shape))
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(
                        (X_train, Y_labels_train, X_valid, Y_labels_valid),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(
            Config.test_set_file,
            Config.db_path,
            num_sentences=Config.max_sentences,
            is_snopes=is_snopes)
        b_test = X_test['b']
        X_test['b_sizes'] = get_num_sents_of_bodies(b_test)
        for i, sample in enumerate(b_test):
            if len(sample) < Config.max_sentences:
                for _ in range(Config.max_sentences - len(sample)):
                    sample.append(" ")
            b_test[i] = np.asarray(sample)
        b_test = np.asarray(b_test)
        X_test['b'] = b_test
        logger.debug("b_test.shape: " + str(b_test.shape))
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file,
                            Config.submission_file)
        if Y_labels_test:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
def read_data_set_from_jsonl(file_path: str,
                             db: Union[str, FeverDocDB],
                             predicted: bool = True,
                             num_sentences=None,
                             is_snopes=False):
    logger = LogHelper.get_logger("read_data_set_from_jsonl")
    if not is_snopes:
        if type(db) is str:
            db = FeverDocDB(db)
    else:
        with open(db) as f:
            db = json.load(f)
    with open(file_path, 'r') as f:
        claims = []
        evidences = []
        paths = []
        labels = []
        ids = []
        for line in tqdm(f):
            json_obj = json.loads(line)
            if predicted:
                evidences_texts = []
                if 'predicted_evidence' in json_obj:
                    _evidences = json_obj['predicted_evidence']
                elif 'predicted_sentences' in json_obj:
                    _evidences = json_obj['predicted_sentences']
                else:
                    _evidences = []
                if len(_evidences) > 0:
                    for sent in _evidences:
                        page, line_num = sent[-2], sent[-1]
                        page_title = page.replace("_", " ")
                        evidences_texts.append(
                            # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                            clean_text(
                                evidence_num_to_text(db, page, line_num,
                                                     is_snopes)))
            else:
                evidences_texts = set()
                _evidences = json_obj['evidence']
                for evidence in _evidences:
                    for sent in evidence:
                        page, line_num = sent[-2], sent[-1]
                        page_title = page.replace("_", " ")
                        evidences_texts.add(
                            # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                            clean_text(
                                evidence_num_to_text(db, page, line_num,
                                                     is_snopes)))
                evidences_texts = list(evidences_texts)
            if len(evidences_texts) == 0:
                continue
            if num_sentences is not None:
                if len(evidences_texts) > num_sentences:
                    evidences_texts = evidences_texts[:num_sentences]
            claims.append(clean_text(json_obj['claim']))
            if 'label' in json_obj:
                labels.append(label_dict.index(json_obj['label']))
            evidences.append(evidences_texts)
            if 'paths' in json_obj:
                paths_from_sent_to_claim = [
                    1.0 if p else 0.0 for p in json_obj['paths']
                ]
                if num_sentences is not None and num_sentences > len(
                        paths_from_sent_to_claim):
                    paths_from_sent_to_claim += [0.0] * (
                        num_sentences - len(paths_from_sent_to_claim))
                paths.append(paths_from_sent_to_claim)
            ids.append(json_obj['id'])
        datas = {'h': claims, 'b': evidences, 'id': ids}
        if paths:
            datas['paths'] = paths
        return datas, labels
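# Hypothetical usage on a FEVER-style JSONL file (both paths are assumptions):
example_datas, example_labels = read_data_set_from_jsonl(
    "data/dev.sentences.p5.s5.jsonl",
    "data/fever/fever.db",
    predicted=True,
    num_sentences=5)
# example_datas['h'] holds cleaned claims, example_datas['b'] the evidence sentence
# lists and example_datas['id'] the claim ids; example_labels stays empty when the
# file carries no gold labels.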