Example #1
    def nextblock(self):
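        # Close the current block file (if any) and open the next
        # zero-padded block file, e.g. data/fever/wiki/wiki-001.jsonl.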
        self.block = (self.block + 1) if self.block is not None else 0
        if self.file is not None:
            self.file.close()
        self.file = open(
            os.path.join(
                "data", "fever", "wiki",
                "wiki-{0}.jsonl".format(str.zfill(str(self.block), 3))), "w+")

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()


if __name__ == "__main__":
    blocks = int(sys.argv[1])
    LogHelper.setup()
    logger = LogHelper.get_logger("convert")

    blk = Corpus("page", os.path.join("data", "fever"), blocks, lambda x:
                 (x, read_words(x)))

    with BlockWriter(os.path.join("data", "fever", "wiki"), 50000) as f:
        for page, body in tqdm(blk):
            f.write(
                json.dumps({
                    "id": page,
                    "text": " ".join(body[1]),
                    "lines": body[0]
                }))
Example #2
        neg_indices = reversed(neg_indices)
        for i in neg_indices:
            sent = selected_sents[i]
            selected_sents[i] = _replace_sent_with_str(sent, gold_sents.pop())
            if len(gold_sents) == 0:
                return selected_sents
    if len(gold_sents) > 0:
        logger.warning(
            str(len(gold_sents)) +
            " gold sentences could not be filled into the predictions")
    return selected_sents


if __name__ == '__main__':
    LogHelper.setup()
    logger = LogHelper.get_logger('fill_gold_sentences')
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help='/path/to/input/file', required=True)
    parser.add_argument('--output', help='/path/to/output/file', required=True)
    parser.add_argument('--max-sent',
                        type=int,
                        help='Maximal number of sentences per claim',
                        default=10)
    args = parser.parse_args()
    jlr = JSONLineReader()
    data = jlr.read(args.input)
    with open(args.output, "w+") as output_file:
        for data in tqdm(data):
            if data['verifiable'] != 'NOT VERIFIABLE':
                pred_sents = data['predicted_sentences']
                gold_evidences = data['evidence']
Example #3
                    required=True)
parser.add_argument('-o',
                    '--output',
                    help='/path/to/output/file',
                    required=True)
parser.add_argument('-ip',
                    '--input-pickle',
                    help='/path/to/input/pickle/file')
parser.add_argument('-op',
                    '--output-pickle',
                    help='/path/to/output/pickle/file',
                    required=True)
parser.add_argument('-db', '--db', help='/path/to/db/file', required=True)
args = parser.parse_args()
LogHelper.setup()
logger = LogHelper.get_logger("generate_paths")
db = FeverDocDB(args.db)
jlr = JSONLineReader()
lines = jlr.read(args.input)
if args.input_pickle is not None:
    with open(args.input_pickle, 'rb') as f:
        per_claim_dict = pickle.load(f)
else:
    logger.warning("no pickle file loaded!")
    per_claim_dict = dict()
with open(args.output,
          'w') as f, ThreadPoolExecutor(max_workers=8) as executor:
    future_map = dict()
    for line in tqdm(lines):
        _id = line['id']
        claim_text = line['claim']
Example #4
from common.util.log_helper import LogHelper
from retrieval.fever_doc_db import FeverDocDB
from rte.parikh.reader import FEVERReader

import argparse
import numpy as np


from rte.riedel.data import FEVERGoldFormatter, FEVERLabelSchema
# from scripts.retrieval.sentence.process_tfidf import XTermFrequencyFeatureFunction
from retrieval.process_tfidf import XTermFrequencyFeatureFunction

from rte.tmp.nei_rte_model import NeiRteModel

LogHelper.setup()
logger = LogHelper.get_logger(__name__)  # pylint: disable=invalid-name


def tf_idf_sim(claim, lines):
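    # Pair the claim with every candidate line and score the pairs with the
    # module-level TF-IDF feature function, returning a flat list of scores.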
    test = []
    for line in lines:
        test.append({"claim": claim, "text": line})

    return tf.lookup(test).reshape(-1).tolist()

def eval_model(db: FeverDocDB, args) -> Model:
    # archive = load_archive(args.archive_file, cuda_device=args.cuda_device, overrides=args.overrides)

    # config = archive.config
    # ds_params = config["dataset_reader"]
    #
Example #5
def embed_claims(claims: List,
                 db: Union[str, FeverDocDB],
                 fasttext_model: Union[str, FastText],
                 glove_path: str = None,
                 vocab_dict: Dict[str, int] = None,
                 glove_embeddings=None,
                 predicted: bool = True,
                 threshold_b_sent_num=None,
                 threshold_b_sent_size=50,
                 threshold_h_sent_size=50,
                 is_snopes=False):
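    """Embed claims and evidence with FastText vectors and GloVe vocabulary IDs,
    then pad everything to fixed sentence counts and sizes. GloVe is loaded from
    glove_path unless vocab_dict and glove_embeddings are passed in."""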
    assert vocab_dict is not None and glove_embeddings is not None or glove_path is not None, "Either vocab_dict and glove_embeddings, or glove_path should be not None"
    if vocab_dict is None or glove_embeddings is None:
        vocab, glove_embeddings = load_whole_glove(glove_path)
        vocab_dict = vocab_map(vocab)
    print(len(claims))
    logger = LogHelper.get_logger("embed_data_set_given_vocab")
    datas, labels = read_data_set_from_lines(claims,
                                             db,
                                             predicted,
                                             is_snopes=is_snopes)
    print(len(datas["h"]), len(datas["b"]))
    heads_ft_embeddings, fasttext_model = single_sentence_set_2_fasttext_embedded(
        datas['h'], fasttext_model)
    logger.debug("Finished sentence to FastText embeddings for claims")
    print(len(heads_ft_embeddings))
    heads_ids = single_sentence_set_2_ids_given_vocab(datas['h'], vocab_dict)
    logger.debug("Finished sentence to IDs for claims")
    bodies_ft_embeddings, fasttext_model = multi_sentence_set_2_fasttext_embedded(
        datas['b'], fasttext_model)
    logger.debug("Finished sentence to FastText embeddings for evidences")
    bodies_ids = multi_sentence_set_2_ids_given_vocab(datas['b'], vocab_dict)
    logger.debug("Finished sentence to IDs for evidences")
    h_ft_np = fasttext_padding_for_single_sentence_set_given_size(
        heads_ft_embeddings, threshold_h_sent_size)
    logger.debug(
        "Finished padding FastText embeddings for claims. Shape of h_ft_np: {}"
        .format(str(h_ft_np.shape)))
    b_ft_np = fasttext_padding_for_multi_sentences_set(bodies_ft_embeddings,
                                                       threshold_b_sent_num,
                                                       threshold_b_sent_size)
    logger.debug(
        "Finished padding FastText embeddings for evidences. Shape of b_ft_np: {}"
        .format(str(b_ft_np.shape)))
    h_np, h_sent_sizes = ids_padding_for_single_sentence_set_given_size(
        heads_ids, threshold_h_sent_size)
    logger.debug("Finished padding claims")
    b_np, b_sizes, b_sent_sizes = ids_padding_for_multi_sentences_set(
        bodies_ids, threshold_b_sent_num, threshold_b_sent_size)
    logger.debug("Finished padding evidences")
    processed_data_set = {
        'data': {
            'h_np': h_np,
            'b_np': b_np,
            'h_ft_np': h_ft_np,
            'b_ft_np': b_ft_np,
            'h_sent_sizes': h_sent_sizes,
            'b_sent_sizes': b_sent_sizes,
            'b_sizes': b_sizes
        }
    }

    return processed_data_set, fasttext_model, vocab_dict, glove_embeddings, threshold_b_sent_num, threshold_b_sent_size
Example #6
    with open(fever_data_file, 'w') as f:
        for row in claim_dict:
            f.write(json.dumps(row) + '\n')
    page_dict = dict()
    for url_pages in snippet_dict.values():
        for page in url_pages:
            page_dict[page['id']] = {
                'lines': page['lines'],
                'stance': page['stance']
            }
    with open(fever_page_file, 'w') as f:
        json.dump(page_dict, f, indent=4)


if __name__ == '__main__':
    import argparse
    from common.util.log_helper import LogHelper

    parser = argparse.ArgumentParser()
    parser.add_argument('--snopes', help='/path/to/snopes/file', required=True)
    parser.add_argument('--out-page',
                        help='/path/to/page/output/file',
                        required=True)
    parser.add_argument('--out-claim',
                        help='/path/to/claim/output/file',
                        required=True)
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger('snopes')
    snopes_file_2_fever_data_set(args.snopes, args.out_page, args.out_claim)
Example #7
    return os.path.exists(os.path.join("models", "{0}.model".format(mname)))


def str2bool(v):
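    # Parse common yes/no spellings into a bool so argparse flags such as
    # --sentence accept values like "true", "y" or "0".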
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


if __name__ == "__main__":

    LogHelper.setup()
    logger = LogHelper.get_logger(__name__)

    parser = argparse.ArgumentParser()
    parser.add_argument('db', type=str, help='db file path')
    parser.add_argument('test', type=str, help='test file path')
    parser.add_argument("--model", type=str, help="model name")
    parser.add_argument("--sentence", type=str2bool, default=False)
    parser.add_argument("--log", type=str, default=None)
    args = parser.parse_args()

    logger.info("Loading DB {0}".format(args.db))
    db = FeverDocDB(args.db)

    mname = args.model
    logger.info("Model name is {0}".format(mname))
Example #8
def main(mode, config, estimator=None):
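    # Train or evaluate the end-to-end ESIM model; optional Config flags toggle
    # inter-evidence comparison, claim-evidence comparison and extra features.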
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + mode)
    logger.info("model: " + mode + ", config: " + str(config))
    if hasattr(Config, 'use_inter_evidence_comparison'):
        use_inter_evidence_comparison = Config.use_inter_evidence_comparison
    else:
        use_inter_evidence_comparison = False
    if hasattr(Config, 'use_claim_evidences_comparison'):
        use_claim_evidences_comparison = Config.use_claim_evidences_comparison
    else:
        use_claim_evidences_comparison = False
    if hasattr(Config, 'use_extra_features'):
        use_extra_features = Config.use_extra_features
    else:
        use_extra_features = False
    if hasattr(Config, 'use_numeric_feature'):
        use_numeric_feature = Config.use_numeric_feature
    else:
        use_numeric_feature = False
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_end_2_end_hyper_param))
    logger.info("use_inter_sentence_comparison: " +
                str(use_inter_evidence_comparison))
    logger.info("use_extra_features: " + str(use_extra_features))
    logger.info("use_numeric_feature: " + str(use_numeric_feature))
    logger.info("use_claim_evidences_comparison: " +
                str(use_claim_evidences_comparison))
    if mode == 'train':
        # # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)

            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            if use_extra_features:
                assert hasattr(
                    Config, 'feature_path'
                ), "Config should has feature_path if Config.use_feature is True"
                training_claim_features, training_evidence_features = load_feature_by_data_set(
                    Config.training_set_file, Config.feature_path,
                    Config.max_sentences)
                valid_claim_features, valid_evidence_features = load_feature_by_data_set(
                    Config.dev_set_file, Config.feature_path,
                    Config.max_sentences)
                training_set['data']['h_feats'] = training_claim_features
                training_set['data']['b_feats'] = training_evidence_features
                valid_set['data']['h_feats'] = valid_claim_features
                valid_set['data']['b_feats'] = valid_evidence_features
            if use_numeric_feature:
                training_num_feat = number_feature(Config.training_set_file,
                                                   Config.db_path,
                                                   Config.max_sentences)
                valid_num_feat = number_feature(Config.dev_set_file,
                                                Config.db_path,
                                                Config.max_sentences)
                training_set['data']['num_feat'] = training_num_feat
                valid_set['data']['num_feat'] = valid_num_feat
            if use_inter_evidence_comparison:
                training_concat_sent_indices, training_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices'] = training_concat_sent_indices
                training_set['data'][
                    'b_concat_sizes'] = training_concat_sent_sizes
                valid_concat_sent_indices, valid_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices'] = valid_concat_sent_indices
                valid_set['data']['b_concat_sizes'] = valid_concat_sent_sizes
            if use_claim_evidences_comparison:
                training_all_evidences_indices, training_all_evidences_sizes = generate_concat_indices_for_claim(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices_for_h'] = training_all_evidences_indices
                training_set['data'][
                    'b_concat_sizes_for_h'] = training_all_evidences_sizes
                valid_all_evidences_indices, valid_all_evidences_sizes = generate_concat_indices_for_claim(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices_for_h'] = valid_all_evidences_indices
                valid_set['data'][
                    'b_concat_sizes_for_h'] = valid_all_evidences_sizes
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    elif mode == 'test':
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        if use_extra_features:
            assert hasattr(
                Config, 'feature_path'
            ), "Config should has feature_path if Config.use_feature is True"
            test_claim_features, test_evidence_features = load_feature_by_data_set(
                Config.test_set_file, Config.feature_path,
                Config.max_sentences)
            test_set['data']['h_feats'] = test_claim_features
            test_set['data']['b_feats'] = test_evidence_features
        if use_numeric_feature:
            test_num_feat = number_feature(Config.test_set_file,
                                           Config.db_path,
                                           Config.max_sentences)
            test_set['data']['num_feat'] = test_num_feat
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if use_inter_evidence_comparison:
            test_concat_sent_indices, test_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data']['b_concat_indices'] = test_concat_sent_indices
            test_set['data']['b_concat_sizes'] = test_concat_sent_sizes
        if use_claim_evidences_comparison:
            test_all_evidences_indices, test_all_evidences_sizes = generate_concat_indices_for_claim(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data'][
                'b_concat_indices_for_h'] = test_all_evidences_indices
            test_set['data']['b_concat_sizes_for_h'] = test_all_evidences_sizes
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    else:
        logger.error("Invalid argument --mode: " + mode +
                     " Argument --mode should be either 'train’ or ’test’")
    return estimator
                "predicted_label": prediction_2_label(_prediction)
            }
            f.write(json.dumps(obj))
            f.write('\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode',
                        help='\'train\' or \'test\' or \'stub\'',
                        required=True)
    parser.add_argument('--config',
                        help='/path/to/config/file, in JSON format')
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + args.mode)
    logger.info("parameters:\n" + str(vars(args)))
    if args.config is not None:
        Config.load_config(args.config)
    # loading FastText takes long time, so better pickle the loaded FastText model
    if os.path.splitext(Config.fasttext_path)[1] == '.p':
        with open(Config.fasttext_path, "rb") as ft_file:
            fasttext_model = pickle.load(ft_file)
    else:
        fasttext_model = Config.fasttext_path

    if args.mode == 'train':
        # # training mode
        training_set, fasttext_model, vocab, embeddings, _, _ = embed_data_set_with_glove_and_fasttext(
            Config.training_set_file,
            Config.db_path,
Example #10
import argparse
import json
import os
import sqlite3
from hashlib import md5
import unicodedata as ud

from common.util.log_helper import LogHelper

LogHelper.setup()
logger = LogHelper.get_logger("FEVERcs Normalize")
BUFFER_SIZE, tmp_file = 100000, ""


def is_fever_db(file):
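    # Return True if the SQLite file exposes a FEVER-style documents(id, text, lines) table.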
    try:
        with sqlite3.connect(file) as db:
            db.cursor().execute(
                "SELECT id, text, lines FROM documents LIMIT 1")
            return True
    except sqlite3.DatabaseError:
        return False


def normalize_sqlite(args):
    logger.info('Reading into a temporary database...')
    with sqlite3.connect(
            args.source_file) as source, sqlite3.connect(tmp_file) as target:
        c_source, c_target = source.cursor(), target.cursor()
        c_target.execute(
            "CREATE TABLE documents (id PRIMARY KEY, text, lines);")
Example #11
def vocab_map(vocab):
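    # Map each vocabulary token to an integer ID, reserving 0 for PAD and 1 for UNK.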
    voc_dict = {}
    for i, v in enumerate(vocab):
        voc_dict[v] = i + 2
    voc_dict['PAD'] = 0
    voc_dict['UNK'] = 1
    return voc_dict


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('db', help='/path/to/db/file')
    parser.add_argument('output', help='/path/to/output/pickle/file')
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("generate_vocab_all_wiki")
    db = FeverDocDB(args.db)
    vocab = set()
    for doc in tqdm(db.get_doc_ids()):
        lines = db.get_doc_lines(doc)
        lines = lines.split("\n")
        for line in lines:
            segments = line.split("\t")
            if len(segments) < 2:
                continue
            line = segments[1]
            if line.strip() == "":
                continue
            tokens = set(token.lower() for token in tokenize(clean_text(line)))
            vocab.update(tokens)
    logger.info("total size of vocab: " + str(len(vocab)))
Example #12
    """
    logger.info("Starting document retrieval for training set...")
    document_retrieval_main(Config.db_path, Config.document_k_wiki, Config.raw_training_set, Config.training_doc_file,
                            Config.document_add_claim, Config.document_parallel)
    logger.info("Finished document retrieval for training set.")
    logger.info("Starting document retrieval for dev set...")
    document_retrieval_main(Config.db_path, Config.document_k_wiki, Config.raw_dev_set, Config.dev_doc_file,
                            Config.document_add_claim, Config.document_parallel)
    logger.info("Finished document retrieval for dev set.")
    logger.info("Starting document retrieval for test set...")
    document_retrieval_main(Config.db_path, Config.document_k_wiki, Config.raw_test_set, Config.test_doc_file,
                            Config.document_add_claim, Config.document_parallel)
    """
    document_retrieval_main(args.database, 7, args.infile, args.outfile, args.path_wiki_titles, True, True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--infile')
    parser.add_argument('--outfile')
    parser.add_argument('--database')
    parser.add_argument('--path_wiki_titles')
    args = parser.parse_args()
    
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0])
    logger.info("=========================== Subtask 1. Document Retrieval ==========================================")
    print(args.database, args.infile, args.outfile)
    document_retrieval(logger)

Example #13
def main(mode: RTERunPhase, config=None, estimator=None):
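    # Train or evaluate the multi-task ESIM model that scores claim labels and
    # evidence labels jointly (see the 'claim'/'evidence' entries of X_dict below).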
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM_MTL arguments: " + str(Config.esim_mtl_hyper_param))
    if mode == RTERunPhase.train:
        # # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                X_dict, y_dict = pickle.load(f)
        else:
            training_set_claim_valid, training_set_evidence_eval, vocab, embeddings, _, _ = embed_data_set_with_evidence_label(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = training_set_claim_valid['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set_claim_valid['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            training_set_claim_valid['data']['h_sizes'] = h_sizes
            training_set_claim_valid['data']['h_np'] = np.expand_dims(training_set_claim_valid['data']['h_np'], 1)

            h_sent_sizes = training_set_evidence_eval['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set_evidence_eval['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            training_set_evidence_eval['data']['h_sizes'] = h_sizes
            training_set_evidence_eval['data']['h_np'] = np.expand_dims(training_set_evidence_eval['data']['h_np'], 1)

            valid_set_claim_valid, valid_set_evidence_eval, _, _, _, _ = embed_data_set_with_evidence_label(
                Config.dev_set_file, Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = valid_set_claim_valid['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set_claim_valid['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set_claim_valid['data']['h_sizes'] = h_sizes
            valid_set_claim_valid['data']['h_np'] = np.expand_dims(valid_set_claim_valid['data']['h_np'], 1)

            h_sent_sizes = valid_set_evidence_eval['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set_evidence_eval['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set_evidence_eval['data']['h_sizes'] = h_sizes
            valid_set_evidence_eval['data']['h_np'] = np.expand_dims(valid_set_evidence_eval['data']['h_np'], 1)

            X_dict = {'claim': {'train': training_set_claim_valid['data'], 'valid': valid_set_claim_valid['data'], },
                      'evidence': {'train': training_set_evidence_eval['data'],
                                   'valid': valid_set_evidence_eval['data']},
                      'embedding': embeddings
                      }
            y_dict = {
                'claim': {'train': training_set_claim_valid['label'], 'valid': valid_set_claim_valid['label']},
                'evidence': {'train': training_set_evidence_eval['label'], 'valid': valid_set_evidence_eval['label']}
            }
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_dict), f, protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_dict)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _, _ = embed_data_set_with_evidence_label(Config.test_set_file, Config.db_path,
                                                                     vocab_dict=vocab,
                                                                     glove_embeddings=embeddings,
                                                                     threshold_b_sent_num=Config.max_sentences,
                                                                     threshold_b_sent_size=Config.max_sentence_size,
                                                                     threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        x_dict = {
            'X_test': test_set['data'],
            'embedding': embeddings
        }
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file, Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
Example #14
def main(mode: RTERunPhase, config=None, estimator=None):
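    # Train or evaluate the RTE model on sentence embeddings obtained from a
    # bert-as-service endpoint (Config.bert_port / Config.bert_port_out).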
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("BERT sentence embedding arguments: " + str(Config.bert_sent_hyper_parameter))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid, Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_data_set_from_jsonl(Config.training_set_file,
                                                               Config.db_path,
                                                               num_sentences=Config.max_sentences,
                                                               is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(Config.dev_set_file,
                                                               Config.db_path,
                                                               num_sentences=Config.max_sentences,
                                                               is_snopes=is_snopes)
            X_train['b_sizes'] = get_num_sents_of_bodies(X_train['b'])
            X_valid['b_sizes'] = get_num_sents_of_bodies(X_valid['b'])
            b_train = X_train['b']
            b_encoded_train = encode_multi_sentence_set_with_bert(b_train, Config.max_sentences, port=Config.bert_port,
                                                                  port_out=Config.bert_port_out)
            X_train['b'] = b_encoded_train
            logger.debug("b_encoded_train.shape: " + str(b_encoded_train.shape))
            h_train = X_train['h']
            h_encoded_train = encode_single_sentence_set_with_bert(h_train, port=Config.bert_port,
                                                                   port_out=Config.bert_port_out)
            X_train['h'] = h_encoded_train
            logger.debug("h_encoded_train.shape: " + str(h_encoded_train.shape))
            b_valid = X_valid['b']
            b_encoded_valid = encode_multi_sentence_set_with_bert(b_valid, Config.max_sentences, port=Config.bert_port,
                                                                  port_out=Config.bert_port_out)
            X_valid['b'] = b_encoded_valid
            logger.debug("b_encoded_valid.shape: " + str(b_encoded_valid.shape))
            h_valid = X_valid['h']
            h_encoded_valid = encode_single_sentence_set_with_bert(h_valid, port=Config.bert_port,
                                                                   port_out=Config.bert_port_out)
            X_valid['h'] = h_encoded_valid
            logger.debug("h_encoded_valid.shape: " + str(h_encoded_valid.shape))
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_train, Y_labels_train, X_valid, Y_labels_valid), f, protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(Config.test_set_file,
                                                         Config.db_path,
                                                         num_sentences=Config.max_sentences,
                                                         is_snopes=is_snopes)
        X_test['b_sizes'] = get_num_sents_of_bodies(X_test['b'])
        b_test = X_test['b']
        b_encoded_test = encode_multi_sentence_set_with_bert(b_test, Config.max_sentences, port=Config.bert_port,
                                                             port_out=Config.bert_port_out)
        X_test['b'] = b_encoded_test
        logger.debug("b_encoded_test.shape: " + str(b_encoded_test.shape))
        h_test = X_test['h']
        h_encoded_test = encode_single_sentence_set_with_bert(h_test, port=Config.bert_port,
                                                              port_out=Config.bert_port_out)
        X_test['h'] = h_encoded_test
        logger.debug("h_encoded_test.shape: " + str(h_encoded_test.shape))
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file, Config.submission_file)
        if Y_labels_test is not None:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
Example #15
import argparse
import json

from tqdm import tqdm

from common.dataset.reader import JSONLineReader
from common.util.log_helper import LogHelper

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='/path/to/input/file')
    parser.add_argument('output', help='/path/to/output/file')
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("separate_scores")
    jlr = JSONLineReader()
    lines = jlr.read(args.input)
    with open(args.output, 'w') as f:
        for obj in tqdm(lines):
            predicted_evidence = obj['predicted_evidence']
            new_predicted_evidence = []
            scores = []
            for evidence in predicted_evidence:
                new_predicted_evidence.append(evidence[0])
                scores.append(evidence[1])
            obj['predicted_evidence'] = new_predicted_evidence
            obj['scores'] = scores
            f.write(json.dumps(obj) + '\n')
Example #16
def main(mode: RTERunPhase, config=None, estimator=None):
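    # Train or evaluate the ESIM model on claims and evidence embedded with BERT
    # (embed_data_set_with_bert) rather than GloVe/FastText vectors.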
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        training_set = embed_data_set_with_bert(Config.training_set_file, Config.db_path,
                                                threshold_b_sent_num=Config.max_sentences,
                                                threshold_b_sent_size=Config.max_sentence_size,
                                                is_snopes=is_snopes,
                                                port=Config.bert_port,
                                                port_out=Config.bert_port_out)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_bert_np'] = np.expand_dims(training_set['data']['h_bert_np'], 1)
        valid_set = embed_data_set_with_bert(Config.dev_set_file, Config.db_path,
                                             threshold_b_sent_num=Config.max_sentences,
                                             threshold_b_sent_size=Config.max_sentence_size,
                                             is_snopes=is_snopes,
                                             port=Config.bert_port,
                                             port_out=Config.bert_port_out)
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_bert_np'] = np.expand_dims(valid_set['data']['h_bert_np'], 1)

        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label']
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        test_set = embed_data_set_with_bert(Config.test_set_file, Config.db_path,
                                            threshold_b_sent_num=Config.max_sentences,
                                            threshold_b_sent_size=Config.max_sentence_size,
                                            is_snopes=is_snopes,
                                            port=Config.bert_port,
                                            port_out=Config.bert_port_out)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_bert_np'] = np.expand_dims(test_set['data']['h_bert_np'], 1)
        x_dict = {
            'X_test': test_set['data']
        }
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file, Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
Example #17
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model


if __name__ == "__main__":
    LogHelper.setup()
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)

    parser = argparse.ArgumentParser()
    parser.add_argument('db', type=str, help='/path/to/saved/db.db')
    parser.add_argument(
        'param_path',
        type=str,
        help='path to parameter file describing the model to be trained')

    parser.add_argument("logdir", type=str)

    parser.add_argument("--filtering", type=str, default=None)
    parser.add_argument("--cuda-device",
                        type=int,
                        default=None,
Example #18
def main(mode: RTERunPhase, config=None, estimator=None):
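    # Train or evaluate the claim-only ESIM variant: only the claim text is
    # embedded (GloVe IDs plus FastText vectors); no evidence bodies are used.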
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    # loading FastText takes long time, so better pickle the loaded FastText model
    if os.path.splitext(Config.fasttext_path)[1] == '.p':
        with open(Config.fasttext_path, "rb") as ft_file:
            fasttext_model = pickle.load(ft_file)
    else:
        fasttext_model = Config.fasttext_path
    if mode == RTERunPhase.train:
        # # training mode
        training_set, fasttext_model, vocab, embeddings = embed_data_set_with_glove_and_fasttext_claim_only(
            Config.training_set_file,
            fasttext_model,
            glove_path=Config.glove_path,
            threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_np'] = np.expand_dims(
            training_set['data']['h_np'], 1)
        training_set['data']['h_ft_np'] = np.expand_dims(
            training_set['data']['h_ft_np'], 1)

        valid_set, _, _, _ = embed_data_set_with_glove_and_fasttext_claim_only(
            Config.dev_set_file,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_np'] = np.expand_dims(valid_set['data']['h_np'],
                                                   1)
        valid_set['data']['h_ft_np'] = np.expand_dims(
            valid_set['data']['h_ft_np'], 1)

        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label'],
            'embedding': embeddings
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _ = embed_data_set_with_glove_and_fasttext_claim_only(
            Config.test_set_file,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['h_ft_np'] = np.expand_dims(
            test_set['data']['h_ft_np'], 1)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
Example #19
def main(mode, config, estimator=None):
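    # Train or evaluate the ESIM model with combined GloVe and FastText
    # embeddings for both claims and evidence.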
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + mode)
    logger.info("model: " + mode + ", config: " + str(config))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    # loading FastText takes a long time, so better pickle the loaded FastText model
    if os.path.splitext(Config.fasttext_path)[1] == '.p':
        with open(Config.fasttext_path, "rb") as ft_file:
            fasttext_model = pickle.load(ft_file)
    else:
        fasttext_model = Config.fasttext_path
    if mode == 'train':
        # # training mode
        training_set, fasttext_model, vocab, embeddings, _, _ = embed_data_set_with_glove_and_fasttext(
            Config.training_set_file,
            Config.db_path,
            fasttext_model,
            glove_path=Config.glove_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_np'] = np.expand_dims(
            training_set['data']['h_np'], 1)
        training_set['data']['h_ft_np'] = np.expand_dims(
            training_set['data']['h_ft_np'], 1)

        valid_set, _, _, _, _, _ = embed_data_set_with_glove_and_fasttext(
            Config.dev_set_file,
            Config.db_path,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_np'] = np.expand_dims(valid_set['data']['h_np'],
                                                   1)
        valid_set['data']['h_ft_np'] = np.expand_dims(
            valid_set['data']['h_ft_np'], 1)

        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label'],
            'embedding': embeddings
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    elif mode == 'test':
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _, _ = embed_data_set_with_glove_and_fasttext(
            Config.test_set_file,
            Config.db_path,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['h_ft_np'] = np.expand_dims(
            test_set['data']['h_ft_np'], 1)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    else:
        logger.error("Invalid argument --mode: " + mode +
                     " Argument --mode should be either 'train’ or ’test’")
Example #20
def fever_app(caller):
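    # Build the full FEVER pipeline (DB, document/sentence retrieval, BPE text
    # encoder, Transformer classifier) and hand the predict() callback to caller.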


    global db, tokenizer, text_encoder, encoder, X_train, M_train, X, M, Y_train, Y,params,sess, n_batch_train, db_file, \
        drqa_index, max_page, max_sent, encoder_path, bpe_path, n_ctx, n_batch, model_file
    global n_vocab,n_special,n_y,max_len,clf_token,eval_lm_losses,eval_clf_losses,eval_mgpu_clf_losses,eval_logits, \
        eval_mgpu_logits,eval_logits

    LogHelper.setup()
    logger = LogHelper.get_logger("papelo")

    logger.info("Load config")
    config = json.load(open(os.getenv("CONFIG_FILE","configs/config-docker.json")))
    globals().update(config)
    print(globals())

    logger.info("Set Seeds")
    random.seed(42)
    np.random.seed(42)
    tf.set_random_seed(42)

    logger.info("Load FEVER DB")
    db = FeverDocDB(db_file)
    retrieval = TopNDocsTopNSents(db, max_page, max_sent, True, False, drqa_index)

    logger.info("Init word tokenizer")
    tokenizer = SimpleWordSplitter()

    # Prepare text encoder
    logger.info("Load BPE Text Encoder")
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    n_y = 3
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2

    n_batch_train = n_batch

    logger.info("Create TF Placeholders")
    X_train = tf.placeholder(tf.int32, [n_batch, 1, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [n_batch, 1, n_ctx])
    X = tf.placeholder(tf.int32, [None, 1, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 1, n_ctx])

    Y_train = tf.placeholder(tf.int32, [n_batch])
    Y = tf.placeholder(tf.int32, [None])

    logger.info("Model Setup")
    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=None)
    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)

    logger.info("Create TF Session")
    params = find_trainable_variables('model')

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=float(os.getenv("TF_GPU_MEMORY_FRACTION","0.5")))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options))
    sess.run(tf.global_variables_initializer())
    sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(model_file))])

    logger.info("Ready")

    def predict(instances):
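        # For each claim: retrieve candidate sentences, classify every
        # (claim, sentence) pair, then aggregate the pair labels into a verdict
        # plus the supporting or refuting evidence list.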
        predictions = []

        for instance in tqdm(instances):
            sents = retrieval.get_sentences_for_claim(instance["claim"])
            found_evidence = resolve_evidence(sents)
            instance["tokenized_claim"] = " ".join(map(lambda x: x.text, tokenizer.split_words(instance["claim"])))

            sub_instances = make_instances(instance, found_evidence)
            sub_predictions = predict_sub_instances(text_encoder, sub_instances)

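            # Sub-instance class indices are mapped below: 0 -> SUPPORTS and
            # 2 -> REFUTES; any other class contributes no evidence and the claim
            # falls through to NOT ENOUGH INFO.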
            refute_evidence = [i for i, x in enumerate(sub_predictions) if x == 2]
            support_evidence = [i for i, x in enumerate(sub_predictions) if x == 0]

            if len(support_evidence):
                predicted_label = "SUPPORTS"
                predicted_evidence = [[found_evidence[i]["title"], found_evidence[i]["line_number"]] for i in support_evidence]
            elif len(refute_evidence):
                predicted_label = "REFUTES"
                predicted_evidence = [[found_evidence[i]["title"], found_evidence[i]["line_number"]] for i in refute_evidence]
            else:
                predicted_label = "NOT ENOUGH INFO"
                predicted_evidence = []

            predictions.append({"predicted_label":predicted_label,
                                "predicted_evidence": predicted_evidence})

        return predictions

    return caller(predict)
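
# A minimal sketch of how fever_app might be driven (not part of the original
# example). It assumes the caller receives the prediction function and feeds it
# a list of {"claim": ...} dicts; run_local and the claim text are illustrative.
def run_local(predict_fn):
    instances = [{"claim": "Example claim to verify."}]
    return predict_fn(instances)

# predictions = fever_app(run_local)  # requires CONFIG_FILE, model weights and the FEVER DB on disk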
Ejemplo n.º 21
0
 def __init__(self, file):
     self.pages = []
     self.file = file
     self.logger = LogHelper.get_logger(__name__)
     self.logger.info("Indexing Pages")
Ejemplo n.º 22
0
class TopNDocsTopNSents(RetrievalMethod):
    class RankArgs:
        def __init__(self):
            self.ngram = 2
            self.hash_size = int(math.pow(2, 24))
            self.tokenizer = "simple"
            self.num_workers = None

    def __init__(self, db, n_docs, n_sents, model):
        super().__init__(db)
        self.n_docs = n_docs
        self.n_sents = n_sents
        self.ranker = retriever.get_class('tfidf')(tfidf_path=model)
        self.onlineranker_args = self.RankArgs()

    def get_docs_for_claim(self, claim_text):
        doc_names, doc_scores = self.ranker.closest_docs(
            claim_text, self.n_docs)
        return zip(doc_names, doc_scores)

    def tf_idf_sim(self, claim, lines, freqs=None):
        tfidf = OnlineTfidfDocRanker(self.onlineranker_args,
                                     [line["sentence"] for line in lines],
                                     freqs)
        line_ids, scores = tfidf.closest_docs(claim, self.n_sents)
        ret_lines = []
        for idx, line in enumerate(line_ids):
            ret_lines.append(lines[line])
            ret_lines[-1]["score"] = scores[idx]
        return ret_lines

    LogHelper.setup()
    logger = LogHelper.get_logger(__name__)

    def get_sentences_given_claim(self, page, logger, line_no):
        lines = self.db.get_doc_lines(page)
        lines = [
            line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
            for line in lines.split("\n")
        ]
        sent = lines[line_no]
        return sent

    def get_sentences_for_claim(self, claim_text, include_text=False):
        #given a claim get a bunch of documents that might be relevant for it
        pages = self.get_docs_for_claim(claim_text)
        sorted_p = list(sorted(pages, reverse=True, key=lambda elem: elem[1]))
        pages = [p[0] for p in sorted_p[:self.n_docs]]
        p_lines = []
        for page in pages:
            self.logger.info("page: " + page)
            # query the db and get the list of sentences in the given wikipedia page
            lines = self.db.get_doc_lines(page)
            lines = [
                line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
                for line in lines.split("\n")
            ]

            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        lines = []
        for p_line in p_lines:
            lines.append({
                "sentence": p_line[0],
                "page": p_line[1],
                "line_on_page": p_line[2]
            })

        scores = self.tf_idf_sim(claim_text, lines)

        if include_text:
            return scores

        return [(s["page"], s["line_on_page"]) for s in scores]
Ejemplo n.º 23
0
def read_data_set_from_lines(lines: List,
                             db: Union[str, FeverDocDB],
                             predicted: bool = True,
                             num_sentences=None,
                             is_snopes=False):
    logger = LogHelper.get_logger("read_data_set_from_jsonl")
    if not is_snopes:
        if type(db) is str:
            db = FeverDocDB(db)
    else:
        with open(db) as f:
            db = json.load(f)

    claims = []
    evidences = []
    paths = []
    labels = []

    for line in tqdm(lines):
        json_obj = line
        if predicted:
            evidences_texts = []
            if 'predicted_evidence' in json_obj:
                _evidences = json_obj['predicted_evidence']
            elif 'predicted_sentences' in json_obj:
                _evidences = json_obj['predicted_sentences']
            else:
                _evidences = []
            if len(_evidences) > 0:
                for sent in _evidences:
                    page, line_num = sent[-2], sent[-1]
                    page_title = page.replace("_", " ")
                    evidences_texts.append(
                        # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                        clean_text(
                            evidence_num_to_text(db, page, line_num,
                                                 is_snopes)))
        else:
            evidences_texts = set()
            _evidences = json_obj['evidence']
            for evidence in _evidences:
                for sent in evidence:
                    page, line_num = sent[-2], sent[-1]
                    page_title = page.replace("_", " ")
                    evidences_texts.add(
                        # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                        clean_text(
                            evidence_num_to_text(db, page, line_num,
                                                 is_snopes)))
            evidences_texts = list(evidences_texts)

        if len(evidences_texts) == 0:
            evidences_texts = [""]

        if num_sentences is not None:
            if len(evidences_texts) > num_sentences:
                evidences_texts = evidences_texts[:num_sentences]
        claims.append(clean_text(json_obj['claim']))
        if 'label' in json_obj:
            labels.append(label_dict.index(json_obj['label']))
        evidences.append(evidences_texts)
        if 'paths' in json_obj:
            paths_from_sent_to_claim = [
                1.0 if p else 0.0 for p in json_obj['paths']
            ]
            if num_sentences is not None and num_sentences > len(
                    paths_from_sent_to_claim):
                paths_from_sent_to_claim += [0.0] * (
                    num_sentences - len(paths_from_sent_to_claim))
            paths.append(paths_from_sent_to_claim)

    datas = {'h': claims, 'b': evidences}
    if paths:
        datas['paths'] = paths
    return datas, labels
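
# A minimal usage sketch (not part of the original example). It assumes a
# JSONLineReader as used elsewhere in this repository and placeholder file paths.
# jlr = JSONLineReader()
# lines = jlr.read("data/fever/dev.predicted.jsonl")
# data, labels = read_data_set_from_lines(lines, "data/fever/fever.db", predicted=True, num_sentences=5)
# # data['h'] holds cleaned claims; data['b'] holds parallel lists of evidence sentences.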
Ejemplo n.º 24
0
#!/usr/bin/env python3

#Adapted from https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/build_db.py
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""A script to build the tf-idf document matrices for retrieval."""
import os
from drqascripts.retriever.build_tfidf import *
from common.util.log_helper import LogHelper

if __name__ == '__main__':
    LogHelper.setup()
    logger = LogHelper.get_logger("DrQA Build TFIDF")
    LogHelper.get_logger("DRQA")

    logger.info("Build TF-IDF matrix")

    parser = argparse.ArgumentParser()
    parser.add_argument('db_path',
                        type=str,
                        default=None,
                        help='Path to sqlite db holding document texts')
    parser.add_argument('out_dir',
                        type=str,
                        default=None,
                        help='Directory for saving output files')
    parser.add_argument('--ngram',
                        type=int,
import argparse
from functools import partial
from utils.data_reader import read_data_set_from_jsonl
from common.util.log_helper import LogHelper
from deep_models.USE_Attention_finetune import USEAttention, weight_matrix_2


def max_len_body(x):
    lengths = [len(sents) for sents in x['b']]
    print(max(lengths), min(lengths), sum(lengths) / len(lengths))
    return max(lengths)
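
# Illustrative call (not from the original script):
# max_len_body({'b': [['s1', 's2', 's3'], ['s4']]}) prints "3 1 2.0" and returns 3.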


if __name__ == '__main__':
    LogHelper.setup()
    logger = LogHelper.get_logger('train')
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--train', help='/path/to/training/set', required=True)
    parser.add_argument(
        '--valid', help='/path/to/validation/set', required=True)
    parser.add_argument('--save', help='/path/to/save/model')
    parser.add_argument('--db', help='/path/to/data/base', required=True)
    parser.add_argument(
        '--max-sent', type=int, help='Maximal number of sentences per claim', default=5)
    args = parser.parse_args()
    num_sentences = args.max_sent
    X_train, Y_labels_train = read_data_set_from_jsonl(
        args.train, args.db, num_sentences=num_sentences)
    X_valid, Y_labels_valid = read_data_set_from_jsonl(
        args.valid, args.db, num_sentences=num_sentences)
 parser.add_argument('--save-data',
                     help='/path/to/save/data',
                     default="data/rte/train/")
 parser.add_argument('--load-data', help='/path/to/load/data/file')
 parser.add_argument('--db', help='/path/to/data/base', required=True)
 parser.add_argument('--max-sent',
                     type=int,
                     help='Maximal number of sentences per claim',
                     default=5)
 parser.add_argument('--embed', help='/path/to/embedding')
 parser.add_argument('--save-result',
                     help='/path/to/save/result',
                     default="data/rte/result/")
 args = parser.parse_args()
 LogHelper.setup()
 logger = LogHelper.get_logger(args.mode)
 if args.mode == 'train':
     assert args.train is not None or args.load_data is not None, "--train training set or --load-data should be provided in train mode"
     assert args.embed is not None, "--embed should be provided in train mode"
     # training mode
     if args.load_data:
         # load pre-processed training data
         with open(args.load_data, "rb") as file:
             param = pickle.load(file)
     else:
         # process training JSONL file
         paths = [args.train, args.valid]
         dataset_list, vocab, embeddings, b_max_sent_num, b_max_sent_size = embed_data_sets_with_glove(
             paths, args.db, args.embed, threshold_b_sent_num=args.max_sent)
         vocab = vocab_map(vocab)
         param = {