Esempio n. 1
0
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an RTE estimator on data read by ``read_data_set_from_jsonl``.

    In ``RTERunPhase.train`` mode the training and dev sets are either
    restored from ``Config.training_dump`` (when that attribute is set and the
    file exists) or read from the JSONL files and padded; the estimator is
    then fitted and stored with ``save_model``.  In every other mode the test
    set is read and padded, predictions are written with
    ``generate_submission`` and, when test labels are present, metrics are
    reported through ``print_metrics``.

    Args:
        mode: Run phase.  ``RTERunPhase.train`` selects training; any other
            value selects testing.
        config: Optional configuration; only used when it is a ``str``, in
            which case it is handed to ``Config.load_config``.
        estimator: Optional estimator to reuse.  When ``None``, one is created
            with ``get_estimator`` (training) or loaded with ``load_model``,
            falling back to ``get_estimator`` (testing).

    Returns:
        The estimator that was fitted or used for prediction.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    is_snopes = getattr(Config, 'is_snopes', False)
    logger.debug("is_snopes: " + str(is_snopes))

    def read_split(set_file):
        """Read one JSONL split with the configured sentence limit."""
        return read_data_set_from_jsonl(set_file,
                                        Config.db_path,
                                        num_sentences=Config.max_sentences,
                                        is_snopes=is_snopes)

    def pad_bodies(data, log_name):
        """Store sentence counts in ``data['b_sizes']`` and pad ``data['b']``.

        Every body shorter than ``Config.max_sentences`` is filled up with
        single-space sentences, then the bodies are converted to arrays.
        """
        bodies = data['b']
        # Sizes must be taken before padding changes the sentence counts.
        data['b_sizes'] = get_num_sents_of_bodies(bodies)
        max_sentences = Config.max_sentences
        for idx, sentences in enumerate(bodies):
            missing = max_sentences - len(sentences)
            if missing > 0:
                sentences.extend([" "] * missing)
            bodies[idx] = np.asarray(sentences)
        bodies = np.asarray(bodies)
        data['b'] = bodies
        logger.debug(log_name + ".shape: " + str(bodies.shape))

    def select_gpu_if_unset():
        """Pin the first available GPU unless CUDA_VISIBLE_DEVICES is set."""
        if not str(os.environ.get('CUDA_VISIBLE_DEVICES', '')).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])

    if mode == RTERunPhase.train:
        # training mode
        has_dump = hasattr(Config, 'training_dump')
        if has_dump and os.path.exists(Config.training_dump):
            # NOTE(review): pickle.load can execute arbitrary code; the dump
            # file must come from a trusted source.
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid,
                 Y_labels_valid) = pickle.load(f)
        else:
            # process training and dev JSONL files
            X_train, Y_labels_train = read_split(Config.training_set_file)
            X_valid, Y_labels_valid = read_split(Config.dev_set_file)
            pad_bodies(X_train, "b_train")
            pad_bodies(X_valid, "b_valid")
            if has_dump:
                # cache the processed data for later runs
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(
                        (X_train, Y_labels_train, X_valid, Y_labels_valid),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        select_gpu_if_unset()
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        # Parameters only need restoring when no estimator was handed in.
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        X_test, Y_labels_test = read_split(Config.test_set_file)
        pad_bodies(X_test, "b_test")
        select_gpu_if_unset()
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file,
                            Config.submission_file)
        # Explicit emptiness test: plain truthiness is ambiguous for NumPy
        # arrays and would treat a single label 0 as "no labels".
        if Y_labels_test is not None and len(Y_labels_test) > 0:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an ESIM end-to-end estimator.

    Data sets are prepared with ``embed_data_set_with_glove_2`` and, depending
    on the configuration flags and the estimator name, extended with extra
    features, numeric features, inter-evidence indices and claim-evidence
    indices.  In ``RTERunPhase.train`` mode the prepared data is cached in
    ``Config.training_dump`` (when configured), the estimator is fitted and
    saved.  In every other mode the test set is prepared, predictions are
    written with ``generate_submission`` and metrics are reported when the
    test set carries a ``'label'`` entry.

    Args:
        mode: Run phase.  ``RTERunPhase.train`` selects training; any other
            value selects testing.
        config: Optional configuration; only used when it is a ``str``, in
            which case it is handed to ``Config.load_config``.
        estimator: Optional estimator to reuse.  When ``None``, one is created
            with ``get_estimator`` (training) or loaded with ``load_model``,
            falling back to ``get_estimator`` (testing).

    Returns:
        The estimator that was fitted or used for prediction.

    Raises:
        AssertionError: If extra features are enabled but ``Config`` has no
            ``feature_path`` attribute.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)

    # 'esim_inter_evidence' model and 'esim_inter_evidence_claim_evidences_comparison' models need inter evidence inputs
    use_inter_evidence_comparison = getattr(
        Config, 'use_inter_evidence_comparison',
        False) or Config.estimator_name in {
            'esim_inter_evidence',
            'esim_inter_evidence_claim_evidences_comparison'
        }
    # 'esim_inter_evidence_claim_evidences_comparison' model needs claim-evidence inputs
    use_claim_evidences_comparison = getattr(
        Config, 'use_claim_evidences_comparison',
        False) or Config.estimator_name in {
            'esim_inter_evidence_claim_evidences_comparison'
        }
    use_extra_features = getattr(Config, 'use_extra_features', False)
    # 'esim_num_feature' model needs numeric feature inputs
    use_numeric_feature = getattr(
        Config, 'use_numeric_feature',
        False) or Config.estimator_name in {'esim_num_feature'}
    is_snopes = getattr(Config, 'is_snopes', False)

    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_end_2_end_hyper_param))
    logger.info("use_inter_sentence_comparison: " +
                str(use_inter_evidence_comparison))
    logger.info("use_extra_features: " + str(use_extra_features))
    logger.info("use_numeric_feature: " + str(use_numeric_feature))
    logger.info("use_claim_evidences_comparison: " +
                str(use_claim_evidences_comparison))

    def embed(set_file, **vocab_source):
        """Embed one split using the configured size thresholds."""
        return embed_data_set_with_glove_2(
            set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes,
            **vocab_source)

    def add_claim_axis(data):
        """Insert axis 1 into the claim arrays and add all-ones ``h_sizes``."""
        sent_sizes = data['h_sent_sizes']
        data['h_sizes'] = np.ones(len(sent_sizes), np.int32)
        data['h_sent_sizes'] = np.expand_dims(sent_sizes, 1)
        data['h_np'] = np.expand_dims(data['h_np'], 1)

    def require_feature_path():
        """Fail with a clear message when extra features lack a path.

        Raised explicitly instead of using ``assert`` so the check also runs
        when Python is started with ``-O``.
        """
        if not hasattr(Config, 'feature_path'):
            raise AssertionError(
                "Config must define feature_path when use_extra_features "
                "is enabled")

    def add_extra_features(data, set_file):
        """Attach claim and evidence features loaded for ``set_file``."""
        claim_feats, evidence_feats = load_feature_by_data_set(
            set_file, Config.feature_path, Config.max_sentences)
        data['h_feats'] = claim_feats
        data['b_feats'] = evidence_feats

    def add_numeric_feature(data, set_file):
        """Attach the numeric feature computed for ``set_file``."""
        data['num_feat'] = number_feature(set_file, Config.db_path,
                                          Config.max_sentences, is_snopes)

    def add_inter_evidence_indices(data):
        """Attach the concatenation indices used to compare evidences."""
        indices, sizes = generate_concat_indices_for_inter_evidence(
            data['b_np'], data['b_sent_sizes'], Config.max_sentence_size,
            Config.max_sentences)
        data['b_concat_indices'] = indices
        data['b_concat_sizes'] = sizes

    def add_claim_evidence_indices(data):
        """Attach the concatenation indices used for claim comparison."""
        indices, sizes = generate_concat_indices_for_claim(
            data['b_np'], data['b_sent_sizes'], Config.max_sentence_size,
            Config.max_sentences)
        data['b_concat_indices_for_h'] = indices
        data['b_concat_sizes_for_h'] = sizes

    def select_gpu_if_unset():
        """Pin the first available GPU unless CUDA_VISIBLE_DEVICES is set."""
        if not str(os.environ.get('CUDA_VISIBLE_DEVICES', '')).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])

    if mode == RTERunPhase.train:
        # training mode
        has_dump = hasattr(Config, 'training_dump')
        if has_dump and os.path.exists(Config.training_dump):
            # NOTE(review): pickle.load can execute arbitrary code; the dump
            # file must come from a trusted source.  The cached data is used
            # as-is, whatever the feature flags above say.
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed(
                Config.training_set_file, glove_path=Config.glove_path)
            train_data = training_set['data']
            add_claim_axis(train_data)

            # The dev set reuses the vocabulary and embeddings of the
            # training set.
            valid_set, _, _, _, _ = embed(Config.dev_set_file,
                                          vocab_dict=vocab,
                                          glove_embeddings=embeddings)
            valid_data = valid_set['data']
            add_claim_axis(valid_data)

            if use_extra_features:
                require_feature_path()
                add_extra_features(train_data, Config.training_set_file)
                add_extra_features(valid_data, Config.dev_set_file)
            if use_numeric_feature:
                add_numeric_feature(train_data, Config.training_set_file)
                add_numeric_feature(valid_data, Config.dev_set_file)
            if use_inter_evidence_comparison:
                add_inter_evidence_indices(train_data)
                add_inter_evidence_indices(valid_data)
            if use_claim_evidences_comparison:
                add_claim_evidence_indices(train_data)
                add_claim_evidence_indices(valid_data)

            X_dict = {
                'X_train': train_data,
                'X_valid': valid_data,
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if has_dump:
                # cache the processed data for later runs
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        select_gpu_if_unset()
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        # Parameters only need restoring when no estimator was handed in.
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed(Config.test_set_file,
                                     vocab_dict=vocab,
                                     glove_embeddings=embeddings)
        test_data = test_set['data']
        add_claim_axis(test_data)
        if use_extra_features:
            require_feature_path()
            add_extra_features(test_data, Config.test_set_file)
        if use_numeric_feature:
            add_numeric_feature(test_data, Config.test_set_file)
        # x_dict holds a reference to test_data, so the index entries added
        # below are visible through it as well.
        x_dict = {'X_test': test_data, 'embedding': embeddings}
        if use_inter_evidence_comparison:
            add_inter_evidence_indices(test_data)
        if use_claim_evidences_comparison:
            add_claim_evidence_indices(test_data)
        select_gpu_if_unset()
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an ESIM estimator that also receives sentence scores.

    Data sets are prepared with ``embed_data_set_with_glove_2`` and extended
    with the output of ``load_scores``.  In ``RTERunPhase.train`` mode the
    prepared data is restored from or cached in ``Config.training_dump``
    (when configured), the estimator is fitted and saved.  In every other
    mode the test set is prepared, predictions are written with
    ``generate_submission`` and metrics are reported when the test set
    carries a ``'label'`` entry.

    Args:
        mode: Run phase.  ``RTERunPhase.train`` selects training; any other
            value selects testing.
        config: Optional configuration; only used when it is a ``str``, in
            which case it is handed to ``Config.load_config``.
        estimator: Optional estimator to reuse.  When ``None``, one is created
            with ``get_estimator`` (training) or loaded with ``load_model``,
            falling back to ``get_estimator`` (testing).

    Returns:
        The estimator that was fitted or used for prediction.
    """
    LogHelper.setup()
    script_name = os.path.splitext(os.path.basename(__file__))[0]
    logger = LogHelper.get_logger(script_name + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    logger.info("this script is only for FEVER dataset")

    def embed(set_file, **vocab_source):
        """Embed one split using the configured size thresholds."""
        return embed_data_set_with_glove_2(
            set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            **vocab_source)

    def finish_split(data, set_file):
        """Insert axis 1 into the claim arrays and attach the scores."""
        sent_sizes = data['h_sent_sizes']
        claim_counts = np.ones(len(sent_sizes), np.int32)
        data['h_sent_sizes'] = np.expand_dims(sent_sizes, 1)
        data['h_sizes'] = claim_counts
        data['h_np'] = np.expand_dims(data['h_np'], 1)
        data['scores'] = load_scores(set_file, Config.max_sentences)

    def select_gpu_if_unset():
        """Pin the first available GPU unless CUDA_VISIBLE_DEVICES is set."""
        if not str(os.environ.get('CUDA_VISIBLE_DEVICES', '')).strip():
            first_free = GPUtil.getFirstAvailable(
                maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0]
            os.environ['CUDA_VISIBLE_DEVICES'] = str(first_free)

    if mode != RTERunPhase.train:
        # testing mode
        # Parameters only need restoring when no estimator was handed in.
        restore_param_required = estimator is None
        if restore_param_required:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed(Config.test_set_file,
                                     vocab_dict=vocab,
                                     glove_embeddings=embeddings)
        finish_split(test_set['data'], Config.test_set_file)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        select_gpu_if_unset()
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
        return estimator

    # training mode
    dump_configured = hasattr(Config, 'training_dump')
    if dump_configured and os.path.exists(Config.training_dump):
        # reuse the previously processed data
        with open(Config.training_dump, 'rb') as dump_file:
            (X_dict, y_train) = pickle.load(dump_file)
    else:
        training_set, vocab, embeddings, _, _ = embed(
            Config.training_set_file, glove_path=Config.glove_path)
        finish_split(training_set['data'], Config.training_set_file)

        # The dev set reuses the vocabulary and embeddings of the training
        # set.
        valid_set, _, _, _, _ = embed(Config.dev_set_file,
                                      vocab_dict=vocab,
                                      glove_embeddings=embeddings)
        finish_split(valid_set['data'], Config.dev_set_file)

        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label'],
            'embedding': embeddings
        }
        y_train = training_set['label']
        if dump_configured:
            # cache the processed data for later runs
            with open(Config.training_dump, 'wb') as dump_file:
                pickle.dump((X_dict, y_train),
                            dump_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
    if estimator is None:
        estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
    select_gpu_if_unset()
    estimator.fit(X_dict, y_train)
    save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an ESIM estimator on data from ``embed_data_set_for_elmo``.

    In ``RTERunPhase.train`` mode the training and dev sets are restored from
    or cached in ``Config.training_dump`` (when configured), the estimator is
    fitted and saved.  In every other mode the test set is prepared,
    predictions are written with ``generate_submission`` and metrics are
    reported when the test set carries a ``'label'`` entry.

    Args:
        mode: Run phase.  ``RTERunPhase.train`` selects training; any other
            value selects testing.
        config: Optional configuration; only used when it is a ``str``, in
            which case it is handed to ``Config.load_config``.
        estimator: Optional estimator to reuse.  When ``None``, one is created
            with ``get_estimator`` (training) or loaded with ``load_model``,
            falling back to ``get_estimator`` (testing).

    Returns:
        The estimator that was fitted or used for prediction.
    """
    LogHelper.setup()
    script_name = os.path.splitext(os.path.basename(__file__))[0]
    logger = LogHelper.get_logger(script_name + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    is_snopes = getattr(Config, 'is_snopes', False)
    logger.debug("is_snopes: " + str(is_snopes))

    def load_split(set_file):
        """Embed one split and insert axis 1 into its claim arrays."""
        split, _, _ = embed_data_set_for_elmo(
            set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes)
        data = split['data']
        sent_sizes = data['h_sent_sizes']
        claim_counts = np.ones(len(sent_sizes), np.int32)
        data['h_sent_sizes'] = np.expand_dims(sent_sizes, 1)
        data['h_sizes'] = claim_counts
        data['h_tokens'] = np.expand_dims(data['h_tokens'], 1)
        return split

    def select_gpu_if_unset():
        """Pin the first available GPU unless CUDA_VISIBLE_DEVICES is set."""
        if not str(os.environ.get('CUDA_VISIBLE_DEVICES', '')).strip():
            first_free = GPUtil.getFirstAvailable(
                maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0]
            os.environ['CUDA_VISIBLE_DEVICES'] = str(first_free)

    if mode != RTERunPhase.train:
        # testing mode
        # Parameters only need restoring when no estimator was handed in.
        restore_param_required = estimator is None
        if restore_param_required:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        test_set = load_split(Config.test_set_file)
        select_gpu_if_unset()
        logger.debug("CUDA_VISIBLE_DEVICES: " +
                     os.environ['CUDA_VISIBLE_DEVICES'])
        predictions = estimator.predict(
            test_set['data'], restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
        return estimator

    # training mode
    dump_configured = hasattr(Config, 'training_dump')
    if dump_configured and os.path.exists(Config.training_dump):
        # reuse the previously processed data
        with open(Config.training_dump, 'rb') as dump_file:
            dataset_list = pickle.load(dump_file)
    else:
        # process the training and dev JSONL files
        training_set = load_split(Config.training_set_file)
        valid_set = load_split(Config.dev_set_file)
        dataset_list = [training_set, valid_set]
        if dump_configured:
            # save processed training data
            with open(Config.training_dump, 'wb') as dump_file:
                pickle.dump(dataset_list,
                            dump_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
    if estimator is None:
        estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
    select_gpu_if_unset()
    train_part = dataset_list[0]
    valid_part = dataset_list[1]
    estimator.fit(train_part['data'], train_part['label'],
                  valid_part['data'], valid_part['label'])
    save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    """Run the ESIM credibility multi-task model in training or testing mode.

    The script logs that it is only meant for the Snopes dataset and requires
    ``Config.page_source_file_path`` to be present.

    Training (``mode == RTERunPhase.train``) embeds the claim and stance
    training/dev sets, fits the estimator, stores it with ``save_model`` and
    dumps the source-feature vocabularies and embeddings.  Every other mode
    reloads those dumped features, embeds the claim test set, predicts and
    writes a submission file (plus metrics when the test set carries labels).

    Args:
        mode: Run phase selecting the training or the testing path.
        config: Optional configuration; handed to ``Config.load_config`` only
            when it is a string.
        estimator: Optional estimator to reuse.  If ``None``, training creates
            one with ``get_estimator``; testing first tries ``load_model`` and
            falls back to ``get_estimator``.

    Returns:
        The estimator that was fitted or used for prediction.

    Raises:
        AssertionError: If ``Config`` has no ``page_source_file_path``.
    """

    def add_hypothesis_axis(data):
        # Mutates ``data`` in place: 'h_sizes' becomes a vector of ones (one
        # entry per sample) and 'h_sent_sizes' / 'h_np' gain a new axis 1.
        sentence_lengths = data['h_sent_sizes']
        per_sample_ones = np.ones(len(sentence_lengths), np.int32)
        data['h_sent_sizes'] = np.expand_dims(sentence_lengths, 1)
        data['h_sizes'] = per_sample_ones
        data['h_np'] = np.expand_dims(data['h_np'], 1)

    def ensure_visible_gpu():
        # A non-blank CUDA_VISIBLE_DEVICES is left untouched; otherwise it is
        # set to the first device returned by GPUtil.getFirstAvailable.
        if str(os.environ.get('CUDA_VISIBLE_DEVICES', '')).strip():
            return
        available = GPUtil.getFirstAvailable(
            maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(available[0])

    def feature_sizes():
        # Keyword arguments carrying the four source-feature embedding sizes;
        # the hyper-parameter keys double as the keyword names.
        hyper_param = Config.esim_credibility_mtl_hyper_param
        return {
            size_name: hyper_param[size_name]
            for size_name in ('domain_embedding_size',
                              'suffix_embedding_size',
                              'protocol_embedding_size',
                              'stance_embedding_size')
        }

    def claim_limits():
        # Claim sets use max_sentence_size for the hypothesis limit as well.
        return {
            'threshold_b_sent_num': Config.max_sentences,
            'threshold_b_sent_size': Config.max_sentence_size,
            'threshold_h_sent_size': Config.max_sentence_size,
        }

    def stance_limits():
        # Stance sets limit the hypothesis with max_claim_size instead.
        return {
            'threshold_b_sent_num': Config.max_sentences,
            'threshold_b_sent_size': Config.max_sentence_size,
            'threshold_h_sent_size': Config.max_claim_size,
        }

    def embed_stance_set(set_key, vocab, embeddings):
        # Embeds the stance set named by ``set_key`` with the given word
        # vocabulary/embeddings and reshapes its hypothesis entries.
        stance_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.esim_credibility_mtl_hyper_param[set_key],
            Config.db_path,
            Config.glove_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            **stance_limits(),
            is_snopes=True)
        add_hypothesis_axis(stance_set['data'])
        return stance_set

    def embedding_inputs(word, domain, suffix, protocol, stance):
        # Embedding tables handed to the estimator next to the data sets.
        return {
            'word_embedding': word,
            'domain_embedding': domain,
            'suffix_embedding': suffix,
            'protocol_embedding': protocol,
            'stance_embedding': stance,
        }

    LogHelper.setup()
    script_name = os.path.splitext(os.path.basename(__file__))[0]
    logger = LogHelper.get_logger(script_name + "_" + str(mode))
    if isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    assert hasattr(Config, 'page_source_file_path'), (
        "'page_source_file_path' field is needed in config file for this script")
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    mtl_args = Config.esim_credibility_mtl_hyper_param
    logger.info("ESIM credibility MTL arguments: " + str(mtl_args))
    logger.info("this script is only for Snopes dataset")

    if mode == RTERunPhase.train:
        # The claim training set is embedded first: this call also returns
        # every vocabulary and embedding table reused by the calls below.
        (claim_train, word_vocab, word_embeddings,
         domain_vocab, domain_embeddings,
         suffix_vocab, suffix_embeddings,
         protocol_vocab, protocol_embeddings,
         claim_stance_vocab, claim_stance_embeddings) = \
            embed_data_set_with_glove_with_credibility(
                mtl_args['claim_training_set'],
                Config.db_path,
                Config.page_source_file_path,
                glove_path=Config.glove_path,
                **feature_sizes(),
                **claim_limits())
        add_hypothesis_axis(claim_train['data'])
        logger.info("size of training set: " +
                    str(claim_train['data']['h_np'].shape[0]))
        stance_train = embed_stance_set('stance_training_set', word_vocab,
                                        word_embeddings)

        # Validation sets reuse the tables produced by the training call.
        claim_dev, _, _ = embed_data_set_with_glove_with_credibility(
            mtl_args['claim_dev_set'],
            Config.db_path,
            Config.page_source_file_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            domain_vocab=domain_vocab,
            domain_embeddings=domain_embeddings,
            suffix_vocab=suffix_vocab,
            suffix_embeddings=suffix_embeddings,
            protocol_vocab=protocol_vocab,
            protocol_embeddings=protocol_embeddings,
            stance_vocab=claim_stance_vocab,
            stance_embeddings=claim_stance_embeddings,
            **feature_sizes(),
            **claim_limits())
        add_hypothesis_axis(claim_dev['data'])
        logger.info("size of dev set: " +
                    str(claim_dev['data']['h_np'].shape[0]))
        stance_dev = embed_stance_set('stance_dev_set', word_vocab,
                                      word_embeddings)

        # One 'train'/'valid' pair per task, for inputs and for labels.
        X_dict, y_dict = {}, {}
        for task, train_set, dev_set in (('claim', claim_train, claim_dev),
                                         ('stance', stance_train, stance_dev)):
            X_dict[task] = {
                'train': train_set['data'],
                'valid': dev_set['data'],
            }
            y_dict[task] = {
                'train': train_set['label'],
                'valid': dev_set['label'],
            }
        X_dict.update(embedding_inputs(word_embeddings, domain_embeddings,
                                       suffix_embeddings, protocol_embeddings,
                                       claim_stance_embeddings))

        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        ensure_visible_gpu()
        estimator.fit(X_dict, y_dict)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
        # Persist the source-feature tables so the testing path can load them.
        dump_source_features_embeddings(
            mtl_args['features_embeddings_path'],
            domain_vocab, domain_embeddings,
            suffix_vocab, suffix_embeddings,
            protocol_vocab, protocol_embeddings,
            claim_stance_vocab, claim_stance_embeddings)
        return estimator

    # Testing mode.  The flag is forwarded to estimator.predict and is True
    # only when no estimator was passed in by the caller.
    restore_param_required = estimator is None
    if restore_param_required:
        estimator = load_model(Config.model_folder, Config.pickle_name)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
    glove_vocab, word_embeddings = load_whole_glove(Config.glove_path)
    word_vocab = vocab_map(glove_vocab)
    (domain_vocab, domain_embeddings,
     suffix_vocab, suffix_embeddings,
     protocol_vocab, protocol_embeddings,
     claim_stance_vocab, claim_stance_embeddings) = \
        load_source_features_embeddings(mtl_args['features_embeddings_path'])
    test_set, _, _ = embed_data_set_with_glove_with_credibility(
        mtl_args['claim_test_set'],
        Config.db_path,
        Config.page_source_file_path,
        vocab_dict=word_vocab,
        glove_embeddings=word_embeddings,
        domain_vocab=domain_vocab,
        domain_embeddings=domain_embeddings,
        suffix_vocab=suffix_vocab,
        suffix_embeddings=suffix_embeddings,
        protocol_vocab=protocol_vocab,
        protocol_embeddings=protocol_embeddings,
        stance_vocab=claim_stance_vocab,
        stance_embeddings=claim_stance_embeddings,
        **claim_limits())
    add_hypothesis_axis(test_set['data'])
    logger.info("size of test set: " +
                str(test_set['data']['h_np'].shape[0]))
    x_dict = {'X_test': test_set['data']}
    x_dict.update(embedding_inputs(word_embeddings, domain_embeddings,
                                   suffix_embeddings, protocol_embeddings,
                                   claim_stance_embeddings))
    ensure_visible_gpu()
    predictions = estimator.predict(x_dict, restore_param_required)
    generate_submission(predictions, test_set['id'], Config.test_set_file,
                        Config.submission_file)
    if 'label' in test_set:
        print_metrics(test_set['label'], predictions, logger)
    return estimator
Esempio n. 6
0
"""
Created on Fri May 17 10:56:28 2019

@author: lukasmalik
"""

# ===========================================================================
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer

from scripts.load_data import * 
from scripts.load_model import * 
from tqdm import tqdm

# Module-level setup: both loaders run as soon as this module is executed.
# NOTE(review): load_data / load_model presumably come from the scripts.*
# star-imports; what they return is not visible here -- confirm.
df = load_data() 
model = load_model() # loads pretrained word2vec model with 400 dimensions
# ===========================================================================
# calculate the document vector as average of all the words 
def avg_feature_vector(tweet, model, num_features,index2word_set,tokenizer):
    '''
    calculates the average vector 
    '''
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    words = tokenizer.tokenize(tweet)
    for word in words:
        #print(word) # sanity check
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate the ESIM estimator on sets from ``embed_data_set_with_bert``.

    Training (``mode == RTERunPhase.train``) embeds the training and dev
    sets, fits the estimator and stores it with ``save_model``.  Every other
    mode embeds the test set, predicts, writes the submission file and prints
    metrics when the test set carries labels.

    Args:
        mode: Run phase selecting the training or the testing path.
        config: Optional configuration; handed to ``Config.load_config`` only
            when it is a string.
        estimator: Optional estimator to reuse.  If ``None``, training creates
            one with ``get_estimator``; testing first tries ``load_model`` and
            falls back to ``get_estimator``.

    Returns:
        The estimator that was fitted or used for prediction.
    """
    LogHelper.setup()
    script_name = os.path.splitext(os.path.basename(__file__))[0]
    logger = LogHelper.get_logger(script_name + "_" + str(mode))
    if isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    # Configurations without an 'is_snopes' attribute count as non-Snopes.
    is_snopes = getattr(Config, 'is_snopes', False)
    logger.debug("is_snopes: " + str(is_snopes))

    def load_bert_set(set_file):
        # Embeds one data set file and reshapes its hypothesis entries:
        # 'h_sizes' becomes a vector of ones (one entry per sample) and
        # 'h_sent_sizes' / 'h_bert_np' gain a new axis 1.
        data_set = embed_data_set_with_bert(
            set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            is_snopes=is_snopes,
            port=Config.bert_port,
            port_out=Config.bert_port_out)
        features = data_set['data']
        sentence_lengths = features['h_sent_sizes']
        per_sample_ones = np.ones(len(sentence_lengths), np.int32)
        features['h_sent_sizes'] = np.expand_dims(sentence_lengths, 1)
        features['h_sizes'] = per_sample_ones
        features['h_bert_np'] = np.expand_dims(features['h_bert_np'], 1)
        return data_set

    def ensure_visible_gpu():
        # A non-blank CUDA_VISIBLE_DEVICES is left untouched; otherwise it is
        # set to the first device returned by GPUtil.getFirstAvailable.
        if str(os.environ.get('CUDA_VISIBLE_DEVICES', '')).strip():
            return
        available = GPUtil.getFirstAvailable(
            maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(available[0])

    if mode == RTERunPhase.train:
        training_set = load_bert_set(Config.training_set_file)
        valid_set = load_bert_set(Config.dev_set_file)
        # Training labels are passed to fit() separately; the validation
        # labels travel inside the input dictionary.
        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label'],
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        ensure_visible_gpu()
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
        return estimator

    # Testing mode.  The flag is forwarded to estimator.predict and is True
    # only when no estimator was passed in by the caller.
    restore_param_required = estimator is None
    if restore_param_required:
        estimator = load_model(Config.model_folder, Config.pickle_name)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
    test_set = load_bert_set(Config.test_set_file)
    x_dict = {'X_test': test_set['data']}
    ensure_visible_gpu()
    predictions = estimator.predict(x_dict, restore_param_required)
    generate_submission(predictions, test_set['id'], Config.test_set_file,
                        Config.submission_file)
    if 'label' in test_set:
        print_metrics(test_set['label'], predictions, logger)
    return estimator
Esempio n. 8
0
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an estimator on BERT-encoded bodies and claims.

    Training (``mode == RTERunPhase.train``) loads the pickled training dump
    when ``Config.training_dump`` exists on disk; otherwise it reads the
    training and dev JSONL files, encodes them through the BERT helper
    functions and, if ``Config.training_dump`` is configured, pickles the
    result.  The estimator is then fitted and stored with ``save_model``.
    Every other mode reads and encodes the test set, predicts, writes the
    submission file and prints metrics when labels are available.

    Args:
        mode: Run phase selecting the training or the testing path.
        config: Optional configuration; handed to ``Config.load_config`` only
            when it is a string.
        estimator: Optional estimator to reuse.  If ``None``, training creates
            one with ``get_estimator``; testing first tries ``load_model`` and
            falls back to ``get_estimator``.

    Returns:
        The estimator that was fitted or used for prediction.
    """
    LogHelper.setup()
    script_name = os.path.splitext(os.path.basename(__file__))[0]
    logger = LogHelper.get_logger(script_name + "_" + str(mode))
    if isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    # Configurations without an 'is_snopes' attribute count as non-Snopes.
    is_snopes = getattr(Config, 'is_snopes', False)
    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("BERT sentence embedding arguments: " +
                str(Config.bert_sent_hyper_parameter))

    def read_split(set_file):
        # Returns the (features, labels) pair read from one JSONL file.
        return read_data_set_from_jsonl(set_file,
                                        Config.db_path,
                                        num_sentences=Config.max_sentences,
                                        is_snopes=is_snopes)

    def encode_with_bert(features, body_label, claim_label):
        # Replaces features['b'] and features['h'] in place by their BERT
        # encodings and logs the shape of each result under the given label.
        encoded_bodies = encode_multi_sentence_set_with_bert(
            features['b'],
            Config.max_sentences,
            port=Config.bert_port,
            port_out=Config.bert_port_out)
        features['b'] = encoded_bodies
        logger.debug(body_label + str(encoded_bodies.shape))
        encoded_claims = encode_single_sentence_set_with_bert(
            features['h'],
            port=Config.bert_port,
            port_out=Config.bert_port_out)
        features['h'] = encoded_claims
        logger.debug(claim_label + str(encoded_claims.shape))

    def ensure_visible_gpu():
        # A non-blank CUDA_VISIBLE_DEVICES is left untouched; otherwise it is
        # set to the first device returned by GPUtil.getFirstAvailable.
        if str(os.environ.get('CUDA_VISIBLE_DEVICES', '')).strip():
            return
        available = GPUtil.getFirstAvailable(
            maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(available[0])

    if mode == RTERunPhase.train:
        dump_configured = hasattr(Config, 'training_dump')
        if dump_configured and os.path.exists(Config.training_dump):
            # NOTE: pickle.load can execute arbitrary code from the file, so
            # the dump must be a trusted, locally produced cache.
            with open(Config.training_dump, 'rb') as dump_file:
                X_train, Y_labels_train, X_valid, Y_labels_valid = \
                    pickle.load(dump_file)
        else:
            # No usable dump: build the encoded sets from the JSONL files.
            X_train, Y_labels_train = read_split(Config.training_set_file)
            X_valid, Y_labels_valid = read_split(Config.dev_set_file)
            # Sentence counts are taken before 'b' is replaced by encodings.
            X_train['b_sizes'] = get_num_sents_of_bodies(X_train['b'])
            X_valid['b_sizes'] = get_num_sents_of_bodies(X_valid['b'])
            encode_with_bert(X_train,
                             "b_encoded_train.shape: ",
                             "h_encoded_train.shape: ")
            encode_with_bert(X_valid,
                             "b_encoded_valid.shape: ",
                             "h_encoded_valid.shape: ")
            if dump_configured:
                with open(Config.training_dump, 'wb') as dump_file:
                    pickle.dump(
                        (X_train, Y_labels_train, X_valid, Y_labels_valid),
                        dump_file,
                        protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        ensure_visible_gpu()
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
        return estimator

    # Testing mode.  The flag is forwarded to estimator.predict and is True
    # only when no estimator was passed in by the caller.
    restore_param_required = estimator is None
    if restore_param_required:
        estimator = load_model(Config.model_folder, Config.pickle_name)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
    X_test, Y_labels_test = read_split(Config.test_set_file)
    X_test['b_sizes'] = get_num_sents_of_bodies(X_test['b'])
    encode_with_bert(X_test,
                     "b_encoded_test.shape: ",
                     "h_encoded_test.shape: ")
    ensure_visible_gpu()
    predictions = estimator.predict(X_test, restore_param_required)
    generate_submission(predictions, X_test['id'], Config.test_set_file,
                        Config.submission_file)
    if Y_labels_test is not None:
        print_metrics(Y_labels_test, predictions, logger)
    return estimator