    def __init__(self, genre):
        self.genre = genre
        self.train_data = pickle_load(
            format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, genre))
        self.dev_data = pickle_load(
            format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, genre))
        self.test_data = pickle_load(
            format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, genre))

        if not os.path.exists(FEATURE_DIR):
            os.makedirs(FEATURE_DIR)
Example #2
def load_single_ngram_data(variation, vectorizer_type, level, ngram_range,
                           data_type):
    if data_type == 'train':
        filename = format_filename(PROCESSED_DATA_DIR,
                                   TRAIN_NGRAM_DATA_TEMPLATE,
                                   variation=variation,
                                   type=vectorizer_type,
                                   level=level,
                                   ngram_range=ngram_range)
    elif data_type == 'valid' or data_type == 'dev':
        filename = format_filename(PROCESSED_DATA_DIR,
                                   DEV_NGRAM_DATA_TEMPLATE,
                                   variation=variation,
                                   type=vectorizer_type,
                                   level=level,
                                   ngram_range=ngram_range)
    elif data_type == 'test':
        filename = format_filename(PROCESSED_DATA_DIR,
                                   TEST_NGRAM_DATA_TEMPLATE,
                                   variation=variation,
                                   type=vectorizer_type,
                                   level=level,
                                   ngram_range=ngram_range)
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))
    if os.path.exists(filename):
        return pickle_load(filename)
    else:
        return None
Example #3
def load_data(data_type):
    if data_type == 'train':
        data = pickle_load(
            format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_FILENAME))
    elif data_type == 'dev':
        data = pickle_load(
            format_filename(PROCESSED_DATA_DIR, DEV_DATA_FILENAME))
    elif data_type == 'test':
        data = pickle_load(
            format_filename(PROCESSED_DATA_DIR, TEST_DATA_FILENAME))
    elif data_type == 'test_final':
        data = pickle_load(
            format_filename(PROCESSED_DATA_DIR, TEST_FINAL_DATA_FILENAME))
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))
    return data
    def add_sent_freq_feature(self, data_type):
        """
        Idea from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain (Quora Question Pairs Competition).
        Magic features based on sentence frequency: the idea is that a question
        asked often has a higher chance of being duplicated.
        """

        feat_file = self.format_feature_file(data_type, 'sent_freq')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            sents_dict, p_vc, h_vc = self.get_sent_freq()
            data = pd.DataFrame(self.get_data(data_type))
            data['p_hash'] = data['premise'].map(sents_dict)
            data['h_hash'] = data['hypothesis'].map(sents_dict)
            data['p_freq'] = data['p_hash'].map(
                lambda x: p_vc.get(x, 0) + h_vc.get(x, 0))
            data['h_freq'] = data['h_hash'].map(
                lambda x: p_vc.get(x, 0) + h_vc.get(x, 0))
            data['freq_mean'] = (data['p_freq'] + data['h_freq']) / 2
            data['freq_cross'] = data['p_freq'] * data['h_freq']
            data['p_freq_sq'] = data['p_freq'] * data['p_freq']
            data['h_freq_sq'] = data['h_freq'] * data['h_freq']

            features = data[[
                'p_freq', 'h_freq', 'freq_mean', 'freq_cross', 'p_freq_sq',
                'h_freq_sq'
            ]].values
            pickle_dump(feat_file, features)
        return features
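    # A minimal sketch of the frequency feature above on toy data (the sentences
    # and counts here are illustrative assumptions, not drawn from the corpus):
    #
    #   import pandas as pd
    #   data = pd.DataFrame({
    #       'premise':    ['a dog runs', 'a dog runs', 'cats sleep'],
    #       'hypothesis': ['an animal moves', 'a dog runs', 'a cat rests']})
    #   sents = pd.concat([data['premise'], data['hypothesis']]).drop_duplicates()
    #   sents_dict = {s: i for i, s in enumerate(sents)}
    #   p_vc = data['premise'].map(sents_dict).value_counts().to_dict()
    #   h_vc = data['hypothesis'].map(sents_dict).value_counts().to_dict()
    #   # 'a dog runs' occurs three times in total, so the rows containing it
    #   # receive larger p_freq/h_freq values than sentences occurring once.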
    def tfidf_model(self):
        print('Logging Info - Get Tf-idf model...')
        tfidf_model_path = os.path.join(FEATURE_DIR,
                                        '{}_tfidf.model'.format(self.genre))
        dict_path = os.path.join(FEATURE_DIR,
                                 '{}_tfidf.dict'.format(self.genre))
        if os.path.exists(tfidf_model_path):
            dictionary = pickle_load(dict_path)
            tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            corpus = [
                text.split() for text in self.train_data['premise'] +
                self.train_data['hypothesis'] + self.dev_data['premise'] +
                self.dev_data['hypothesis'] + self.test_data['premise'] +
                self.test_data['hypothesis']
            ]
            dictionary = corpora.Dictionary(corpus)
            corpus = [dictionary.doc2bow(text) for text in corpus]
            tfidf_model = TfidfModel(corpus)

            del corpus
            tfidf_model.save(tfidf_model_path)
            pickle_dump(dict_path, dictionary)

        return dictionary, tfidf_model
    def add_tfidf_feature(self, data_type):
        feat_file = self.format_feature_file(data_type, 'tfidf')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            dictionary, tfidf_model = self.tfidf_model()
            features = list()
            for premise, hypothesis in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis']):
                premise = premise.split()
                hypothesis = hypothesis.split()
                p_tfidf = dict(tfidf_model[dictionary.doc2bow(premise)])
                h_tfidf = dict(tfidf_model[dictionary.doc2bow(hypothesis)])
                features.append([
                    np.sum(list(p_tfidf.values())),
                    np.sum(list(h_tfidf.values())),
                    np.mean(list(p_tfidf.values())),
                    np.mean(list(h_tfidf.values()))
                ])
            features = np.array(features)
            pickle_dump(feat_file, features)
        print('Logging Info - {} : tfidf feature shape : {}'.format(
            data_type, features.shape))
        return features
Example #7
def load_processed_data(genre, level, data_type):
    if data_type == 'train':
        filename = format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, level)
    elif data_type == 'valid' or data_type == 'dev':
        filename = format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, level)
    elif data_type == 'test':
        filename = format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, level)
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))
    return pickle_load(filename)
Example #8
def load_features(genre, data_type, scale_features):
    feat_type = 'all_scaled' if scale_features else 'all'
    if data_type == 'train':
        filename = format_filename(FEATURE_DIR, TRAIN_FEATURES_TEMPLATE, genre, feat_type)
    elif data_type == 'valid' or data_type == 'dev':
        filename = format_filename(FEATURE_DIR, DEV_FEATURES_TEMPLATE, genre, feat_type)
    elif data_type == 'test':
        filename = format_filename(FEATURE_DIR, TEST_FEATURES_TEMPLATE, genre, feat_type)
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))
    return pickle_load(filename)
    def get_sent_freq(self):
        print('Logging Info - Get sentence frequency...')
        sents_dict_path = os.path.join(FEATURE_DIR,
                                       '{}_sent_dict.pkl'.format(self.genre))
        p_vc_path = os.path.join(FEATURE_DIR,
                                 '{}_premise_vc.pkl'.format(self.genre))
        h_vc_path = os.path.join(FEATURE_DIR,
                                 '{}_hypothesis_vc.pkl'.format(self.genre))
        if os.path.exists(p_vc_path):
            sents_dict = pickle_load(sents_dict_path)
            p_vc = pickle_load(p_vc_path)
            h_vc = pickle_load(h_vc_path)
        else:
            train_data = pd.DataFrame(self.train_data)
            dev_data = pd.DataFrame(self.dev_data)
            test_data = pd.DataFrame(self.test_data)
            all_data = pd.concat([train_data, dev_data, test_data])

            df1 = all_data[['premise']]
            df2 = all_data[['hypothesis']].rename(
                columns={'hypothesis': 'premise'})

            train_sents = pd.concat([df1, df2])
            train_sents.drop_duplicates(subset=['premise'], inplace=True)
            train_sents.reset_index(inplace=True, drop=True)

            sents_dict = pd.Series(train_sents.index.values,
                                   index=train_sents.premise.values).to_dict()
            all_data['p_hash'] = all_data['premise'].map(sents_dict)
            all_data['h_hash'] = all_data['hypothesis'].map(sents_dict)

            p_vc = all_data.p_hash.value_counts().to_dict()
            h_vc = all_data.h_hash.value_counts().to_dict()

            pickle_dump(sents_dict_path, sents_dict)
            pickle_dump(p_vc_path, p_vc)
            pickle_dump(h_vc_path, h_vc)
            del train_data, dev_data, test_data, all_data
        return sents_dict, p_vc, h_vc
    def generate_graph(self):
        print('Logging Info - Get graph...')
        sent2id_path = os.path.join(FEATURE_DIR,
                                    '{}_graph_sent2id.pkl'.format(self.genre))
        graph_path = os.path.join(FEATURE_DIR,
                                  '{}_graph.pkl'.format(self.genre))
        if os.path.exists(graph_path):
            sent2id = pickle_load(sent2id_path)
            graph = pickle_load(graph_path)
        else:
            sent2id = {}  # sentence to id
            graph = nx.Graph()
            for data_type in ['train', 'dev', 'test']:
                for premise, hypothesis in zip(
                        self.get_data(data_type)['premise'],
                        self.get_data(data_type)['hypothesis']):
                    if premise not in sent2id:
                        sent2id[premise] = len(sent2id)
                    if hypothesis not in sent2id:
                        sent2id[hypothesis] = len(sent2id)
                    p_id = sent2id[premise]
                    h_id = sent2id[hypothesis]

                    match = 0.0
                    premise = premise.split()
                    hypothesis = hypothesis.split()
                    for w1 in premise:
                        if w1 in hypothesis:
                            match += 1

                    if len(premise) + len(hypothesis) == 0:
                        weight = 0.0
                    else:
                        weight = 2.0 * (match /
                                        (len(premise) + len(hypothesis)))
                    graph.add_edge(p_id, h_id, weight=weight)
            pickle_dump(sent2id_path, sent2id)
            pickle_dump(graph_path, graph)
        return sent2id, graph
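    # Note: the edge weight above is a Dice-style word overlap,
    # weight = 2 * match / (len(premise) + len(hypothesis)).
    # A quick check on toy sentences (assumed purely for illustration):
    #
    #   p = 'the cat sat'.split()           # 3 tokens
    #   h = 'the cat slept well'.split()    # 4 tokens
    #   match = sum(1.0 for w in p if w in h)      # 'the' and 'cat' -> 2.0
    #   weight = 2.0 * match / (len(p) + len(h))   # 4.0 / 7 ~= 0.571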
Example #11
def plot_data(dir_data='/tmp', fn_data='data.pkl'):
    data = pickle_load(os.path.join(dir_data, fn_data))
    # print(data[0].shape)
    shape = data[0].shape[0:2]
    print(shape)
    data = np.array([255 - to_gray(A).flatten() for A in data])
    plot_images(data, 
        10, 10, shape,
        border=2,
        reshape=True, figsize=None, colorbar=False,
        idx_highlight=None,
        vmin=0, vmax=255)
    
    plt.show()
    def add_similarity_feature(self, data_type, feat_type, sim_func):
        feat_file = self.format_feature_file(data_type, feat_type)
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            len_dist_feat = np.array([
                sim_func(p, h) for p, h in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis'])
            ])
            features = self.check_and_expand_shape(len_dist_feat)
            pickle_dump(feat_file, features)
        print('Logging Info - {} : {} feature shape : {}'.format(
            data_type, feat_type, features.shape))
        return features
    def gen_all_features(self, data_type, scaled=False):
        if scaled:
            feat_file = self.format_feature_file(data_type, 'all_scaled')
        else:
            feat_file = self.format_feature_file(data_type, 'all')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            features = list()
            feat_types = [('len_dis', length_distance),
                          ('lcs_seq', lcs_seq_norm),
                          ('lcs_str', lcs_str_1_norm),
                          ('edit_dist', edit_distance),
                          ('jaro', jaro_distance),
                          ('jaro_winkler', jaro_winkler_dist), ('fuzz', fuzzy),
                          ('simhash', simhash), ('w_share', word_share),
                          ('w_ngram_dist', word_ngram_distance),
                          ('c_ngram_ol', char_ngram_overlap),
                          ('w_ngram_ol', word_ngram_overlap)]
            for feat_type, sim_func in feat_types:
                features.append(
                    self.add_similarity_feature(data_type, feat_type,
                                                sim_func))

            features.append(
                self.add_weighted_word_ngram_overlap_feature(data_type))
            features.append(self.add_tfidf_feature(data_type))
            features.append(self.add_word_power_feature(data_type))
            features.append(self.add_graph_feature(data_type))
            features = np.concatenate(features, axis=-1)

            if scaled:
                scaler = StandardScaler()
                features = scaler.fit_transform(features)
                joblib.dump(
                    scaler,
                    os.path.join(FEATURE_DIR,
                                 '{}_scaler.model'.format(self.genre)))

            pickle_dump(feat_file, features)

        print('Logging Info - {} : all feature shape : {}'.format(
            data_type, features.shape))
        return features
    def add_word_power_feature(self, data_type):
        feat_file = self.format_feature_file(data_type, 'word_power')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            power_word = self.get_power_word()
            num_least = 100
            features = list()
            for premise, hypothesis in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis']):
                premise = premise.split()
                hypothesis = hypothesis.split()

                rate = [1.0, 1.0]
                share_words = list(set(premise).intersection(set(hypothesis)))
                for word in share_words:
                    if word not in power_word:
                        continue
                    # the shared word must appear on both sides of at least
                    # num_least sentence pairs
                    if power_word[word][0] * power_word[word][5] < num_least:
                        continue
                    # word is shared, yet the pair is not a match (label != 2)
                    rate[0] *= (1.0 - power_word[word][6])
                p_diff = list(set(premise).difference(set(hypothesis)))
                h_diff = list(set(hypothesis).difference(set(premise)))
                all_diff = set(p_diff + h_diff)
                for word in all_diff:
                    if word not in power_word:
                        continue
                    # the word must appear on only one side of at least
                    # num_least sentence pairs
                    if power_word[word][0] * power_word[word][3] < num_least:
                        continue
                    # word is not shared, yet the pair is a match (label == 2)
                    rate[1] *= (1.0 - power_word[word][4])
                rate = [1 - num for num in rate]
                features.append(rate)
            features = np.array(features)
            pickle_dump(feat_file, features)
        print('Logging Info - {} : word_power feature shape : {}'.format(
            data_type, features.shape))
        return features
Example #16
def load_processed_text_data(variation, data_type):
    if data_type == 'train':
        filename = format_filename(PROCESSED_DATA_DIR,
                                   TRAIN_DATA_TEMPLATE,
                                   variation=variation)
    elif data_type == 'valid' or data_type == 'dev':
        filename = format_filename(PROCESSED_DATA_DIR,
                                   DEV_DATA_TEMPLATE,
                                   variation=variation)
    elif data_type == 'test':
        filename = format_filename(PROCESSED_DATA_DIR,
                                   TEST_DATA_TEMPLATE,
                                   variation=variation)
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))

    if os.path.exists(filename):
        return pickle_load(filename)
    else:
        return None
    def add_weighted_word_ngram_overlap_feature(self, data_type):
        feat_file = self.format_feature_file(data_type, 'w_ngram_ol_tfidf')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            dictionary, tfidf_model = self.tfidf_model()
            idf_model = tfidf_model.idfs
            features = list()
            for premise, hypothesis in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis']):
                premise = premise.split()
                p_tfidf = dict(tfidf_model[dictionary.doc2bow(premise)])
                input_premise = [
                    (word, idf_model.get(dictionary.token2id.get(word, 0),
                                         0.0),
                     p_tfidf.get(dictionary.token2id.get(word, 0), 0.0))
                    for word in premise
                ]

                hypothesis = hypothesis.split()
                h_tfidf = dict(tfidf_model[dictionary.doc2bow(hypothesis)])
                input_hypothesis = [
                    (word, idf_model.get(dictionary.token2id.get(word, 0),
                                         0.0),
                     h_tfidf.get(dictionary.token2id.get(word, 0), 0.0))
                    for word in hypothesis
                ]
                features.append(
                    weighted_word_ngram_overlap(input_premise,
                                                input_hypothesis))
            features = np.array(features)
            pickle_dump(feat_file, features)
        print('Logging Info - {} : w_ngram_ol_tfidf feature shape : {}'.format(
            data_type, features.shape))
        return features
Example #18
def recognition(model_name,
                predict_log,
                label_schema='BIOES',
                batch_size=32,
                n_epoch=50,
                learning_rate=0.001,
                optimizer_type='adam',
                use_char_input=True,
                embed_type=None,
                embed_trainable=True,
                use_bert_input=False,
                bert_type='bert',
                bert_trainable=True,
                bert_layer_num=1,
                use_bichar_input=False,
                bichar_embed_type=None,
                bichar_embed_trainable=True,
                use_word_input=False,
                word_embed_type=None,
                word_embed_trainable=True,
                use_charpos_input=False,
                charpos_embed_type=None,
                charpos_embed_trainable=True,
                use_softword_input=False,
                use_dictfeat_input=False,
                use_maxmatch_input=False,
                callbacks_to_add=None,
                swa_type=None,
                predict_on_dev=True,
                predict_on_final_test=True,
                **kwargs):
    config = ModelConfig()
    config.model_name = model_name
    config.label_schema = label_schema
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    config.use_char_input = use_char_input
    if embed_type:
        config.embeddings = np.load(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            type=embed_type))
        config.embed_trainable = embed_trainable
        config.embed_dim = config.embeddings.shape[1]
    else:
        config.embeddings = None
        config.embed_trainable = True
    config.callbacks_to_add = callbacks_to_add or [
        'modelcheckpoint', 'earlystopping'
    ]

    config.vocab = pickle_load(
        format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(
        format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))

    if config.use_char_input:
        config.exp_name = '{}_{}_{}_{}_{}_{}_{}'.format(
            model_name, config.embed_type if config.embed_type else 'random',
            'tune' if config.embed_trainable else 'fix', batch_size,
            optimizer_type, learning_rate, label_schema)
    else:
        config.exp_name = '{}_{}_{}_{}_{}'.format(model_name, batch_size,
                                                  optimizer_type,
                                                  learning_rate, label_schema)
    if kwargs:
        config.exp_name += '_' + '_'.join(
            [str(k) + '_' + str(v) for k, v in kwargs.items()])
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint',
                                        '').replace('_earlystopping', '')
    config.exp_name += callback_str

    config.use_bert_input = use_bert_input
    config.bert_type = bert_type
    config.bert_trainable = bert_trainable
    config.bert_layer_num = bert_layer_num
    assert config.use_char_input or config.use_bert_input
    if config.use_bert_input:
        config.exp_name += '_{}_layer_{}_{}'.format(
            bert_type, bert_layer_num,
            'tune' if config.bert_trainable else 'fix')
    config.use_bichar_input = use_bichar_input
    if config.use_bichar_input:
        config.bichar_vocab = pickle_load(
            format_filename(PROCESSED_DATA_DIR,
                            VOCABULARY_TEMPLATE,
                            level='bichar'))
        config.bichar_vocab_size = len(config.bichar_vocab) + 2
        if bichar_embed_type:
            config.bichar_embeddings = np.load(
                format_filename(PROCESSED_DATA_DIR,
                                EMBEDDING_MATRIX_TEMPLATE,
                                type=bichar_embed_type))
            config.bichar_embed_trainable = bichar_embed_trainable
            config.bichar_embed_dim = config.bichar_embeddings.shape[1]
        else:
            config.bichar_embeddings = None
            config.bichar_embed_trainable = True
        config.exp_name += '_bichar_{}_{}'.format(
            bichar_embed_type if bichar_embed_type else 'random',
            'tune' if config.bichar_embed_trainable else 'fix')
    config.use_word_input = use_word_input
    if config.use_word_input:
        config.word_vocab = pickle_load(
            format_filename(PROCESSED_DATA_DIR,
                            VOCABULARY_TEMPLATE,
                            level='word'))
        config.word_vocab_size = len(config.word_vocab) + 2
        if word_embed_type:
            config.word_embeddings = np.load(
                format_filename(PROCESSED_DATA_DIR,
                                EMBEDDING_MATRIX_TEMPLATE,
                                type=word_embed_type))
            config.word_embed_trainable = word_embed_trainable
            config.word_embed_dim = config.word_embeddings.shape[1]
        else:
            config.word_embeddings = None
            config.word_embed_trainable = True
        config.exp_name += '_word_{}_{}'.format(
            word_embed_type if word_embed_type else 'random',
            'tune' if config.word_embed_trainable else 'fix')
    config.use_charpos_input = use_charpos_input
    if config.use_charpos_input:
        config.charpos_vocab = pickle_load(
            format_filename(PROCESSED_DATA_DIR,
                            VOCABULARY_TEMPLATE,
                            level='charpos'))
        config.charpos_vocab_size = len(config.charpos_vocab) + 2
        if charpos_embed_type:
            config.charpos_embeddings = np.load(
                format_filename(PROCESSED_DATA_DIR,
                                EMBEDDING_MATRIX_TEMPLATE,
                                type=charpos_embed_type))
            config.charpos_embed_trainable = charpos_embed_trainable
            config.charpos_embed_dim = config.charpos_embeddings.shape[1]
        else:
            config.charpos_embeddings = None
            config.charpos_embed_trainable = True
        config.exp_name += '_charpos_{}_{}'.format(
            charpos_embed_type if charpos_embed_type else 'random',
            'tune' if config.charpos_embed_trainable else 'fix')
    config.use_softword_input = use_softword_input
    if config.use_softword_input:
        config.exp_name += '_softword'
    config.use_dictfeat_input = use_dictfeat_input
    if config.use_dictfeat_input:
        config.exp_name += '_dictfeat'
    config.use_maxmatch_input = use_maxmatch_input
    if config.use_maxmatch_input:
        config.exp_name += '_maxmatch'

    # logger to log output of training process
    predict_log.update({
        'er_exp_name': config.exp_name,
        'er_batch_size': batch_size,
        'er_optimizer': optimizer_type,
        'er_epoch': n_epoch,
        'er_learning_rate': learning_rate,
        'er_other_params': kwargs
    })

    print('Logging Info - Experiment: %s' % config.exp_name)
    model = RecognitionModel(config, **kwargs)

    dev_data_type = 'dev'
    if predict_on_final_test:
        test_data_type = 'test_final'
    else:
        test_data_type = 'test'
    valid_generator = RecognitionDataGenerator(
        dev_data_type, config.batch_size, config.label_schema,
        config.label_to_one_hot[config.label_schema],
        config.vocab if config.use_char_input else None,
        config.bert_vocab_file(config.bert_type) if config.use_bert_input else
        None, config.bert_seq_len, config.bichar_vocab, config.word_vocab,
        config.use_word_input, config.charpos_vocab, config.use_softword_input,
        config.use_dictfeat_input, config.use_maxmatch_input)
    test_generator = RecognitionDataGenerator(
        test_data_type, config.batch_size, config.label_schema,
        config.label_to_one_hot[config.label_schema],
        config.vocab if config.use_char_input else None,
        config.bert_vocab_file(config.bert_type) if config.use_bert_input else
        None, config.bert_seq_len, config.bichar_vocab, config.word_vocab,
        config.use_word_input, config.charpos_vocab, config.use_softword_input,
        config.use_dictfeat_input, config.use_maxmatch_input)

    model_save_path = os.path.join(config.checkpoint_dir,
                                   '{}.hdf5'.format(config.exp_name))
    if not os.path.exists(model_save_path):
        raise FileNotFoundError(
            'Recognition model does not exist: {}'.format(model_save_path))

    if swa_type is None:
        model.load_best_model()
    elif 'swa' in callbacks_to_add:
        model.load_swa_model(swa_type)
        predict_log['er_exp_name'] += '_{}'.format(swa_type)

    if predict_on_dev:
        print('Logging Info - Generate submission for valid data:')
        dev_pred_mentions = model.predict(valid_generator)
    else:
        dev_pred_mentions = None
    print('Logging Info - Generate submission for test data:')
    test_pred_mentions = model.predict(test_generator)

    return dev_pred_mentions, test_pred_mentions
    def get_power_word(self):
        """
        计算数据中词语的影响力,格式如下:
        词语 --> [0. 出现语句对数量,1. 出现语句对比例,2. 正确语句对比例,3. 单侧语句对比例,4. 单侧语句对正确比例,
                 5. 双侧语句对比例,6. 双侧语句对正确比例]
        """
        print('Logging Info - Get power word...')
        words_power_path = os.path.join(FEATURE_DIR,
                                        '{}_power_word.pkl'.format(self.genre))
        if os.path.exists(words_power_path):
            words_power = pickle_load(words_power_path)
        else:
            words_power = {}
            x_a = [
                text.split() for text in self.train_data['premise'] +
                self.dev_data['premise'] + self.test_data['premise']
            ]
            x_b = [
                text.split() for text in self.train_data['hypothesis'] +
                self.dev_data['hypothesis'] + self.test_data['hypothesis']
            ]
            y = self.train_data['label'] + self.dev_data[
                'label'] + self.test_data['label']
            for i in range(len(x_a)):
                label = y[i]
                q1_words = x_a[i]
                q2_words = x_b[i]
                all_words = set(q1_words + q2_words)
                q1_words = set(q1_words)
                q2_words = set(q2_words)
                for word in all_words:
                    if word not in words_power:
                        words_power[word] = [0. for _ in range(7)]
                    words_power[word][0] += 1.  # count of pairs containing the word
                    words_power[word][1] += 1.  # pair count (normalized to a rate below)

                    if ((word in q1_words) and
                        (word not in q2_words)) or ((word not in q1_words) and
                                                    (word in q2_words)):
                        words_power[word][3] += 1.  # single-side count
                        if 0 == label:
                            words_power[word][2] += 1.  # correct-pair count
                            words_power[word][4] += 1.  # correct single-side count
                    if (word in q1_words) and (word in q2_words):
                        words_power[word][5] += 1.  # double-side count
                        if 2 == label:
                            words_power[word][2] += 1.  # correct-pair count
                            words_power[word][6] += 1.  # correct double-side count

            for word in words_power:
                # pair rate = pairs containing the word / total number of pairs
                words_power[word][1] /= len(x_a)
                # correct-pair rate = correct pairs / pairs containing the word
                words_power[word][2] /= words_power[word][0]
                if words_power[word][3] > 1e-6:
                    # correct single-side rate = correct single-side count / single-side count
                    words_power[word][4] /= words_power[word][3]
                # single-side rate = single-side count / pairs containing the word
                words_power[word][3] /= words_power[word][0]
                if words_power[word][5] > 1e-6:
                    # correct double-side rate = correct double-side count / double-side count
                    words_power[word][6] /= words_power[word][5]
                # double-side rate = double-side count / pairs containing the word
                words_power[word][5] /= words_power[word][0]
            del x_a, x_b, y
            pickle_dump(words_power_path, words_power)

        return words_power
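    # A small worked example of the statistics above (toy numbers, assumed for
    # illustration): suppose the word 'bank' occurs in 4 sentence pairs, on both
    # sides in 3 of them, and 2 of those 3 pairs have label == 2. Then:
    #
    #   words_power['bank'][0] == 4.0        # pairs containing the word
    #   words_power['bank'][5] == 3.0 / 4.0  # double-side rate
    #   words_power['bank'][6] == 2.0 / 3.0  # correct rate among double-side pairs
    #
    # In add_word_power_feature(), each shared word multiplies rate[0] by
    # (1 - [6]), so words that reliably mark matching pairs shrink the product
    # and push the final feature 1 - rate[0] toward 1.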
Example #20
def train_link(model_name,
               batch_size=32,
               n_epoch=50,
               learning_rate=0.001,
               optimizer_type='adam',
               embed_type=None,
               embed_trainable=True,
               callbacks_to_add=None,
               use_relative_pos=False,
               n_neg=1,
               omit_one_cand=True,
               overwrite=False,
               swa_start=5,
               early_stopping_patience=3,
               **kwargs):
    config = ModelConfig()
    config.model_name = model_name
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    if embed_type:
        config.embeddings = np.load(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            type=embed_type))
        config.embed_trainable = embed_trainable
    else:
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or [
        'modelcheckpoint', 'earlystopping'
    ]
    if 'swa' in config.callbacks_to_add:
        config.swa_start = swa_start
        config.early_stopping_patience = early_stopping_patience

    config.vocab = pickle_load(
        format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(
        format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))
    config.entity_desc = pickle_load(
        format_filename(PROCESSED_DATA_DIR, ENTITY_DESC_FILENAME))

    config.exp_name = '{}_{}_{}_{}_{}_{}'.format(
        model_name, embed_type if embed_type else 'random',
        'tune' if config.embed_trainable else 'fix', batch_size,
        optimizer_type, learning_rate)
    config.use_relative_pos = use_relative_pos
    if config.use_relative_pos:
        config.exp_name += '_rel'
    config.n_neg = n_neg
    if config.n_neg > 1:
        config.exp_name += '_neg_{}'.format(config.n_neg)
    config.omit_one_cand = omit_one_cand
    if not config.omit_one_cand:
        config.exp_name += '_not_omit'
    if kwargs:
        config.exp_name += '_' + '_'.join(
            [str(k) + '_' + str(v) for k, v in kwargs.items()])
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint',
                                        '').replace('_earlystopping', '')
    config.exp_name += callback_str

    # logger to log output of training process
    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'epoch': n_epoch,
        'learning_rate': learning_rate,
        'other_params': kwargs
    }

    print('Logging Info - Experiment: %s' % config.exp_name)
    model_save_path = os.path.join(config.checkpoint_dir,
                                   '{}.hdf5'.format(config.exp_name))
    model = LinkModel(config, **kwargs)

    train_data_type, dev_data_type = 'train', 'dev'
    train_generator = LinkDataGenerator(
        train_data_type, config.vocab, config.mention_to_entity,
        config.entity_desc, config.batch_size, config.max_desc_len,
        config.max_erl_len, config.use_relative_pos, config.n_neg,
        config.omit_one_cand)
    dev_data = load_data(dev_data_type)

    if not os.path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_generator, dev_data)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' %
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S",
                                                time.gmtime(elapsed_time))

    model.load_best_model()
    dev_text_data, dev_pred_mentions, dev_gold_mention_entities = [], [], []
    for data in dev_data:
        dev_text_data.append(data['text'])
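        # gold mentions are used in place of predicted mentions here, so the
        # linking model is evaluated in isolation from the recognition step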
        dev_pred_mentions.append(data['mention_data'])
        dev_gold_mention_entities.append(data['mention_data'])
    print('Logging Info - Evaluate over valid data:')
    r, p, f1 = model.evaluate(dev_text_data, dev_pred_mentions,
                              dev_gold_mention_entities)
    train_log['dev_performance'] = (r, p, f1)

    swa_type = None
    if 'swa' in config.callbacks_to_add:
        swa_type = 'swa'
    elif 'swa_clr' in config.callbacks_to_add:
        swa_type = 'swa_clr'
    if swa_type:
        model.load_swa_model(swa_type)
        print('Logging Info - Evaluate over valid data based on swa model:')
        r, p, f1 = model.evaluate(dev_text_data, dev_pred_mentions,
                                  dev_gold_mention_entities)
        train_log['swa_dev_performance'] = (r, p, f1)

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step_el'),
              log=train_log,
              mode='a')
    del model
    gc.collect()
    K.clear_session()
Example #21
    def __init__(self,
                 data_type,
                 batch_size,
                 label_schema,
                 label_to_onehot,
                 char_vocab=None,
                 bert_vocab=None,
                 bert_seq_len=None,
                 bichar_vocab=None,
                 word_vocab=None,
                 use_word_input=False,
                 charpos_vocab=None,
                 use_softword_input=False,
                 use_dictfeat_input=False,
                 use_maxmatch_input=False,
                 shuffle=True):
        self.data_type = data_type
        self.data = load_data(data_type)
        self.data_size = len(self.data)
        self.batch_size = batch_size
        self.indices = np.arange(self.data_size)
        self.steps = int(np.ceil(self.data_size / self.batch_size))

        assert label_schema in ['BIO', 'BIOES']
        self.label_schema = label_schema
        self.label_to_onehot = label_to_onehot

        # main input
        self.char_vocab = char_vocab
        self.use_char_input = self.char_vocab is not None

        # additional feature input
        self.bert_vocab = bert_vocab
        self.use_bert_input = self.bert_vocab is not None
        self.bert_seq_len = bert_seq_len if self.use_bert_input else None
        assert self.use_char_input or self.use_bert_input
        if self.use_bert_input:
            self.token_dict = {}
            with codecs.open(self.bert_vocab, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    self.token_dict[token] = len(self.token_dict)
            self.bert_tokenizer = Tokenizer(self.token_dict)

        self.bichar_vocab = bichar_vocab
        self.use_bichar_input = self.bichar_vocab is not None

        self.word_vocab = word_vocab
        self.use_word_input = use_word_input
        assert not (self.use_word_input and self.word_vocab is None)

        self.charpos_vocab = charpos_vocab
        self.use_charpos_input = self.charpos_vocab is not None

        self.use_softword_input = use_softword_input
        self.use_dictfeat_input = use_dictfeat_input
        self.use_maxmatch_input = use_maxmatch_input

        self.mention_to_entity = None
        if self.use_word_input or self.use_charpos_input or self.use_softword_input:
            self.mention_to_entity = pickle_load(
                format_filename(PROCESSED_DATA_DIR,
                                MENTION_TO_ENTITY_FILENAME))
            for mention in self.mention_to_entity.keys():
                jieba.add_word(mention, freq=1000000)
        if (self.use_dictfeat_input
                or self.use_maxmatch_input) and self.mention_to_entity is None:
            self.mention_to_entity = pickle_load(
                format_filename(PROCESSED_DATA_DIR,
                                MENTION_TO_ENTITY_FILENAME))

        self.shuffle = shuffle
    def add_graph_feature(self, data_type):
        feat_file = self.format_feature_file(data_type, 'graph')
        if os.path.exists(feat_file):
            graph_features = pickle_load(feat_file)
        else:
            sent2id, graph = self.generate_graph()

            n2clique = {}
            cliques = []
            for clique in nx.find_cliques(graph):
                for n in clique:
                    if n not in n2clique:
                        n2clique[n] = []
                    n2clique[n].append(len(cliques))
                cliques.append(clique)

            n2cc = {}
            ccs = []
            for cc in nx.connected_components(graph):
                for n in cc:
                    n2cc[n] = len(ccs)
                ccs.append(cc)

            pagerank = nx.pagerank(graph, alpha=0.9, max_iter=100)

            hits_h, hits_a = nx.hits(graph, max_iter=100)

            indegree_features = list()
            clique_features = list()
            cc_features = list()
            pagerank_features = list()
            hits_features = list()
            shortestpath_features = list()
            # neighbor_features = list()
            for premise, hypothesis in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis']):
                p_id = sent2id[premise]
                h_id = sent2id[hypothesis]

                # graph in-degree features
                indegree_features.append(
                    [graph.degree[p_id], graph.degree[h_id]])

                # clique features
                edge_max_clique_size = 0
                num_clique = 0
                for clique_id in n2clique[p_id]:
                    if h_id in cliques[clique_id]:
                        edge_max_clique_size = max(edge_max_clique_size,
                                                   len(cliques[clique_id]))
                        num_clique += 1
                clique_features.append([edge_max_clique_size, num_clique])

                lnode_max_clique_size = 0
                rnode_max_clique_size = 0
                for clique_id in n2clique[p_id]:
                    lnode_max_clique_size = max(lnode_max_clique_size,
                                                len(cliques[clique_id]))

                for clique_id in n2clique[h_id]:
                    rnode_max_clique_size = max(rnode_max_clique_size,
                                                len(cliques[clique_id]))

                clique_features[-1] += [
                    lnode_max_clique_size, rnode_max_clique_size,
                    max(lnode_max_clique_size, rnode_max_clique_size),
                    min(lnode_max_clique_size, rnode_max_clique_size)
                ]

                # connected components features
                cc_features.append([len(ccs[n2cc[p_id]])])

                # page rank features
                pr1 = pagerank[p_id] * 1e6
                pr2 = pagerank[h_id] * 1e6
                pagerank_features.append(
                    [pr1, pr2,
                     max(pr1, pr2),
                     min(pr1, pr2), (pr1 + pr2) / 2.])

                # graph hits features
                h1 = hits_h[p_id] * 1e6
                h2 = hits_h[h_id] * 1e6
                a1 = hits_a[p_id] * 1e6
                a2 = hits_a[h_id] * 1e6
                hits_features.append([
                    h1, h2, a1, a2,
                    max(h1, h2),
                    max(a1, a2),
                    min(h1, h2),
                    min(a1, a2), (h1 + h2) / 2., (a1 + a2) / 2.
                ])

                # graph shortest path features
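                # (the direct edge is removed so the shortest path must pass
                # through other sentence pairs, then restored afterwards)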
                shortest_path = -1
                weight = graph[p_id][h_id]['weight']
                graph.remove_edge(p_id, h_id)
                if nx.has_path(graph, p_id, h_id):
                    shortest_path = nx.dijkstra_path_length(graph, p_id, h_id)
                graph.add_edge(p_id, h_id, weight=weight)
                shortestpath_features.append([shortest_path])

                # graph neighbour features
                # l = []
                # r = []
                # l_nb = graph.neighbors(p_id)
                # r_nb = graph.neighbors(h_id)
                # for n in l_nb:
                #     if (n != h_id) and (n != p_id):
                #         l.append(graph[p_id][n]['weight'])
                # for n in r_nb:
                #     if (n != h_id) and (n != p_id):
                #         r.append(graph[h_id][n]['weight'])
                # if len(l) == 0 or len(r) == 0:
                #     neighbor_features.append([0.0] * 11)
                # else:
                #     neighbor_features.append(l + r +
                #                              [len(list((set(l_nb).union(set(r_nb))) ^ (set(l_nb) ^ set(r_nb))))])

            graph_features = np.concatenate(
                (np.array(indegree_features), np.array(clique_features),
                 np.array(cc_features), np.array(pagerank_features),
                 np.array(hits_features), np.array(shortestpath_features)),
                axis=-1)
            pickle_dump(feat_file, graph_features)
        print('Logging Info - {} : graph feature shape : {}'.format(
            data_type, graph_features.shape))
        return graph_features
Example #23
def link(model_name,
         dev_pred_mentions,
         test_pred_mentions,
         predict_log,
         batch_size=32,
         n_epoch=50,
         learning_rate=0.001,
         optimizer_type='adam',
         embed_type=None,
         embed_trainable=True,
         use_relative_pos=False,
         n_neg=1,
         omit_one_cand=True,
         callbacks_to_add=None,
         swa_type=None,
         predict_on_final_test=True,
         **kwargs):
    config = ModelConfig()
    config.model_name = model_name
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    if embed_type:
        config.embeddings = np.load(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            type=embed_type))
        config.embed_trainable = embed_trainable
    else:
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or [
        'modelcheckpoint', 'earlystopping'
    ]

    config.vocab = pickle_load(
        format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(
        format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))
    config.entity_desc = pickle_load(
        format_filename(PROCESSED_DATA_DIR, ENTITY_DESC_FILENAME))

    config.exp_name = '{}_{}_{}_{}_{}_{}'.format(
        model_name, embed_type if embed_type else 'random',
        'tune' if embed_trainable else 'fix', batch_size, optimizer_type,
        learning_rate)
    config.use_relative_pos = use_relative_pos
    if config.use_relative_pos:
        config.exp_name += '_rel'
    config.n_neg = n_neg
    if config.n_neg > 1:
        config.exp_name += '_neg_{}'.format(config.n_neg)
    config.omit_one_cand = omit_one_cand
    if not config.omit_one_cand:
        config.exp_name += '_not_omit'
    if kwargs:
        config.exp_name += '_' + '_'.join(
            [str(k) + '_' + str(v) for k, v in kwargs.items()])
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint',
                                        '').replace('_earlystopping', '')
    config.exp_name += callback_str

    # logger to log output of training process
    predict_log.update({
        'el_exp_name': config.exp_name,
        'el_batch_size': batch_size,
        'el_optimizer': optimizer_type,
        'el_epoch': n_epoch,
        'el_learning_rate': learning_rate,
        'el_other_params': kwargs
    })

    print('Logging Info - Experiment: %s' % config.exp_name)
    model = LinkModel(config, **kwargs)

    model_save_path = os.path.join(config.checkpoint_dir,
                                   '{}.hdf5'.format(config.exp_name))
    if not os.path.exists(model_save_path):
        raise FileNotFoundError(
            'Link model does not exist: {}'.format(model_save_path))
    if swa_type is None:
        model.load_best_model()
    elif 'swa' in callbacks_to_add:
        model.load_swa_model(swa_type)
        predict_log['el_exp_name'] += '_{}'.format(swa_type)

    dev_data_type = 'dev'
    dev_data = load_data(dev_data_type)
    dev_text_data, dev_gold_mention_entities = [], []
    for data in dev_data:
        dev_text_data.append(data['text'])
        dev_gold_mention_entities.append(data['mention_data'])

    if predict_on_final_test:
        test_data_type = 'test_final'
    else:
        test_data_type = 'test'
    test_data = load_data(test_data_type)
    test_text_data = [data['text'] for data in test_data]

    if dev_pred_mentions is not None:
        print(
            'Logging Info - Evaluate over valid data based on predicted mention:'
        )
        r, p, f1 = model.evaluate(dev_text_data, dev_pred_mentions,
                                  dev_gold_mention_entities)
        dev_performance = 'dev_performance' if swa_type is None else '%s_dev_performance' % swa_type
        predict_log[dev_performance] = (r, p, f1)
    print('Logging Info - Generate submission for test data:')
    test_pred_mention_entities = model.predict(test_text_data,
                                               test_pred_mentions)
    test_submit_file = predict_log[
        'er_exp_name'] + '_' + config.exp_name + '_%s%ssubmit.json' % (
            swa_type + '_' if swa_type else '',
            'final_' if predict_on_final_test else '')
    submit_result(test_submit_file, test_data, test_pred_mention_entities)

    predict_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step'),
              log=predict_log,
              mode='a')
    return predict_log
Example #24
def train_recognition(model_name, label_schema='BIOES', batch_size=32, n_epoch=50, learning_rate=0.001,
                      optimizer_type='adam', use_char_input=True, embed_type=None, embed_trainable=True,
                      use_bert_input=False, bert_type='bert', bert_trainable=True, bert_layer_num=1,
                      use_bichar_input=False, bichar_embed_type=None, bichar_embed_trainable=True,
                      use_word_input=False, word_embed_type=None, word_embed_trainable=True,
                      use_charpos_input=False, charpos_embed_type=None, charpos_embed_trainable=True,
                      use_softword_input=False, use_dictfeat_input=False, use_maxmatch_input=False,
                      callbacks_to_add=None, overwrite=False, swa_start=3, early_stopping_patience=3, **kwargs):
    config = ModelConfig()
    config.model_name = model_name
    config.label_schema = label_schema
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    config.use_char_input = use_char_input
    if embed_type:
        config.embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, type=embed_type))
        config.embed_trainable = embed_trainable
        config.embed_dim = config.embeddings.shape[1]
    else:
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or ['modelcheckpoint', 'earlystopping']
    if 'swa' in config.callbacks_to_add:
        config.swa_start = swa_start
        config.early_stopping_patience = early_stopping_patience

    config.vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))

    if config.use_char_input:
        config.exp_name = '{}_{}_{}_{}_{}_{}_{}'.format(model_name, config.embed_type if config.embed_type else 'random',
                                                        'tune' if config.embed_trainable else 'fix', batch_size,
                                                        optimizer_type, learning_rate, label_schema)
    else:
        config.exp_name = '{}_{}_{}_{}_{}'.format(model_name, batch_size, optimizer_type, learning_rate, label_schema)
    if config.n_epoch != 50:
        config.exp_name += '_{}'.format(config.n_epoch)
    if kwargs:
        config.exp_name += '_' + '_'.join([str(k) + '_' + str(v) for k, v in kwargs.items()])
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint', '').replace('_earlystopping', '')
    config.exp_name += callback_str

    config.use_bert_input = use_bert_input
    config.bert_type = bert_type
    config.bert_trainable = bert_trainable
    config.bert_layer_num = bert_layer_num
    assert config.use_char_input or config.use_bert_input
    if config.use_bert_input:
        config.exp_name += '_{}_layer_{}_{}'.format(bert_type, bert_layer_num, 'tune' if config.bert_trainable else 'fix')
    config.use_bichar_input = use_bichar_input
    if config.use_bichar_input:
        config.bichar_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='bichar'))
        config.bichar_vocab_size = len(config.bichar_vocab) + 2
        if bichar_embed_type:
            config.bichar_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                               type=bichar_embed_type))
            config.bichar_embed_trainable = bichar_embed_trainable
            config.bichar_embed_dim = config.bichar_embeddings.shape[1]
        else:
            config.bichar_embeddings = None
            config.bichar_embed_trainable = True
        config.exp_name += '_bichar_{}_{}'.format(bichar_embed_type if bichar_embed_type else 'random',
                                                  'tune' if config.bichar_embed_trainable else 'fix')
    config.use_word_input = use_word_input
    if config.use_word_input:
        config.word_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='word'))
        config.word_vocab_size = len(config.word_vocab) + 2
        if word_embed_type:
            config.word_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                             type=word_embed_type))
            config.word_embed_trainable = word_embed_trainable
            config.word_embed_dim = config.word_embeddings.shape[1]
        else:
            config.word_embeddings = None
            config.word_embed_trainable = True
        config.exp_name += '_word_{}_{}'.format(word_embed_type if word_embed_type else 'random',
                                                'tune' if config.word_embed_trainable else 'fix')
    config.use_charpos_input = use_charpos_input
    if config.use_charpos_input:
        config.charpos_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='charpos'))
        config.charpos_vocab_size = len(config.charpos_vocab) + 2
        if charpos_embed_type:
            config.charpos_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                                type=charpos_embed_type))
            config.charpos_embed_trainable = charpos_embed_trainable
            config.charpos_embed_dim = config.charpos_embeddings.shape[1]
        else:
            config.charpos_embeddings = None
            config.charpos_embed_trainable = True
        config.exp_name += '_charpos_{}_{}'.format(charpos_embed_type if charpos_embed_type else 'random',
                                                   'tune' if config.charpos_embed_trainable else 'fix')
    config.use_softword_input = use_softword_input
    if config.use_softword_input:
        config.exp_name += '_softword'
    config.use_dictfeat_input = use_dictfeat_input
    if config.use_dictfeat_input:
        config.exp_name += '_dictfeat'
    config.use_maxmatch_input = use_maxmatch_input
    if config.use_maxmatch_input:
        config.exp_name += '_maxmatch'

    # logger to log output of training process
    train_log = {'exp_name': config.exp_name, 'batch_size': batch_size, 'optimizer': optimizer_type, 'epoch': n_epoch,
                 'learning_rate': learning_rate, 'other_params': kwargs}

    print('Logging Info - Experiment: %s' % config.exp_name)
    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    model = RecognitionModel(config, **kwargs)

    def make_generator(data_type):
        # both generators share the same configuration; only the data split differs
        return RecognitionDataGenerator(data_type, config.batch_size, config.label_schema,
                                        config.label_to_one_hot[config.label_schema],
                                        config.vocab if config.use_char_input else None,
                                        config.bert_vocab_file(config.bert_type) if config.use_bert_input else None,
                                        config.bert_seq_len, config.bichar_vocab, config.word_vocab,
                                        config.use_word_input, config.charpos_vocab, config.use_softword_input,
                                        config.use_dictfeat_input, config.use_maxmatch_input)

    train_generator = make_generator('train')
    valid_generator = make_generator('dev')

    if not os.path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_generator, valid_generator)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    r, p, f1 = model.evaluate(valid_generator)
    train_log['dev_performance'] = (r, p, f1)

    swa_type = None
    if 'swa' in config.callbacks_to_add:
        swa_type = 'swa'
    elif 'swa_clr' in config.callbacks_to_add:
        swa_type = 'swa_clr'
    if swa_type:
        model.load_swa_model(swa_type)
        print('Logging Info - Evaluate over valid data based on swa model:')
        r, p, f1 = model.evaluate(valid_generator)
        train_log['swa_dev_performance'] = (r, p, f1)

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step_er'), log=train_log, mode='a')

    del model
    gc.collect()
    K.clear_session()
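A minimal usage sketch for the trainer above (hedged: the enclosing function's def falls outside this excerpt, so the name train_er_model and the exact keyword arguments are assumptions for illustration only):

# hypothetical invocation of the 2-step entity-recognition trainer above
train_er_model(batch_size=32, n_epoch=50, learning_rate=1e-3, optimizer_type='adam',
               use_char_input=True, use_bert_input=False, use_word_input=True,
               word_embed_type='w2v', word_embed_trainable=True,
               use_softword_input=True, use_dictfeat_input=False,
               use_maxmatch_input=False, overwrite=False)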
Example #25
0
def train_model(genre, input_level, word_embed_type, word_embed_trainable, batch_size, learning_rate,
                optimizer_type, model_name, n_epoch=50, add_features=False, scale_features=False, overwrite=False,
                lr_range_test=False, callbacks_to_add=None, eval_on_train=False, **kwargs):
    config = ModelConfig()
    config.genre = genre
    config.input_level = input_level
    config.max_len = config.word_max_len[genre] if input_level == 'word' else config.char_max_len[genre]
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.callbacks_to_add = callbacks_to_add or []
    config.add_features = add_features
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.n_epoch = n_epoch
    config.word_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre,
                                                     word_embed_type))
    vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, input_level))
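    # reverse mapping (index -> token); consumed later by the ELMo cache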
    config.idx2token = {idx: token for token, idx in vocab.items()}

    # experiment name configuration
    config.exp_name = '{}_{}_{}_{}_{}_{}_{}_{}'.format(genre, model_name, input_level, word_embed_type,
                                                       'tune' if word_embed_trainable else 'fix', batch_size,
                                                       '_'.join([str(k) + '_' + str(v) for k, v in kwargs.items()]),
                                                       optimizer_type)
    if config.add_features:
        config.exp_name += '_feature_scaled' if scale_features else '_featured'
    if len(config.callbacks_to_add) > 0:
        callback_str = '_' + '_'.join(config.callbacks_to_add)
        callback_str = callback_str.replace('_modelcheckpoint', '').replace('_earlystopping', '')
        config.exp_name += callback_str

    input_config = kwargs.get('input_config', 'token')  # default: plain token (word embedding) input
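    # recognized values: 'token' (default), 'cache_elmo', 'token_combine_cache_elmo',
    # 'elmo_id', 'elmo_s', 'token_combine_elmo_id', 'token_combine_elmo_s'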
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        # get ELMo embeddings from a cache: first build an ELMoCache instance
        elmo_model_type = kwargs.pop('elmo_model_type', 'allennlp')  # no longer needed in kwargs
        elmo_output_mode = kwargs.pop('elmo_output_mode', 'elmo')    # no longer needed in kwargs
        elmo_cache = ELMoCache(options_file=config.elmo_options_file, weight_file=config.elmo_weight_file,
                               cache_dir=config.cache_dir, idx2token=config.idx2token,
                               max_sentence_length=config.max_len, elmo_model_type=elmo_model_type,
                               elmo_output_mode=elmo_output_mode)
    elif input_config in ['elmo_id', 'elmo_s', 'token_combine_elmo_id', 'token_combine_elmo_s']:
        # get ELMo embeddings via tensorflow_hub: the hub model URL must be provided
        kwargs['elmo_model_url'] = config.elmo_model_url

    # logger to log output of training process
    train_log = {'exp_name': config.exp_name, 'batch_size': batch_size, 'optimizer': optimizer_type, 'epoch': n_epoch,
                 'learning_rate': learning_rate, 'other_params': kwargs}

    print('Logging Info - Experiment: %s' % config.exp_name)
    model_classes = {
        'KerasInfersent': KerasInfersentModel,
        'KerasEsim': KerasEsimModel,
        'KerasDecomposable': KerasDecomposableAttentionModel,
        'KerasSiameseBiLSTM': KerasSimaeseBiLSTMModel,
        'KerasSiameseCNN': KerasSiameseCNNModel,
        'KerasIACNN': KerasIACNNModel,
        'KerasSiameseLSTMCNNModel': KerasSiameseLSTMCNNModel,
        'KerasRefinedSSAModel': KerasRefinedSSAModel,
    }
    if model_name not in model_classes:
        raise ValueError('Model Name Not Understood: {}'.format(model_name))
    model = model_classes[model_name](config, **kwargs)
    # model.summary()

    train_input, dev_input, test_input = None, None, None
    if lr_range_test:   # conduct lr range test to find optimal learning rate (not train model)
        train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features, scale_features)
        dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features, scale_features)
        model.lr_range_test(x_train=train_input['x'], y_train=train_input['y'], x_valid=dev_input['x'],
                            y_valid=dev_input['y'])
        return

    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not os.path.exists(model_save_path) or overwrite:
        start_time = time.time()

        if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
            train_input = ELMoGenerator(genre, input_level, 'train', config.batch_size, elmo_cache,
                                        return_data=(input_config == 'token_combine_cache_elmo'),
                                        return_features=config.add_features)
            dev_input = ELMoGenerator(genre, input_level, 'dev', config.batch_size, elmo_cache,
                                      return_data=(input_config == 'token_combine_cache_elmo'),
                                      return_features=config.add_features)
            model.train_with_generator(train_input, dev_input)
        else:
            train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features, scale_features)
            dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features, scale_features)
            model.train(x_train=train_input['x'], y_train=train_input['y'], x_valid=dev_input['x'],
                        y_valid=dev_input['y'])
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

    def eval_on_data(eval_with_generator, input_data, data_type):
        model.load_best_model()
        if eval_with_generator:
            acc = model.evaluate_with_generator(generator=input_data, y=input_data.input_label)
        else:
            acc = model.evaluate(x=input_data['x'], y=input_data['y'])
        train_log['%s_acc' % data_type] = acc

        swa_type = None
        if 'swa' in config.callbacks_to_add:
            swa_type = 'swa'
        elif 'swa_clr' in config.callbacks_to_add:
            swa_type = 'swa_clr'
        if swa_type:
            print('Logging Info - %s Model' % swa_type)
            model.load_swa_model(swa_type=swa_type)
            if eval_with_generator:
                swa_acc = model.evaluate_with_generator(generator=input_data, y=input_data.input_label)
            else:
                swa_acc = model.evaluate(x=input_data['x'], y=input_data['y'])
            train_log['%s_%s_acc' % (swa_type, data_type)] = swa_acc

        ensemble_type = None
        if 'sse' in config.callbacks_to_add:
            ensemble_type = 'sse'
        elif 'fge' in config.callbacks_to_add:
            ensemble_type = 'fge'
        if ensemble_type:
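            # note: the ensemble evaluation below indexes input_data['x'], so it
            # assumes array inputs rather than a generator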
            print('Logging Info - %s Ensemble Model' % ensemble_type)
            ensemble_predict = {}
            for model_file in os.listdir(config.checkpoint_dir):
                if model_file.startswith(config.exp_name+'_%s' % ensemble_type):
                    # extract the numeric checkpoint id from '<exp_name>_<ensemble_type>_<id>.hdf5'
                    match = re.match(r'(%s_%s_)(\d+)(\.hdf5)' % (config.exp_name, ensemble_type), model_file)
                    model_id = int(match.group(2))
                    model_path = os.path.join(config.checkpoint_dir, model_file)
                    print('Logging Info: Loading {} ensemble model checkpoint: {}'.format(ensemble_type, model_file))
                    model.load_model(model_path)
                    ensemble_predict[model_id] = model.predict(x=input_data['x'])
            '''
            Models saved towards the end of a run are expected to perform better than
            those saved earlier, so sort by model id in descending order: the running
            ensemble average then starts from the most recent checkpoints.
            '''
            sorted_ensemble_predict = sorted(ensemble_predict.items(), key=lambda x: x[0], reverse=True)
            model_predicts = []
            for model_id, model_predict in sorted_ensemble_predict:
                single_acc = eval_acc(model_predict, input_data['y'])
                print('Logging Info - %s_single_%d_%s Acc : %f' % (ensemble_type, model_id, data_type, single_acc))
                train_log['%s_single_%d_%s_acc' % (ensemble_type, model_id, data_type)] = single_acc

                model_predicts.append(model_predict)
                ensemble_acc = eval_acc(np.mean(np.array(model_predicts), axis=0), input_data['y'])
                print('Logging Info - %s_ensemble_%d_%s Acc : %f' % (ensemble_type, model_id, data_type, ensemble_acc))
                train_log['%s_ensemble_%d_%s_acc' % (ensemble_type, model_id, data_type)] = ensemble_acc

    if eval_on_train:
        # might take a long time
        print('Logging Info - Evaluate over train data:')
        if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
            train_input = ELMoGenerator(genre, input_level, 'train', config.batch_size, elmo_cache,
                                        return_data=(input_config == 'token_combine_cache_elmo'),
                                        return_features=config.add_features, return_label=False)
            eval_on_data(eval_with_generator=True, input_data=train_input, data_type='train')
        else:
            train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features, scale_features)
            eval_on_data(eval_with_generator=False, input_data=train_input, data_type='train')

    print('Logging Info - Evaluate over valid data:')
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        dev_input = ELMoGenerator(genre, input_level, 'dev', config.batch_size, elmo_cache,
                                  return_data=(input_config == 'token_combine_cache_elmo'),
                                  return_features=config.add_features, return_label=False)
        eval_on_data(eval_with_generator=True, input_data=dev_input, data_type='dev')
    else:
        if dev_input is None:
            dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features, scale_features)
        eval_on_data(eval_with_generator=False, input_data=dev_input, data_type='dev')

    print('Logging Info - Evaluate over test data:')
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        test_input = ELMoGenerator(genre, input_level, 'test', config.batch_size, elmo_cache,
                                   return_data=(input_config == 'token_combine_cache_elmo'),
                                   return_features=config.add_features, return_label=False)
        eval_on_data(eval_with_generator=True, input_data=test_input, data_type='test')
    else:
        if test_input is None:
            test_input = load_input_data(genre, input_level, 'test', input_config, config.add_features, scale_features)
        eval_on_data(eval_with_generator=False, input_data=test_input, data_type='test')

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, genre), log=train_log, mode='a')
    return train_log
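A minimal call sketch for train_model above; the concrete genre, embedding and model names here are assumptions for illustration and must match whatever artifacts the preprocessing step actually produced:

# hypothetical invocation: 'snli' and 'glove' are placeholders, not guaranteed
# to exist in this project's processed-data directory
log = train_model(genre='snli', input_level='word', word_embed_type='glove',
                  word_embed_trainable=False, batch_size=128, learning_rate=1e-3,
                  optimizer_type='adam', model_name='KerasEsim', n_epoch=50,
                  callbacks_to_add=['modelcheckpoint', 'earlystopping'])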
Example #26
0
if __name__ == '__main__':
    if not os.path.exists(PREDICT_DIR):
        os.makedirs(PREDICT_DIR)
    config = ModelConfig()

    raw_data = dict()
    raw_data['simplified'] = read_raw_test_data(SIMP_TEST_FILENAME)
    raw_data['traditional'] = read_raw_test_data(TRAD_TEST_FILENAME)

    for variation, test_data in raw_data.items():
        # prepare word embedding input
        word_tokenizer = pickle_load(
            format_filename(PROCESSED_DATA_DIR,
                            TOKENIZER_TEMPLATE,
                            variation=variation,
                            level='word'))
        word_ids_test = create_token_ids_matrix(word_tokenizer, test_data,
                                                config.word_max_len)

        # prepare n-gram input
        vectorizer = pickle_load(
            format_filename(PROCESSED_DATA_DIR,
                            VECTORIZER_TEMPLATE,
                            variation=variation,
                            type='binary',
                            level='char',
                            ngram_range=(2, 3)))
        n_gram_test = vectorizer.transform(test_data)
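        # n_gram_test is a sparse binary indicator matrix over char 2-3 grams,
        # row-aligned with word_ids_test (assuming a scikit-learn style vectorizer)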