def train_model_peoe():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')

    char_embeds = []
    char_embed_matrix_oe = data_utils.load_our_embedding(vocab_dict)
    char_embeds.append(char_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        # for sg in [0,1]:
        embed_file = 'data/char_embed_{}_{}.model'.format(windows, sg)
        char_embed_tmp = data_utils.load_our_embedding(vocab_dict, model_file=embed_file,
                                                       dump_path='data/our_char_embed_{}_{}.pkl'.format(windows, sg))
        char_embeds.append(char_embed_tmp)

    term_embeds = []
    term_embed_matrix_oe = data_utils.load_our_embedding(term_vocab_dict, model_file='data/term_embed.model',
                                                         dump_path='data/our_term_embed.pkl')
    term_embeds.append(term_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        # for sg in [0,1]:
        embed_file = 'data/term_embed_{}_{}.model'.format(windows, sg)
        term_embed_tmp = data_utils.load_our_embedding(term_vocab_dict, model_file=embed_file,
                                                       dump_path='data/our_term_embed_{}_{}.pkl'.format(windows, sg))
        term_embeds.append(term_embed_tmp)


    MAX_LEN_TERM = 300
    name = 'hybriddpcnnmodel_PEOE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization: fit a MinMaxScaler on the features and persist it for reuse at inference time
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

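    # Position indices 0..MAX_LEN-1 for every sample; these feed the
    # position-embedding (PE) inputs of the models below.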
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

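    # Hold out a test set first, then carve a validation set out of the remaining
    # training data; both splits are deterministic (shuffle=False).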
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # After adding more embedding models, the learning rate must be lowered for training to converge properly.
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix, MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True,
                             name=name, char_embeds=char_embeds, term_embeds=term_embeds, lr=0.0004)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
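    # Second pass: reload the weights just trained, then fine-tune the embedding
    # layers (train_embed=True) while freezing the top layers (train_top=False).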
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix, MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True,
                             name=name, train_embed=True, train_top=False, lr=0.001, char_embeds=char_embeds, term_embeds=term_embeds)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)

# ---- Example 2 ----

if __name__ == '__main__':
    import sys
    tn_conf = TrainConfigure()
    if len(sys.argv) > 1 and sys.argv[1] == 'char':
        if len(sys.argv) > 2 and sys.argv[2] == 'pe':
            print('define char model with position embedding')
            print('load data')
            import data_utils, training_utils

            data_dict = data_utils.pickle_load(tn_conf.char_file)
            y = to_categorical(data_dict['y'])
            char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
            char_embed_matrix = data_utils.load_embedding(
                char_vocab_dict,
                'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                dump_path='data/char_embed.pkl')
            xe = [[i for i in range(600)] for _ in range(y.shape[0])]
            xe = np.array(xe)
            x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe],
                                                          y,
                                                          shuffle=False)
            x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                            y_tn,
                                                            shuffle=False)
            print('train')
            model = RCNNModel(MAX_LEN=600,
                              name='RCNNmodel_char_PE.h5',
                              embed_matrix=char_embed_matrix,
                              PE=True)
            model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)

# ---- Example 3 ----

import itertools

train_sq_len = get_sequence_length(train_tokens)
# MAXLEN = max(train_sq_len)
# AVR = sum(train_sq_len)/ len(train_sq_len)
# print(AVR)

valid_sq_len = get_sequence_length(valid_tokens)
test_sq_len = get_sequence_length(test_tokens)

# create vocabulary from the data itself
wordlist = itertools.chain.from_iterable(train_tokens)
word_index, _ = build_vocab(wordlist)

# load dependency embedding
dep_embedding_path = "dep_embedding/deps.contexts"
dep_embedding_index = load_embedding(dep_embedding_path)
dep_embedding_matrix = get_embedding_matrix(word_index, dep_embedding_index,
                                            FLAGS.word_embedding_size)
print("finish loading dep embedding")
fast_embedding_path = "fast-text/wiki.simple.vec"
fast_embedding_index = load_embedding(fast_embedding_path)
fast_embedding_matrix = get_embedding_matrix(word_index, fast_embedding_index,
                                             FLAGS.word_embedding_size)
print("finish loading fast embedding")
embedding_path = "glove.6B/glove.6B.{}d.txt".format(FLAGS.word_embedding_size)
embedding_index = load_embedding(embedding_path)
embedding_matrix = get_embedding_matrix(word_index, embedding_index,
                                        FLAGS.word_embedding_size)
print("finish loading linear embedding")
vocab_size = len(word_index)
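
# For reference, a minimal sketch (hypothetical, not the project's actual helper)
# of what get_embedding_matrix above is assumed to do, given load_embedding
# returning a dict of word -> vector:
#
#   def get_embedding_matrix(word_index, embedding_index, dim):
#       matrix = np.zeros((len(word_index) + 1, dim))
#       for word, idx in word_index.items():
#           vec = embedding_index.get(word)
#           if vec is not None:
#               matrix[idx] = vec  # words without a pretrained vector stay all-zero
#       return matrix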

# ---- Example 4 ----

def main(args):
    # dev data
    result_dev = data_utils.load_data(args.valid_data_file)
    print('result dev size : %s ' % len(result_dev))

    datasets_dev = data_utils.connect_metadata(result_dev)
    print('datasets dev size : %s ' % len(datasets_dev))

    datasets_pos_dev = data_utils.sperate_data(datasets_dev)

    # train data
    result = data_utils.load_data(args.train_data_file)
    print('result size : %s ' % len(result))

    datasets = data_utils.connect_metadata(result)
    print('dataset size : %s ' % len(datasets))

    datasets_pos = data_utils.sperate_data(datasets)

    word_vocab_list, pos_vocab_list, char_vocab_list = data_utils.creat_vocab(
        args)

    args.word_vocab_size = len(word_vocab_list)
    print('word size: %s' % len(word_vocab_list))

    args.pos_vocab_size = len(pos_vocab_list)
    print('pos size: %s ' % len(pos_vocab_list))

    args.char_vocab_size = len(char_vocab_list)
    print('char size: %s ' % len(char_vocab_list))

    word2id = data_utils.get_vocab2id(word_vocab_list)
    pos2id = data_utils.get_vocab2id(pos_vocab_list)
    char2id = data_utils.get_vocab2id(char_vocab_list)

    embed_word2id = data_utils.get_embed_word(args)
    print('embedding_word size: %s' % len(embed_word2id.keys()))

    pretrained_embedding = data_utils.load_embedding(args, word2id)
    pretrained_char_embedding = data_utils.load_embedding(args, char2id)

    train_datasets = data_utils.make_pading(datasets_pos, word2id, pos2id,
                                            char2id, args.max_length,
                                            args.max_topic_length,
                                            args.max_word_length)

    dev_datasets = data_utils.make_pading(datasets_pos_dev, word2id, pos2id,
                                          char2id, args.max_length,
                                          args.max_topic_length,
                                          args.max_word_length)

    with tf.Session() as sess:
        model = Triangulation_Approach_Community_QA.QASystem(
            sess,
            pretrained_embedding,
            pretrained_char_embedding,
            init_word_embed=True,
            args=args)

        sess.run(tf.global_variables_initializer())

        # model.load_checkpoints()

        if args.bool_train:
            print("-" * 20 + " Start Training: %s " % datetime.datetime.now() +
                  "-" * 20)
            start_time = time.time()
            max_apk = 0
            dev_datasets_batchs = data_utils.batch_iter(
                dev_datasets, args.batch_size)
            print('the dev data sets batch is : %s' % len(dev_datasets_batchs))

            for epoch in range(args.num_epochs):
                for batch in data_utils.batch_iter(train_datasets,
                                                   args.batch_size):
                    # print(batch)
                    # loss, acc, global_step, label, pred, logits = run_train(model, batch)
                    (question_topic, question_topic_pos, question_topic_char,
                     question_topic_length, question_topic_char_length) = batch[0:5]

                    (question, question_pos, question_char,
                     question_length, question_char_length) = batch[5:10]

                    (question_1_topic, question_1_topic_pos, question_1_topic_char,
                     question_1_topic_length, question_1_topic_char_length) = batch[10:15]

                    (question_1, question_1_pos, question_1_char,
                     question_1_length, question_1_char_length) = batch[15:20]

                    (comment_1, comment_1_pos, comment_1_char,
                     comment_1_length, comment_1_char_length) = batch[20:25]

                    (question_2_topic, question_2_topic_pos, question_2_topic_char,
                     question_2_topic_length, question_2_topic_char_length) = batch[25:30]

                    (question_2, question_2_pos, question_2_char,
                     question_2_length, question_2_char_length) = batch[30:35]

                    (comment_2, comment_2_pos, comment_2_char,
                     comment_2_length, comment_2_char_length) = batch[35:40]

                    label = batch[40]
                    '''
                     def train(self, q, q_pos, q_char, q_topic, q_l, q_char_l,
                      q1, q1_pos, q1_char, q1_topic, q1_l, q1_char_l,
                      c1, c1_pos, c1_char, c1_topic, c1_l, c1_char_l,
                      q2, q2_pos, q2_char, q2_topic, q2_l, q2_char_l,
                      c2, c2_pos, c2_char, c2_topic, c2_l, c2_char_l,
                      label, keep_prob):
                    '''
                    loss, acc, global_step, pred, logits = model.train(
                        np.array(question), np.array(question_pos),
                        np.array(question_char), np.array(question_topic),
                        question_length, question_char_length,
                        np.array(question_1), np.array(question_1_pos),
                        np.array(question_1_char), np.array(question_1_topic),
                        question_1_length, question_1_char_length,
                        np.array(comment_1), np.array(comment_1_pos),
                        np.array(comment_1_char), np.array(question_1_topic),
                        comment_1_length, comment_1_char_length,
                        np.array(question_2), np.array(question_2_pos),
                        np.array(question_2_char), np.array(question_2_topic),
                        question_2_length, question_2_char_length,
                        np.array(comment_2), np.array(comment_2_pos),
                        np.array(comment_2_char), np.array(question_2_topic),
                        comment_2_length, comment_2_char_length, label,
                        args.keep_prob)

                    print(
                        "| Epoch: {:2d}".format(epoch),
                        "| Step: {:4d}".format(global_step),
                        "| Time: {:3d}s".format(int(time.time() - start_time)),
                        "| Train Loss: {:.4f}".format(loss),
                        "| Train Acc: {:.4f}".format(acc))
                    # Print labels vs. predictions for a quick visual check
                    pred = list(pred)
                    print("Label: {}".format(label))
                    print("Pred:  {}".format(pred))
                    pred, label = merge_sort_part(logits, pred, label)
                    score = apk(label, pred)
                    if score > max_apk:
                        max_apk = score
                    print('the score is : %s , and the max apk is : %s ' %
                          (score, max_apk))
                    print('-' * 20)

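                    # Save at most args.model_count checkpoints once accuracy clears
                    # the threshold (args.beach_mark) and training has passed epoch 50.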
                    if acc > args.beach_mark and args.model_count and epoch > 50:
                        model.save()
                        args.model_count -= 1

            print("-" * 20 +
                  " Finished Training. %s " % datetime.datetime.now() +
                  "-" * 20)

        ###########################################################################

        if args.bool_test:
            test_data_result = data_utils.load_data(args.test_data_file_1)
            print('test data result size : %s' % len(test_data_result))

            test_data_datasets_pos = data_utils.sperate_test_data(
                test_data_result)

            test_datasets = data_utils.make_test_padding(
                test_data_datasets_pos, word2id, pos2id, char2id,
                args.max_length, args.max_topic_length, args.max_word_length)

            print("-" * 20 + " Start Testing: %s " % datetime.datetime.now() +
                  "-" * 20)
            start_time = time.time()
            # AP@k trackers; min_apk starts at +inf so the first score updates it
            max_apk, min_apk = 0.0, float('inf')

            NEG = [[0] * args.max_length]
            NEG_char = [[0] * args.max_length * 10]
            NEG_topic = [[0] * args.max_topic_length]
            NEG_topic_char = [[0] * args.max_topic_length * 10]
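            # The q2/c2 inputs are unused at test time; zero-filled placeholders
            # (with sequence lengths of 0) are fed in their place.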
            for key in test_datasets.keys():
                length = len(test_datasets[key])
                logits_list, pred_list, label_list = [], [], []
                for i in range(length):
                    '''
                    question_topic, question_topic_pos, question_topic_char, question_topic_length, question_topic_char_length,
                     question, question_pos, question_char, question_length, question_char_length,
                     question_1_topic, question_1_topic_pos, question_1_topic_char, question_1_topic_length,
                     question_1_topic_char_length,
                     question_1, question_1_pos, question_1_char, question_1_length, question_1_char_length,
                     comment_1, comment_1_pos, comment_1_char, comment_1_length, comment_1_char_length,
                    '''
                    question_topic = [test_datasets[key][i][0]]
                    question_topic_pos = [test_datasets[key][i][1]]
                    question_topic_char = [test_datasets[key][i][2]]
                    question_topic_length = [test_datasets[key][i][3]]
                    question_topic_char_length = [test_datasets[key][i][4]]

                    question = [test_datasets[key][i][5]]
                    question_pos = [test_datasets[key][i][6]]
                    question_char = [test_datasets[key][i][7]]
                    question_length = [test_datasets[key][i][8]]
                    question_char_length = [test_datasets[key][i][9]]

                    question_1_topic = [test_datasets[key][i][10]]
                    question_1_topic_pos = [test_datasets[key][i][11]]
                    question_1_topic_char = [test_datasets[key][i][12]]
                    question_1_topic_length = [test_datasets[key][i][13]]
                    question_1_topic_char_length = [test_datasets[key][i][14]]

                    question_1 = [test_datasets[key][i][15]]
                    question_1_pos = [test_datasets[key][i][16]]
                    question_1_char = [test_datasets[key][i][17]]
                    question_1_length = [test_datasets[key][i][18]]
                    question_1_char_length = [test_datasets[key][i][19]]

                    comment_1 = [test_datasets[key][i][20]]
                    comment_1_pos = [test_datasets[key][i][21]]
                    comment_1_char = [test_datasets[key][i][22]]
                    comment_1_length = [test_datasets[key][i][23]]
                    comment_1_char_length = [test_datasets[key][i][24]]

                    question_2_topic = NEG_topic
                    question_2_topic_pos = NEG_topic
                    question_2_topic_char = NEG_topic_char
                    question_2_topic_length = [0]
                    question_2_topic_char_length = [0]
                    question_2 = NEG
                    question_2_pos = NEG
                    question_2_char = NEG_char
                    question_2_length = [0]
                    question_2_char_length = [0]
                    comment_2 = NEG
                    comment_2_pos = NEG
                    comment_2_char = NEG_char
                    comment_2_length = [0]
                    comment_2_char_length = [0]

                    label = [test_datasets[key][i][25]]
                    '''
                     def train(self, q, q_pos, q_char, q_topic, q_l, q_char_l,
                      q1, q1_pos, q1_char, q1_topic, q1_l, q1_char_l,
                      c1, c1_pos, c1_char, c1_topic, c1_l, c1_char_l,
                      q2, q2_pos, q2_char, q2_topic, q2_l, q2_char_l,
                      c2, c2_pos, c2_char, c2_topic, c2_l, c2_char_l,
                      label, keep_prob):
                    '''
                    loss, acc, pred, logits = model.test(
                        np.array(question), np.array(question_pos),
                        np.array(question_char), np.array(question_topic),
                        question_length, question_char_length,
                        np.array(question_1), np.array(question_1_pos),
                        np.array(question_1_char), np.array(question_1_topic),
                        question_1_length, question_1_char_length,
                        np.array(comment_1), np.array(comment_1_pos),
                        np.array(comment_1_char), np.array(question_1_topic),
                        comment_1_length, comment_1_char_length,
                        np.array(question_2), np.array(question_2_pos),
                        np.array(question_2_char), np.array(question_2_topic),
                        question_2_length, question_2_char_length,
                        np.array(comment_2), np.array(comment_2_pos),
                        np.array(comment_2_char), np.array(question_2_topic),
                        comment_2_length, comment_2_char_length, label,
                        args.keep_prob)

                    print(
                        "| Time: {:3d}s".format(int(time.time() - start_time)),
                        "| Test Loss: {:.4f}".format(loss),
                        "| Test Acc: {:.4f}".format(acc))
                    # Accumulate logits, predictions, and labels for ranking below
                    logits_list += list(logits)
                    pred_list += list(pred)
                    label_list += label
                    # print("Label: {}".format(label))
                    # print("Pred:  {}".format(pred))
                    print("-" * 20)

                pred, label = merge_sort_part(logits_list, pred_list,
                                              label_list)
                score = apk(label, pred, length)
                if score > max_apk:
                    max_apk = score
                if score < min_apk:
                    min_apk = score
                print('the max apk is : %s and the min apk is : %s ' %
                      (max_apk, min_apk))

            print("-" * 20 + " End Testing: %s " % datetime.datetime.now() +
                  "-" * 20)

            print('max_apk:{}'.format(max_apk), 'min_apk:{}'.format(min_apk))

# ---- Example 5 ----

def train_main():
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # +37
    print('feat shape', xfeat.shape)
    import sys
    if len(sys.argv) <= 1 or sys.argv[1] == 'train':
        print('train')
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    if len(sys.argv) > 1 and sys.argv[1] == 'val':
        val_conf = ValidConfigure()
        data_dict = data_utils.pickle_load(val_conf.char_file)
        y = to_categorical(data_dict['y'])
        x = data_dict['x']
        ids = data_dict['id']
        xterm = data_utils.pickle_load(val_conf.term_file)
        xfeat = data_utils.pickle_load(val_conf.feat_file)
        xfeat = scaler.transform(xfeat)
        model.load_weights()
        model.test([x, xterm, xfeat], ids, val_conf.out_file)

    if len(sys.argv) > 1 and sys.argv[1] == 'error':
        start_index = y_tn.shape[0] + y_val.shape[0]
        texts = data_utils.load_all_text(tn_conf)
        model.load_weights()
        model.error_analysis(x_ts, y_ts, texts, start_index)

def train_model_cv(cv_index, cv_num):
    print('train condition conv model with PE\nload data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'conditionconvmodel_cv{}.h5'.format(cv_index)
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x, xe, xterm, xe_term, xfeat, xt],
        y,
        cv_index=cv_index,
        cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN,
                               NUM_FEAT=8,
                               PE=True,
                               name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN,
                               NUM_FEAT=8,
                               PE=True,
                               name=name,
                               train_embed=True,
                               train_top=False,
                               lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)

# ---- Example 7 ----

def train_model_ftoe(model_conf,
                     name,
                     ModelClass,
                     char_embed_file=None,
                     term_embed_file=None):
    print(name)
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    # char_embed_matrix = data_utils.load_embedding(vocab_dict,
    #                                               'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        char_embed_file,
        dump_path='data/{}.pkl'.format(char_embed_file[5:]))
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        term_embed_file,
        dump_path='data/{}.pkl'.format(term_embed_file[5:]))
    MAX_LEN_TERM = 300
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, split_ratio=0.95, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                    y_tn,
                                                    split_ratio=0.95,
                                                    shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)

# ---- Example 8 ----

    def __init__(self,
                 session,
                 batch,
                 vocab,
                 load_embeddings=False,
                 sentence_len=30,
                 calculate_loss=True):
        """
        Sets up the graph of our RNN model.
            session           Tensorflow session object
            batch             The batch of sentences
            vocab             A dictionary mapping token strings to vocabulary IDs
            load_embeddings   Word embeddings our model will use (default: our own)
            sentence_len      The length of  one sentence  (default: 30)
            calculate_loss    True if the model is in training phase (default:True)
        
        """
        self.sentence_len = sentence_len
        self.calculate_loss = calculate_loss
        self.word_inputs = batch
        self.initializer = tf.contrib.layers.xavier_initializer

        if FLAGS.lstm_size > FLAGS.default_lstm_size:
            print(
                f"Running with downsize layer from {FLAGS.lstm_size} to {FLAGS.default_lstm_size}!"
            )

        with tf.name_scope("embedding"):

            self.embedding_matrix = tf.get_variable(
                "embedding_matrix",
                initializer=tf.random_uniform(
                    [FLAGS.vocab_size, FLAGS.embedding_dim], -1.0, 1.0),
                dtype=tf.float32,
                trainable=True)

            if load_embeddings:
                data_utils.load_embedding(session, vocab,
                                          self.embedding_matrix,
                                          FLAGS.path_embeddings,
                                          FLAGS.embedding_dim,
                                          FLAGS.vocab_size)

            self.embedded_words = tf.nn.embedding_lookup(
                self.embedding_matrix, self.word_inputs
            )  # DIM [batch_size, sentence_len, embedding_dim]

        with tf.name_scope("rnn"):
            # Stacked LSTM layers architecture, with 2 layers
            lstms = (tf.contrib.rnn.LSTMBlockCell(num_units=FLAGS.lstm_size,
                                                  dtype=tf.float32),
                     tf.contrib.rnn.LSTMBlockCell(num_units=FLAGS.lstm_size,
                                                  dtype=tf.float32))

            with tf.variable_scope('hidden_state'):
                # Placeholders holding the values of the hidden states (default: zero matrices)
                self.lstm_c1 = tf.placeholder_with_default(
                    np.zeros((FLAGS.batch_size, FLAGS.lstm_size),
                             dtype=np.float32),
                    shape=[None, lstms[0].state_size.c],
                    name='c1_in')
                self.lstm_h1 = tf.placeholder_with_default(
                    np.zeros((FLAGS.batch_size, FLAGS.lstm_size),
                             dtype=np.float32),
                    shape=[None, lstms[0].state_size.h],
                    name='h1_in')
                self.lstm_c2 = tf.placeholder_with_default(
                    np.zeros((FLAGS.batch_size, FLAGS.lstm_size),
                             dtype=np.float32),
                    shape=[None, lstms[1].state_size.c],
                    name='c2_in')
                self.lstm_h2 = tf.placeholder_with_default(
                    np.zeros((FLAGS.batch_size, FLAGS.lstm_size),
                             dtype=np.float32),
                    shape=[None, lstms[1].state_size.h],
                    name='h2_in')

                state_in1 = tf.contrib.rnn.LSTMStateTuple(
                    self.lstm_c1, self.lstm_h1)
                state_in2 = tf.contrib.rnn.LSTMStateTuple(
                    self.lstm_c2, self.lstm_h2)
                self.states = [state_in1, state_in2]

            # Add a down size matrix if necessary
            if FLAGS.lstm_size > FLAGS.default_lstm_size:
                down_size = tf.get_variable(
                    "down_size", [FLAGS.lstm_size, FLAGS.default_lstm_size])

            self.W_h = tf.get_variable(
                "W_h", [FLAGS.default_lstm_size, FLAGS.vocab_size],
                tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.b_h = tf.get_variable("b_h", [FLAGS.vocab_size],
                                       tf.float32,
                                       initializer=tf.zeros_initializer())

            self.predictions = []
            self.next_words_probs = []
            self.loss = 0.0
            extra_count_for_predict = int(not self.calculate_loss)
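            # At inference time (calculate_loss == False) the loop below runs one
            # extra step so the word after the last input token is also predicted.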

            for i in range(self.sentence_len - 1 + extra_count_for_predict):
                # words = [batch_size, embeddings] #next_words_index = [batch_size]
                words = self.embedded_words[:, i, :]
                # First layer: input word is the actual word
                output, self.states[0] = lstms[0](
                    words,
                    self.states[0])  # output = [batch_size, embedding_dim]
                # add dropout layer
                if FLAGS.enable_dropout:
                    output = tf.nn.dropout(output, rate=1 - FLAGS.keep_prob)
                # Second layer: input word is the prediction from the first layer
                output, self.states[1] = lstms[1](output, self.states[1])
                # add second dropout layer
                if FLAGS.enable_dropout:
                    output = tf.nn.dropout(output, rate=1 - FLAGS.keep_prob)
                # Down-project the output if necessary
                if FLAGS.lstm_size > FLAGS.default_lstm_size:
                    output = tf.matmul(output, down_size)
                # The output comes from the second layer
                self.logits = tf.matmul(
                    output, self.W_h
                ) + self.b_h  # logits = [batch_size, VOCABULARY_LEN]

                probabilities = tf.nn.softmax(
                    self.logits, name="softmax_probs"
                )  # size = [batch_size, VOCABULARY_LENGTH]

                if self.calculate_loss:
                    next_words_index = self.word_inputs[:, i + 1]
                    # [0, word i + 1 from B0], ... , [63, word i + 1 from B63]
                    indices_of_next_words = tf.stack(
                        [tf.range(FLAGS.batch_size), next_words_index], axis=1)
                    # The probas of the words that should have been predicted
                    next_word_prob = tf.gather_nd(probabilities,
                                                  indices_of_next_words)
                    self.next_words_probs.append(next_word_prob)

                    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.logits, labels=next_words_index)
                    self.loss = tf.math.add(self.loss,
                                            tf.reduce_mean(losses),
                                            name="cross_entropy_loss")

                self.predicted_words = tf.argmax(self.logits,
                                                 axis=1,
                                                 name="predicted_words")
                self.predictions.append(self.predicted_words)

            if self.calculate_loss:
                self.next_words_probs = tf.stack(
                    self.next_words_probs, axis=1,
                    name="probs")  # [batch_size, sentence_length]
                words = self.word_inputs[:, 1:]  # Skipping <bos>

        # Calculating accuracy
        with tf.name_scope("accuracy"):
            if self.calculate_loss:
                correct_predictions = tf.equal(
                    self.predictions,
                    tf.transpose(tf.cast(self.word_inputs[:, 1:], tf.int64)))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       tf.float64),
                                               name="accuracy")

# ---- Example 9 ----

def stacking_main_condition():
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    xtopic = data_utils.pickle_load('data/lda_vec.pkl')

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')

    # 3-fold variant (superseded by the 5-fold settings below):
    # name = 'model/stack_condition_model.pkl'
    # model_dir = 'model/stack/'
    # n_fold = 3
    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    stk_model = stacking(n_fold, name=name, is_condition=True)
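    # Register the condition models the n-fold stacker will train and combine.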
    conf = conditionmodelbase.ModelConfigure()
    conf.PE = True
    stk_model.add_model(ConditionConvModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                             "term_embed_matrix": term_embed_matrix,
                                             "name": model_dir + 'conditionconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedConvModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                                  "term_embed_matrix": term_embed_matrix,
                                                  "name": model_dir + 'conditiongatedconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedDeepCNNModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                                     "term_embed_matrix": term_embed_matrix,
                                                     "name": model_dir + 'conditiongateddeepcnnmodel_PE.h5'})
    conf.lr = 0.0005
    stk_model.add_model(ConditionDPCNNModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                              "term_embed_matrix": term_embed_matrix,
                                              "name": model_dir + 'conditiondpcnnmodel_PE.h5'})
    # Subsample a small fraction of the data for quick testing (commented out):
    # x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.005, shuffle=False)
    # x_tn, y_tn, x_ts, y_ts = training_utils.split(x_tn, y_tn, shuffle=False)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.95)
    stk_model.fit(x_tn, y_tn)
    # joblib.dump(stk_model, 'model/stack_model_3.pkl')
    y_pred = stk_model.predict(x_ts)
    acc = accuracy_score(training_utils.convert_y(y_pred), training_utils.convert_y(y_ts))
    print(acc)
    cnf_matrix = confusion_matrix(training_utils.convert_y(y_pred), training_utils.convert_y(y_ts))
    print(cnf_matrix)
    stk_model.save()

def predict(tn_conf, lda_file, val_conf, val_conf100, val_conf200):
    """
    Probability-based ensembling: run each trained model and stack its class
    probabilities.
    :return: array of per-model probability predictions
    """
    print('load data')
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    xtopic = data_utils.pickle_load(lda_file)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    data_dict = data_utils.pickle_load(val_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = data_utils.pickle_load(val_conf.feat_file)
    xfeat = scaler.transform(xfeat)
    print('feat shape', xfeat.shape)

    data_dict100 = data_utils.pickle_load(val_conf100.char_file)
    x100 = data_dict100['x']
    xterm100 = data_utils.pickle_load(val_conf100.term_file)
    xe100 = [[i for i in range(100)] for _ in range(y.shape[0])]
    xe100 = np.array(xe100)
    xe_term100 = [[i for i in range(100)] for _ in range(y.shape[0])]
    xe_term100 = np.array(xe_term100)

    data_dict200 = data_utils.pickle_load(val_conf200.char_file)
    x200 = data_dict200['x']
    xterm200 = data_utils.pickle_load(val_conf200.term_file)
    xe200 = [[i for i in range(200)] for _ in range(y.shape[0])]
    xe200 = np.array(xe200)
    xe_term200 = [[i for i in range(200)] for _ in range(y.shape[0])]
    xe_term200 = np.array(xe_term200)

    ys = []
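    # Each model below contributes one matrix of class probabilities to ys.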
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddensemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model

    model = HybridDenseMAModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8,
                               PE=True,
                               name='hybriddensemodelma_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('dense model done.')

    model = HybridSEModel(char_embed_matrix=char_embed_matrix,
                          term_embed_matrix=term_embed_matrix,
                          NUM_FEAT=8,
                          PE=True,
                          name='hybridsemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('se model done.')

    print('start len 100 model')
    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            MAX_LEN=100,
                            MAX_LEN_TERM=100,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridconvmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    MAX_LEN=100,
                                    MAX_LEN_TERM=100,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='hybridgateddeepcnnmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            MAX_LEN=100,
                            MAX_LEN_TERM=100,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridrcnnmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid RCNN model done.')

    # print('start len 200 model')
    # model = HybridConvModel(char_embed_matrix=char_embed_matrix,
    #                         term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                         PE=True, name='hybridconvmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid conv model done.')
    #
    # model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
    #                          term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                          PE=True, name='hybriddpcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid dpcnn model done.')
    #
    # model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
    #                                 term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                                 PE=True, name='hybridgateddeepcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid gated deep cnn model done.')
    #
    # model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
    #                         term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                         PE=True, name='hybridrcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model

    # This model is too slow, so it is skipped:
    # model = ConditionAttModel(char_embed_matrix=char_embed_matrix,
    #                           term_embed_matrix=term_embed_matrix, NUM_FEAT=8, PE=True,
    #                           name='conditionattmodel_PE.h5', lr=0.001)
    # model.load_weights()
    # y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    # ys.append(y)
    # print('condition att model done.')

    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8,
                               PE=True,
                               name='conditionconvmodel_PE.h5',
                               lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition conv model done.')

    model = ConditionDPCNNModel(char_embed_matrix=char_embed_matrix,
                                term_embed_matrix=term_embed_matrix,
                                NUM_FEAT=8,
                                PE=True,
                                name='conditiondpcnnmodel_PE.h5',
                                lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition dpcnn model done.')

    model = ConditionGatedConvModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='conditiongatedconvmodel_PE.h5',
                                    lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated conv model done.')

    model = ConditionGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                       term_embed_matrix=term_embed_matrix,
                                       NUM_FEAT=8,
                                       PE=True,
                                       name='conditiongateddeepcnnmodel_PE.h5',
                                       lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated deepcnn model done.')

    model = HybridAttModel(char_embed_matrix=char_embed_matrix,
                           term_embed_matrix=term_embed_matrix,
                           NUM_FEAT=8,
                           PE=True,
                           name='hybridattmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    print('hybrid att model done.')

    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridconvmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')

    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddpcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid dpcnn model done.')

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='hybridgateddeepcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridrcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid rcnn model done.')

    model = HybridGatedConvTopicModel(char_embed_matrix=char_embed_matrix,
                                      term_embed_matrix=term_embed_matrix,
                                      NUM_FEAT=8,
                                      PE=True,
                                      name='hybridgatedconvtopicmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    print('hybrid gated conv topic done.')

    y = fasttextmodel.predict_char()
    ys.append(y)

    y = fasttextmodel.predict_term()
    ys.append(y)
    print(y.shape)
    print('fast text done.')

    #hybrid model
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # + 37
    model.load_weights()
    y = model.predict([x, xterm, xfeat])
    ys.append(y)
    print(y.shape)
    print('hybrid model done.')
    ys = np.array(ys)
    print(ys.shape)
    return ys
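
# A minimal usage sketch (an assumption, not shown in the original code) of how
# the per-model probabilities returned by predict() could be combined:
#   ys = predict(tn_conf, lda_file, val_conf, val_conf100, val_conf200)
#   y_mean = ys.mean(axis=0)            # average class probabilities across models
#   labels = np.argmax(y_mean, axis=1)  # most probable class per sample
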
def predict_all():
    """
    根据概率集成
    :return:
    """
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    xtopic = data_utils.pickle_load('data/lda_vec_val.pkl')

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    val_conf = ValidConfigure()
    data_dict = data_utils.pickle_load(val_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = data_utils.pickle_load(val_conf.feat_file)
    xfeat = scaler.transform(xfeat)
    print('feat shape', xfeat.shape)

    import data_utils100
    val_conf100 = data_utils100.ValidConfigure()
    data_dict100 = data_utils.pickle_load(val_conf100.char_file)
    x100 = data_dict100['x']
    xterm100 = data_utils.pickle_load(val_conf100.term_file)
    xe100 = [[i for i in range(100)] for _ in range(y.shape[0])]
    xe100 = np.array(xe100)
    xe_term100 = [[i for i in range(100)] for _ in range(y.shape[0])]
    xe_term100 = np.array(xe_term100)

    import data_utils200
    val_conf200 = data_utils200.ValidConfigure()
    data_dict200 = data_utils.pickle_load(val_conf200.char_file)
    x200 = data_dict200['x']
    xterm200 = data_utils.pickle_load(val_conf200.term_file)
    xe200 = [[i for i in range(200)] for _ in range(y.shape[0])]
    xe200 = np.array(xe200)
    xe_term200 = [[i for i in range(200)] for _ in range(y.shape[0])]
    xe_term200 = np.array(xe_term200)

    ys = []
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddensemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model

    model = HybridDenseMAModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8,
                               PE=True,
                               name='hybriddensemodelma_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('dense model done.')

    model = HybridSEModel(char_embed_matrix=char_embed_matrix,
                          term_embed_matrix=term_embed_matrix,
                          NUM_FEAT=8,
                          PE=True,
                          name='hybridsemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('se model done.')

    print('start len 100 model')
    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            MAX_LEN=100,
                            MAX_LEN_TERM=100,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridconvmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')

    # model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
    #                          term_embed_matrix=term_embed_matrix, MAX_LEN=100, MAX_LEN_TERM=100,NUM_FEAT=8,
    #                          PE=True, name='hybriddpcnnmodel_n100.h5')
    # model.load_weights()
    # y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid dpcnn model done.')

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    MAX_LEN=100,
                                    MAX_LEN_TERM=100,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='hybridgateddeepcnnmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            MAX_LEN=100,
                            MAX_LEN_TERM=100,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridrcnnmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid RCNN model done.')

    # print('start len 200 model')
    # model = HybridConvModel(char_embed_matrix=char_embed_matrix,
    #                         term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                         PE=True, name='hybridconvmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid conv model done.')
    #
    # model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
    #                          term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                          PE=True, name='hybriddpcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid dpcnn model done.')
    #
    # model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
    #                                 term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                                 PE=True, name='hybridgateddeepcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid gated deep cnn model done.')
    #
    # model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
    #                         term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                         PE=True, name='hybridrcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model

    # This model is too slow.
    # model = ConditionAttModel(char_embed_matrix=char_embed_matrix,
    #                           term_embed_matrix=term_embed_matrix, NUM_FEAT=8, PE=True,
    #                           name='conditionattmodel_PE.h5', lr=0.001)
    # model.load_weights()
    # y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    # ys.append(y)
    # print('condition att model done.')

    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8,
                               PE=True,
                               name='conditionconvmodel_PE.h5',
                               lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition conv model done.')

    model = ConditionDPCNNModel(char_embed_matrix=char_embed_matrix,
                                term_embed_matrix=term_embed_matrix,
                                NUM_FEAT=8,
                                PE=True,
                                name='conditiondpcnnmodel_PE.h5',
                                lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition dpcnn model done.')

    model = ConditionGatedConvModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='conditiongatedconvmodel_PE.h5',
                                    lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated conv model done.')

    model = ConditionGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                       term_embed_matrix=term_embed_matrix,
                                       NUM_FEAT=8,
                                       PE=True,
                                       name='conditiongateddeepcnnmodel_PE.h5',
                                       lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated deepcnn model done.')

    # This model is too slow.
    # model = ConditionRCNNModel(char_embed_matrix=char_embed_matrix,
    #                                    term_embed_matrix=term_embed_matrix, NUM_FEAT=8, PE=True,
    #                                    name='conditionrcnnmodel_PE.h5', lr=0.001)
    # model.load_weights()
    # y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    # ys.append(y)
    # print('condition rcnn model done.')

    model = HybridAttModel(char_embed_matrix=char_embed_matrix,
                           term_embed_matrix=term_embed_matrix,
                           NUM_FEAT=8,
                           PE=True,
                           name='hybridattmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)

    model = HybridAttModel(char_embed_matrix=char_embed_matrix,
                           term_embed_matrix=term_embed_matrix,
                           NUM_FEAT=8,
                           name='hybridattmodel.h5')
    model.load_weights()
    y = model.predict([x, xterm, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid att model done.')

    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridconvmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model

    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            name='hybridconvmodel.h5')
    model.load_weights()
    y = model.predict([x, xterm, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')

    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddpcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model

    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             name='hybriddpcnnmodel.h5')
    model.load_weights()
    y = model.predict([x, xterm, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid dpcnn model done.')

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='hybridgateddeepcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    name='hybridgateddeepcnnmodel.h5')
    model.load_weights()
    y = model.predict([x, xterm, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridrcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            name='hybridrcnnmodel.h5')
    model.load_weights()
    y = model.predict([x, xterm, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid rcnn model done.')

    model = ConditionModel(embed_matrix=char_embed_matrix)
    model.load_weights()
    y = model.predict(x)
    ys.append(y)

    model = ConditionModel(embed_matrix=char_embed_matrix,
                           PE=True,
                           name='conditionmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(y)
    del model

    model = ConditionModel(embed_matrix=term_embed_matrix,
                           MAX_LEN=300,
                           name='conditionmodel_term.h5')
    model.load_weights()
    y = model.predict(xterm)
    ys.append(y)

    model = ConditionModel(embed_matrix=term_embed_matrix,
                           MAX_LEN=300,
                           PE=True,
                           name='conditionmodel_term_PE.h5')
    model.load_weights()
    y = model.predict([xterm, xe_term])
    ys.append(y)
    del model
    print('condition model done.')

    model = GatedConvTopicModel(embed_matrix=char_embed_matrix,
                                PE=True,
                                name='gatedconvtopicmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xtopic])
    ys.append(y)
    print('gated conv topic done.')

    model = HybridGatedConvTopicModel(char_embed_matrix=char_embed_matrix,
                                      term_embed_matrix=term_embed_matrix,
                                      NUM_FEAT=8,
                                      PE=True,
                                      name='hybridgatedconvtopicmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)

    model = HybridGatedConvTopicModel(char_embed_matrix=char_embed_matrix,
                                      term_embed_matrix=term_embed_matrix,
                                      NUM_FEAT=8,
                                      name='hybridgatedconvtopicmodel.h5')
    model.load_weights()
    y = model.predict([x, xterm, xfeat, xtopic])
    ys.append(y)
    print('hybrid gated conv topic done.')

    model = RCNNModel(MAX_LEN=300,
                      embed_matrix=term_embed_matrix,
                      name='RCNNmodel.h5')
    model.load_weights()
    y = model.predict(xterm)
    ys.append(y)
    print('RCNN done.')

    y = fasttextmodel.predict_char()
    ys.append(y)

    y = fasttextmodel.predict_term()
    ys.append(y)
    print(y.shape)
    print('fast text done.')

    #hybrid model
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # + 37
    model.load_weights()
    y = model.predict([x, xterm, xfeat])
    ys.append(y)
    print(y.shape)
    print('hybrid model done.')
    #CNN model (char)
    model = CharModel(embed_matrix=char_embed_matrix)
    model.load_weights()
    y = model.predict(x)
    ys.append(y)

    model = CharModel(embed_matrix=char_embed_matrix,
                      name='charmodel_PE.h5',
                      PE=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(y)

    model = CharModel(embed_matrix=char_embed_matrix,
                      name='charmodel_PE_OE.h5',
                      PE=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(y)

    print('char model done.')

    #CNN (term)
    model = TermModel(embed_matrix=term_embed_matrix)
    model.load_weights()
    y = model.predict(xterm)
    ys.append(y)
    print('term model done.')

    model = DeepCNNModel(embed_matrix=char_embed_matrix)
    model.load_weights()
    y = model.predict(x)
    ys.append(y)
    print('deep cnn done.')
    # attention model (char)
    model = AttModel(MAX_LEN=600,
                     name='charattmodel.h5',
                     embed_matrix=char_embed_matrix)
    model.load_weights()
    y = model.predict(x)
    ys.append(y)
    print('att char done.')

    # attention model (term)
    model = AttModel(MAX_LEN=300, embed_matrix=term_embed_matrix)
    model.load_weights()
    y = model.predict(xterm)
    ys.append(y)
    print('att term done.')

    model = SSCharModel(embed_matrix=char_embed_matrix,
                        name='sscharmodel_PE.h5',
                        PE=True,
                        train_embed=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(y)

    model = SSCharModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(y)
    print('conv model with second learning passes done.')

    model = GatedConvModel(embed_matrix=char_embed_matrix,
                           name='gatedconvmodel_PE.h5',
                           PE=True)
    model.load_weights()
    y = model.predict([x, xe])
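    # Note: unlike every other model here, this prediction is converted to
    # one-hot (a hard vote) before joining the probability ensemble.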
    ys.append(convert_onehot(y))

    model = GatedConvModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(y)
    print('gated conv done.')

    model = GatedDeepCNNModel(embed_matrix=char_embed_matrix,
                              name='gateddeepcnnmodel_PE.h5',
                              PE=True,
                              train_embed=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(y)

    model = GatedDeepCNNModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(y)
    print('gated deep cnn done.')

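    # Class labels: human author, auto-generated summary, machine author,
    # machine translation. Soft-voting ensemble: average the per-model class
    # probabilities, then convert to hard label indices.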
    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']
    y_pred = np.mean(ys, axis=0)
    y_pred = convert_y(y_pred)
    out_file = 'result.csv'
    with open(out_file, 'w', encoding='utf-8') as fout:
        for id, yi in zip(ids, y_pred):
            label = labels[yi]
            fout.write('{},{}\n'.format(id, label))
    print('done.')
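
# The load-weights / predict / append pattern above repeats for every model.
# A minimal refactoring sketch, assuming each model class exposes the
# load_weights() and predict() methods used throughout this file
# ('build_model' is a hypothetical zero-argument factory, not project code):
def collect_prediction(build_model, inputs, ys):
    """Instantiate a model, load its weights, predict, and free the memory."""
    model = build_model()
    model.load_weights()
    ys.append(model.predict(inputs))
    del model

# Usage sketch:
# collect_prediction(
#     lambda: HybridRCNNModel(char_embed_matrix=char_embed_matrix,
#                             term_embed_matrix=term_embed_matrix,
#                             NUM_FEAT=8, PE=True,
#                             name='hybridrcnnmodel_PE.h5'),
#     [x, xe, xterm, xe_term, xfeat, xtopic], ys)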
# Example 12
def train_model_tfidf(model_conf,
                      model_name='hybridconvmodel_tfidf.h5',
                      ModelClass=HybridModelBase):
    print(model_name)
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    x_tfidf, xterm_tfidf = data_utils.pickle_load(conf.tfidf_file)
    print('tfidf shape', x_tfidf.shape, xterm_tfidf.shape)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt, x_tfidf, xterm_tfidf],
        y,
        shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
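    # Stage 1: train the classification layers with the embeddings frozen.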
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=False,
                       train_top=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
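    # Stage 2: halve the learning rate and fine-tune the embeddings while
    # freezing the already-trained top layers.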
    model_conf.lr *= 0.5
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
# Example 13
# create word vocabulary from the data itself
wordlist = itertools.chain.from_iterable(train_tokens)
word_index, _ = build_vocab(wordlist)

# load dependency embedding
dep_embedding_path = "dep_embedding/deps.contexts"
#dep_embedding_index = load_embedding(dep_embedding_path)
dep_embedding_index = load_dep_embedding(dep_embedding_path)
dep_embedding_matrix = get_embedding_matrix(word_index, dep_embedding_index, FLAGS.word_embedding_size)
print("finish loading dependency embedding")
# Debug check (commented out so the script runs past this point):
# print(dep_embedding_index.get("the"))
# exit()

embedding_path = "glove.6B/glove.6B.{}d.txt".format(FLAGS.word_embedding_size)
embedding_index = load_embedding(embedding_path)
embedding_matrix = get_embedding_matrix(word_index, embedding_index, FLAGS.word_embedding_size)
print("finish loading Glove embedding")
# Debug checks (commented out so execution continues):
# print(len(dep_embedding_matrix[5]))
# print(len(embedding_matrix[5]))
# exit()
vocab_size = len(word_index)
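
# Illustrative sketch (an assumption, not this project's actual helper) of what
# get_embedding_matrix above is expected to do: build a (vocab_size + 1, dim)
# matrix holding the pretrained vector for every known word and zeros elsewhere.
def get_embedding_matrix_sketch(word_index, embedding_index, dim):
    import numpy as np
    matrix = np.zeros((len(word_index) + 1, dim))  # row 0 reserved for padding
    for word, idx in word_index.items():
        vector = embedding_index.get(word)
        if vector is not None:
            matrix[idx] = vector  # unseen words keep all-zero rows
    return matrix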

# convert words to indices including padding and cutting
train_x = tokens_to_indices(word_index, train_tokens, MAXLEN)
valid_x = tokens_to_indices(word_index, valid_tokens, MAXLEN)
test_x = tokens_to_indices(word_index, test_tokens, MAXLEN)
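
# Illustrative sketch (an assumption, not the project's helper) of
# tokens_to_indices: map tokens to ids, truncate to maxlen, pad with zeros.
def tokens_to_indices_sketch(word_index, token_lists, maxlen):
    import numpy as np
    rows = []
    for tokens in token_lists:
        ids = [word_index.get(tok, 0) for tok in tokens][:maxlen]
        rows.append(ids + [0] * (maxlen - len(ids)))  # right-pad to maxlen
    return np.array(rows)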

# get topic sequence
max_topic = 5
train_topic = tokens_to_indices(topic_index, train_topic, max_topic)