コード例 #1
0
def train_model(use_char=True):
    """Train ``ConditionConvModel`` in two passes on the pickled training set.

    Pass one builds the model with its default settings and trains it.
    Pass two rebuilds it with ``train_embed=True``, ``train_top=False`` and
    ``lr=0.001``, calls ``load_weights()`` and trains again on the same
    splits.

    Side effect: the fitted ``MinMaxScaler`` is pickled to ``conf.feat_norm``.

    :param use_char: accepted for interface compatibility; not read here.
    """
    print('train condition conv model.\nload data')
    import data_utils, training_utils

    pretrained_vectors = (
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5')

    cfg = data_utils.TrainConfigure()
    char_data = data_utils.pickle_load(cfg.char_file)

    print('loading embed ...')
    char_vocab = data_utils.pickle_load(cfg.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab, pretrained_vectors, dump_path='data/char_embed.pkl')

    char_ids = char_data['x']
    term_ids = data_utils.pickle_load(cfg.term_file)
    term_vocab = data_utils.pickle_load(cfg.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab, pretrained_vectors, dump_path='data/term_embed.pkl')
    print('load embed done.')

    labels = to_categorical(char_data['y'])
    topic_vecs = data_utils.pickle_load('data/lda_vec.pkl')

    # Scale the hand-crafted features and persist the fitted scaler.
    raw_feats = data_utils.pickle_load(cfg.feat_file)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler().fit(raw_feats)
    data_utils.pickle_dump(scaler, cfg.feat_norm)
    scaled_feats = scaler.transform(raw_feats)

    # First split carves off the test part, second one the validation part.
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [char_ids, term_ids, scaled_feats, topic_vecs], labels, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)

    print('train')
    print('define model')
    shared_kwargs = dict(char_embed_matrix=char_embed_matrix,
                         term_embed_matrix=term_embed_matrix,
                         MAX_LEN=600,
                         NUM_FEAT=8,
                         name='conditionconvmodel.h5')

    model = ConditionConvModel(**shared_kwargs)
    # Only the train and validation parts go through gen_train; the test
    # part is handed to train() as returned by split().
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second pass: same architecture, different trainability flags and lr.
    model = ConditionConvModel(train_embed=True,
                               train_top=False,
                               lr=0.001,
                               **shared_kwargs)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
コード例 #2
0
def train_model_n200(model_conf, name, ModelClass):
    """Train ``ModelClass`` in two passes on the ``data_utils200`` dataset.

    Pass one builds ``ModelClass(model_conf, ...)`` and trains it. Pass two
    rebuilds it with ``train_embed=True`` and ``train_top=False``, calls
    ``load_weights()`` and trains again on the same splits.

    Side effect: the fitted ``MinMaxScaler`` is pickled to ``conf.feat_norm``.

    :param model_conf: passed through as the first positional argument of
        ``ModelClass``.
    :param name: printed at start and forwarded to the model as ``name``.
    :param ModelClass: model class to instantiate; must provide
        ``gen_train``, ``train`` and ``load_weights``.
    """
    print(name)
    import data_utils200 as data_utils
    # The sibling training functions import training_utils locally; this one
    # used it without importing it, which fails unless the module happens to
    # be imported at file level.
    import training_utils

    pretrained_vectors = (
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5')

    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict, pretrained_vectors, dump_path='data/char_embed.pkl')

    MAX_LEN = conf.MAX_LEN
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, pretrained_vectors, dump_path='data/term_embed.pkl')
    # Char and term sequences share the same configured length here.
    MAX_LEN_TERM = conf.MAX_LEN
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # Normalise the features and persist the fitted scaler.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    # Position indices 0..L-1, one identical row per sample.
    n_samples = y.shape[0]
    xe = np.tile(np.arange(MAX_LEN), (n_samples, 1))
    xe_term = np.tile(np.arange(MAX_LEN_TERM), (n_samples, 1))

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second pass with different trainability flags.
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
コード例 #3
0
def train_model_cv( cv_index, cv_num ):
    """Train ``HybridDenseModel`` on one cross-validation fold, in two passes.

    The fold is selected with ``training_utils.split_cv``; the remaining
    training part is split once more (unshuffled) into train/validation.
    Pass one trains the default model, pass two rebuilds it with
    ``train_embed=True``, ``train_top=False``, ``lr=0.001``, calls
    ``load_weights()`` and trains again.

    Side effect: the fitted ``MinMaxScaler`` is pickled to ``conf.feat_norm``.

    :param cv_index: fold index; forwarded to ``split_cv`` and embedded in
        the model name ``hybriddensemodel_cv{cv_index}.h5``.
    :param cv_num: forwarded to ``split_cv`` as ``cv_num``.
    """
    print('load data')
    import data_utils, training_utils

    pretrained_vectors = (
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5')
    char_len, term_len = 600, 300

    cfg = data_utils.TrainConfigure()
    char_data = data_utils.pickle_load(cfg.char_file)

    print('loading embed ...')
    char_vocab = data_utils.pickle_load(cfg.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab, pretrained_vectors, dump_path='data/char_embed.pkl')

    char_ids = char_data['x']
    term_ids = data_utils.pickle_load(cfg.term_file)
    term_vocab = data_utils.pickle_load(cfg.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab, pretrained_vectors, dump_path='data/term_embed.pkl')
    model_name = 'hybriddensemodel_cv{}.h5'.format(cv_index)
    print('load embed done.')

    labels = to_categorical(char_data['y'])
    topic_vecs = data_utils.pickle_load('data/lda_vec.pkl')

    # Scale the hand-crafted features and persist the fitted scaler.
    raw_feats = data_utils.pickle_load(cfg.feat_file)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler().fit(raw_feats)
    data_utils.pickle_dump(scaler, cfg.feat_norm)
    scaled_feats = scaler.transform(raw_feats)

    # Position indices 0..L-1, one identical row per sample.
    n_samples = labels.shape[0]
    char_pos = np.array([list(range(char_len))] * n_samples)
    term_pos = np.array([list(range(term_len))] * n_samples)

    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [char_ids, char_pos, term_ids, term_pos, scaled_feats, topic_vecs],
        labels,
        cv_index=cv_index,
        cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)

    print('train')
    print('define model')
    shared_kwargs = dict(char_embed_matrix=char_embed_matrix,
                         term_embed_matrix=term_embed_matrix,
                         MAX_LEN=char_len,
                         NUM_FEAT=8,
                         PE=True,
                         name=model_name)

    model = HybridDenseModel(**shared_kwargs)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second pass: different trainability flags and learning rate.
    model = HybridDenseModel(train_embed=True,
                             train_top=False,
                             lr=0.001,
                             **shared_kwargs)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
コード例 #4
0
def train_main_pe():
    """Train ``HybridModel`` with ``PE=True`` on the pickled training set.

    Inputs handed to the model are, in order: char ids, term ids, scaled
    features, char position indices (0..599) and term position indices
    (0..299). The model is created with ``name='hybridmodel_PE.h5'``.

    Side effect: the fitted ``MinMaxScaler`` is pickled to
    ``tn_conf.feat_norm``.
    """
    print('load data')
    import data_utils, training_utils

    pretrained_vectors = (
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5')

    cfg = TrainConfigure()
    char_data = data_utils.pickle_load(cfg.char_file)
    labels = to_categorical(char_data['y'])
    char_ids = char_data['x']
    term_ids = data_utils.pickle_load(cfg.term_file)

    # Scale the hand-crafted features and persist the fitted scaler.
    raw_feats = data_utils.pickle_load(cfg.feat_file)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler().fit(raw_feats)
    data_utils.pickle_dump(scaler, cfg.feat_norm)
    scaled_feats = scaler.transform(raw_feats)

    print('loading embed ...')
    term_vocab = data_utils.pickle_load(cfg.term_dict)
    # A word-word vector file ('data/sgns.target.word-word.dynwin5.thr10.
    # neg5.dim300.iter5', dumped to 'data/term_embed_ww.pkl') was tried as an
    # alternative for the term embedding and is currently disabled.
    term_embed_matrix = data_utils.load_embedding(
        term_vocab, pretrained_vectors, dump_path='data/term_embed.pkl')
    char_vocab = data_utils.pickle_load(cfg.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab, pretrained_vectors, dump_path='data/char_embed.pkl')
    print('load embed done.')

    # Position indices 0..L-1, one identical row per sample.
    n_samples = labels.shape[0]
    char_pos = np.array([list(range(600))] * n_samples)
    term_pos = np.array([list(range(300))] * n_samples)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [char_ids, term_ids, scaled_feats, char_pos, term_pos],
        labels,
        shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)

    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8,
                        name='hybridmodel_PE.h5',
                        PE=True)
    print('feat shape', scaled_feats.shape)
    print('train')
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
コード例 #5
0
def encode_main():
    """Run ``predict`` on the validation and training sets and pickle both.

    ``predict`` is called once with the ``ValidConfigure`` objects and once
    with the ``TrainConfigure`` objects of ``data_utils`` (file-level names),
    ``data_utils100`` and ``data_utils200``. The pair
    ``(ys_train, ys_val)`` is pickled to ``data/stack_y.pkl``.

    Fix: the original passed the validation LDA file
    (``data/lda_vec_val.pkl``) to both calls. Elsewhere in this file the
    training data is paired with ``data/lda_vec.pkl``, so the training pass
    now uses that file.
    NOTE(review): the five-argument ``predict`` is not defined in the
    visible part of this file; confirm it pairs ``lda_file`` row-by-row
    with the data of the configuration passed as ``val_conf``.
    """
    import data_utils100
    import data_utils200

    # Validation pass.
    tn_conf = TrainConfigure()
    val_conf = ValidConfigure()
    val_conf200 = data_utils200.ValidConfigure()
    val_conf100 = data_utils100.ValidConfigure()
    ys_val = predict(tn_conf, 'data/lda_vec_val.pkl', val_conf, val_conf100,
                     val_conf200)

    # Training pass: the "validation" slots receive the training configs.
    tn_conf = TrainConfigure()
    train_conf = TrainConfigure()
    train_conf200 = data_utils200.TrainConfigure()
    train_conf100 = data_utils100.TrainConfigure()
    ys_train = predict(tn_conf, 'data/lda_vec.pkl', train_conf, train_conf100,
                       train_conf200)

    data_utils.pickle_dump((ys_train, ys_val), 'data/stack_y.pkl')
コード例 #6
0
def cache_data(name, txt_cols, sep_sent, dir, MAX_SENT=8):
    """Build vocabulary, embeddings and padded id arrays, then pickle them.

    Reads ``train.csv`` and ``test.csv`` from ``dir``, builds a vocabulary
    (at most 50000 entries) over both, saves it to
    ``data_dir + '<name>_vocab.txt'``, loads the embedding matrix from
    ``glove_file`` and converts both text sets to padded id arrays.

    The result dict (keys ``MAX_LEN``, ``x_tn``, ``y_tn``, ``x_ts``,
    ``y_ts``, ``embed_mat``) is pickled to ``./data/<name>_sent.pkl`` when
    ``sep_sent`` is true, otherwise to ``./data/<name>_word.pkl``.

    :param name: dataset name used in the vocabulary and output file names.
    :param txt_cols: forwarded to ``load_csv``.
    :param sep_sent: if true, pad per sentence with ``padding_texts_sent``;
        otherwise pad flat id lists (front-padded, capped at 1000 ids).
    :param dir: directory prefix of the CSV files (concatenated as-is).
    :param MAX_SENT: forwarded to ``padding_texts_sent``.
    """
    vocab_path = data_dir + '{}_vocab.txt'.format(name)
    txt_train, y_train = load_csv(dir + 'train.csv', txt_cols)
    txt_test, y_test = load_csv(dir + 'test.csv', txt_cols)

    # The vocabulary is built over train and test texts together.
    vocab_list = data_utils._create_vocabulary([*txt_train, *txt_test],
                                               max_vocab_size=50000)
    data_utils.save_vocab(vocab_list, vocab_path)
    vocab_dict, _ = data_utils.load_vocab(vocab_path)
    embed_mat = data_utils.load_embedding300(vocab_dict, glove_file)

    if sep_sent:
        max_len = get_max_sent_len(txt_train)
        train_ids = padding_texts_sent(txt_train, vocab_dict, max_len, MAX_SENT)
        test_ids = padding_texts_sent(txt_test, vocab_dict, max_len, MAX_SENT)
        out_path = './data/' + name + '_sent.pkl'
    else:
        train_ids = data_utils.convert2idlist(txt_train, vocab_dict)
        # Longest training sequence, but never below 10 nor above 1000.
        max_len = max([10] + [len(seq) for seq in train_ids])
        if max_len > 1000:
            print('max len: ', max_len)
            max_len = 1000
        train_ids = data_utils.pad_data(train_ids, max_len, pad_pre=True)
        test_ids = data_utils.pad_data(
            data_utils.convert2idlist(txt_test, vocab_dict),
            max_len,
            pad_pre=True)
        out_path = './data/' + name + '_word.pkl'

    payload = {
        'MAX_LEN': max_len,
        'x_tn': np.array(train_ids),
        'y_tn': np.array(y_train),
        'x_ts': np.array(test_ids),
        'y_ts': np.array(y_test),
        'embed_mat': embed_mat,
    }
    data_utils.pickle_dump(payload, out_path)
コード例 #7
0
def get_vec(out_file, mode="tn"):
    """Compute one 20-dimensional LDA topic vector per document and pickle them.

    Each document's term ids are mapped back to tokens through the reversed
    term vocabulary (ids 0 and 1 and unknown ids are skipped), converted to
    a bag of words with the saved gensim dictionary and run through the
    saved LDA model. Topics the model does not return stay at 0.

    :param out_file: path the stacked vectors (one row per document) are
        pickled to.
    :param mode: ``"train"`` or ``"tn"`` selects ``TrainConfigure``,
        ``"val"`` selects ``ValidConfigure``, anything else selects
        ``TestConfigure``. The original compared only against ``"train"``,
        so the default ``"tn"`` silently fell through to the test set;
        ``"tn"`` is now accepted as an alias for ``"train"``.
    """
    vec_len = 20  # presumably the topic count of 'data/LDA20.model'
    dictionary = corpora.Dictionary.load('./data/lda.dict')
    lda = gensim.models.LdaModel.load('data/LDA20.model')

    if mode in ("train", "tn"):
        print('train')
        tn_conf = TrainConfigure()
    elif mode == "val":
        print('val')
        tn_conf = data_utils.ValidConfigure()
    else:
        print("test")
        tn_conf = data_utils.TestConfigure()

    xterm = data_utils.pickle_load(tn_conf.term_file)
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    # id -> token lookup.
    reverse_dict = {idx: token for token, idx in term_vocab_dict.items()}

    all_lda = []
    for xi in xterm:
        # Ids 0 and 1 are skipped (presumably padding / unknown markers).
        doc = [reverse_dict[idx] for idx in xi
               if idx != 0 and idx != 1 and idx in reverse_dict]
        lda_vec = np.zeros(vec_len)
        # lda[bow] yields (topic_id, probability) pairs.
        for topic_id, prob in lda[dictionary.doc2bow(doc)]:
            lda_vec[topic_id] = prob
        all_lda.append(lda_vec)

    data_utils.pickle_dump(np.array(all_lda), out_file)
    print('done.')
コード例 #8
0
def predict():
    """Ensemble by averaging predicted probabilities and write ``result.csv``.

    Steps:

    1. Fit a ``MinMaxScaler`` on the training features (and pickle it to
       ``tn_conf.feat_norm``) so validation features are scaled the same way.
    2. Load the embeddings and the validation data (full-length and the
       ``data_utils200`` length-200 variant).
    3. For every model: build it, ``load_weights()``, predict, collect.
       The two fastText predictions are appended as well.
    4. Average all predictions (``np.mean`` over models), map them to class
       indices with ``convert_y`` and write ``id,label`` lines to
       ``result.csv`` (UTF-8).

    Changes against the previous revision: the repeated per-model stanzas
    are driven by one local helper; the len-200 gated-conv-topic model no
    longer reports itself as "dpcnn"; training x/y/terms and the length-100
    validation data, which were loaded but never used, are no longer read.

    :return: None
    """
    print('load data')
    tn_conf = TrainConfigure()
    # Only the training features are needed, to fit the scaler.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(data_utils.pickle_load(tn_conf.feat_file))
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)

    print('loading embed ...')
    pretrained_vectors = (
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    # A word-word vector file ('data/sgns.target.word-word.dynwin5.thr10.
    # neg5.dim300.iter5', dumped to 'data/term_embed_ww.pkl') was tried as an
    # alternative for the term embedding and is currently disabled.
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, pretrained_vectors, dump_path='data/term_embed.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict, pretrained_vectors, dump_path='data/char_embed.pkl')
    print('load embed done.')

    # Validation data, full length (600 chars / 300 terms).
    val_conf = ValidConfigure()
    data_dict = data_utils.pickle_load(val_conf.char_file)
    x = data_dict['x']
    ids = data_dict['id']
    n_samples = len(x)
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = scaler.transform(data_utils.pickle_load(val_conf.feat_file))
    print('feat shape', xfeat.shape)
    xtopic = data_utils.pickle_load('data/lda_vec_val.pkl')

    def positions(length):
        # Position indices 0..length-1, one identical row per sample.
        return np.tile(np.arange(length), (n_samples, 1))

    full_inputs = [x, positions(600), xterm, positions(300), xfeat, xtopic]

    # Validation data truncated/padded to length 200.
    # The length-100 models (HybridConvModel, HybridGatedDeepCNNModel,
    # HybridRCNNModel with '*_n100.h5') are disabled; re-enabling them
    # requires loading the data_utils100.ValidConfigure data the same way.
    import data_utils200
    val_conf200 = data_utils200.ValidConfigure()
    data_dict200 = data_utils.pickle_load(val_conf200.char_file)
    xterm200 = data_utils.pickle_load(val_conf200.term_file)
    n200_inputs = [data_dict200['x'], positions(200), xterm200,
                   positions(200), xfeat, xtopic]

    ys = []

    def run(model_cls, weights_name, inputs, done_msg=None, **extra):
        # Build one model, load its weights, collect its prediction. The
        # model goes out of scope on return, so it can be freed before the
        # next one is built.
        model = model_cls(char_embed_matrix=char_embed_matrix,
                          term_embed_matrix=term_embed_matrix,
                          NUM_FEAT=8,
                          PE=True,
                          name=weights_name,
                          **extra)
        model.load_weights()
        ys.append(model.predict(inputs))
        if done_msg is not None:
            print(done_msg)

    print('define model')
    run(HybridDenseModel, 'hybriddensemodel_PE.h5', full_inputs)
    run(HybridDenseMAModel, 'hybriddensemodelma_PE.h5', full_inputs,
        'dense model done.')
    run(HybridSEModel, 'hybridsemodel_PE.h5', full_inputs, 'se model done.')

    print('start len 200 model')
    len200 = dict(MAX_LEN=200, MAX_LEN_TERM=200)
    run(HybridConvModel, 'hybridconvmodel_n200.h5', n200_inputs,
        'hybrid conv model done.', **len200)
    run(HybridDPCNNModel, 'hybriddpcnnmodel_n200.h5', n200_inputs,
        'hybrid dpcnn model done.', **len200)
    run(HybridGatedConvTopicModel, 'hybridgatedconvtopicmodel_n200.h5',
        n200_inputs, 'hybrid gated conv topic done.', **len200)
    run(HybridGatedDeepCNNModel, 'hybridgateddeepcnnmodel_n200.h5',
        n200_inputs, 'hybrid gated deep cnn model done.', **len200)
    run(HybridRCNNModel, 'hybridrcnnmodel_n200.h5', n200_inputs, **len200)

    # Original author's note: this model is too slow.
    run(ConditionAttModel, 'conditionattmodel_PE.h5', full_inputs,
        'condition att model done.', lr=0.001)
    run(ConditionConvModel, 'conditionconvmodel_PE.h5', full_inputs,
        'condition conv model done.', lr=0.001)
    run(ConditionDPCNNModel, 'conditiondpcnnmodel_PE.h5', full_inputs,
        'condition dpcnn model done.', lr=0.001)
    run(ConditionGatedConvModel, 'conditiongatedconvmodel_PE.h5', full_inputs,
        'condition gated conv model done.', lr=0.001)
    run(ConditionGatedDeepCNNModel, 'conditiongateddeepcnnmodel_PE.h5',
        full_inputs, 'condition gated deepcnn model done.', lr=0.001)

    run(HybridAttModel, 'hybridattmodel_PE.h5', full_inputs,
        'hybrid att model done.')
    run(HybridConvModel, 'hybridconvmodel_PE.h5', full_inputs,
        'hybrid conv model done.')
    run(HybridDPCNNModel, 'hybriddpcnnmodel_PE.h5', full_inputs,
        'hybrid dpcnn model done.')
    run(HybridGatedDeepCNNModel, 'hybridgateddeepcnnmodel_PE.h5', full_inputs,
        'hybrid gated deep cnn model done.')
    run(HybridRCNNModel, 'hybridrcnnmodel_PE.h5', full_inputs,
        'hybrid rcnn model done.')
    run(HybridGatedConvTopicModel, 'hybridgatedconvtopicmodel_PE.h5',
        full_inputs, 'hybrid gated conv topic done.')

    ys.append(fasttextmodel.predict_char())
    y_term = fasttextmodel.predict_term()
    ys.append(y_term)
    print(y_term.shape)
    print('fast text done.')

    # The plain HybridModel (inputs [x, xterm, xfeat]) is not part of this
    # ensemble; it was disabled in the previous revision.

    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']
    # Average over models, then let convert_y turn each row into the index
    # used to look up the label.
    y_pred = convert_y(np.mean(ys, axis=0))
    out_file = 'result.csv'
    with open(out_file, 'w', encoding='utf-8') as fout:
        for sample_id, yi in zip(ids, y_pred):
            fout.write('{},{}\n'.format(sample_id, labels[yi]))
    print('done.')
コード例 #9
0
def predict2():
    """Ensemble by classification results (votes) and write ``result.csv``.

    Same data preparation as the probability ensemble, but every model's
    prediction is passed through ``convert_onehot`` before being collected,
    so ``np.mean`` over the collected arrays averages per-model decisions
    rather than probabilities. ``convert_y`` maps the averaged rows to the
    class indices used to look up the label written to ``result.csv``.

    Changes against the previous revision: the repeated per-model stanzas
    are driven by one local helper; training x/y/terms, which were loaded
    but never used, are no longer read; the loop variable no longer shadows
    the builtin ``id``.

    Side effect: the ``MinMaxScaler`` fitted on the training features is
    pickled to ``tn_conf.feat_norm``.

    :return: None
    """
    print('load data')
    tn_conf = TrainConfigure()
    # Only the training features are needed, to fit the scaler.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(data_utils.pickle_load(tn_conf.feat_file))
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)

    print('loading embed ...')
    pretrained_vectors = (
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    # A word-word vector file ('data/sgns.target.word-word.dynwin5.thr10.
    # neg5.dim300.iter5', dumped to 'data/term_embed_ww.pkl') was tried as an
    # alternative for the term embedding and is currently disabled.
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, pretrained_vectors, dump_path='data/term_embed.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict, pretrained_vectors, dump_path='data/char_embed.pkl')
    print('load embed done.')

    val_conf = ValidConfigure()
    data_dict = data_utils.pickle_load(val_conf.char_file)
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = scaler.transform(data_utils.pickle_load(val_conf.feat_file))
    # Position indices 0..599, one identical row per sample.
    xe = np.tile(np.arange(600), (len(x), 1))

    ys = []

    def vote(model_cls, inputs, done_msg=None, **model_kwargs):
        # Build one model, load its weights and collect its one-hot vote.
        model = model_cls(**model_kwargs)
        model.load_weights()
        ys.append(convert_onehot(model.predict(inputs)))
        if done_msg is not None:
            print(done_msg)

    print('define model')
    print('feat shape', xfeat.shape)

    # Hybrid model (chars + terms + features).
    vote(HybridModel, [x, xterm, xfeat], 'hybrid model done.',
         char_embed_matrix=char_embed_matrix,
         term_embed_matrix=term_embed_matrix,
         NUM_FEAT=8)

    # CNN models on characters.
    vote(CharModel, x, 'char model done.', embed_matrix=char_embed_matrix)
    vote(CharModel, [x, xe], 'char model done.',
         embed_matrix=char_embed_matrix, name='charmodel_PE.h5', PE=True)
    vote(CharModel, [x, xe], 'char model done.',
         embed_matrix=char_embed_matrix, name='charmodel_PE_OE.h5', PE=True)

    # CNN model on terms.
    vote(TermModel, xterm, 'term model done.', embed_matrix=term_embed_matrix)

    vote(DeepCNNModel, x, 'deep cnn done.', embed_matrix=char_embed_matrix)

    # The attention models (AttModel on chars/terms) and ConditionModel were
    # disabled in the previous revision and are not part of this ensemble.

    vote(SSCharModel, [x, xe],
         embed_matrix=char_embed_matrix, name='sscharmodel_PE.h5', PE=True,
         train_embed=True)
    vote(SSCharModel, x, embed_matrix=char_embed_matrix, train_embed=True)

    vote(GatedConvModel, [x, xe],
         embed_matrix=char_embed_matrix, name='gatedconvmodel_PE.h5', PE=True)
    vote(GatedConvModel, x, embed_matrix=char_embed_matrix, train_embed=True)

    vote(GatedDeepCNNModel, [x, xe],
         embed_matrix=char_embed_matrix, name='gateddeepcnnmodel_PE.h5',
         PE=True, train_embed=True)
    vote(GatedDeepCNNModel, x, embed_matrix=char_embed_matrix,
         train_embed=True)

    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']
    y_pred = convert_y(np.mean(ys, axis=0))
    out_file = 'result.csv'
    with open(out_file, 'w', encoding='utf-8') as fout:
        for sample_id, yi in zip(ids, y_pred):
            fout.write('{},{}\n'.format(sample_id, labels[yi]))
    print('done.')
コード例 #10
0
def train_model_peoe():
    """Train HybridDPCNNModel with position inputs and extra embedding matrices.

    Loads the char/term id sequences, the pretrained embedding matrices, four
    additional char embeddings and four additional term embeddings obtained
    from ``data_utils.load_our_embedding``, the LDA vectors and the min-max
    scaled feature matrix (the fitted scaler is pickled to ``conf.feat_norm``).

    Two training runs are performed on the same split: the first with
    ``lr=0.0004``; the second builds a new model with ``train_embed=True``,
    ``train_top=False`` and ``lr=0.001``, calls ``load_weights()`` and trains
    again.
    """
    print('load data')
    import data_utils, training_utils

    pretrained_file = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    window_sizes = [3, 5, 8]
    skip_gram = 1  # only the sg=1 variants are loaded

    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict, pretrained_file, dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, pretrained_file, dump_path='data/term_embed.pkl')

    # Extra char embeddings: the default one first, then one per window size.
    char_embeds = [data_utils.load_our_embedding(vocab_dict)]
    for win in window_sizes:
        char_embeds.append(
            data_utils.load_our_embedding(
                vocab_dict,
                model_file='data/char_embed_{}_{}.model'.format(win, skip_gram),
                dump_path='data/our_char_embed_{}_{}.pkl'.format(win, skip_gram)))

    # Extra term embeddings, same layout as the char ones.
    term_embeds = [
        data_utils.load_our_embedding(term_vocab_dict,
                                      model_file='data/term_embed.model',
                                      dump_path='data/our_term_embed.pkl')
    ]
    for win in window_sizes:
        term_embeds.append(
            data_utils.load_our_embedding(
                term_vocab_dict,
                model_file='data/term_embed_{}_{}.model'.format(win, skip_gram),
                dump_path='data/our_term_embed_{}_{}.pkl'.format(win, skip_gram)))

    MAX_LEN_TERM = 300
    name = 'hybriddpcnnmodel_PEOE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    # Min-max normalise the feature matrix and persist the fitted scaler.
    from sklearn.preprocessing import MinMaxScaler
    xfeat = data_utils.pickle_load(conf.feat_file)
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    # Position inputs: every row is 0..MAX_LEN-1 (resp. 0..MAX_LEN_TERM-1).
    n_samples = y.shape[0]
    xe = np.array([list(range(MAX_LEN)) for _ in range(n_samples)])
    xe_term = np.array([list(range(MAX_LEN_TERM)) for _ in range(n_samples)])

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')

    shared_kwargs = dict(char_embed_matrix=char_embed_matrix,
                         term_embed_matrix=term_embed_matrix,
                         MAX_LEN=MAX_LEN,
                         NUM_FEAT=8,
                         PE=True,
                         name=name,
                         char_embeds=char_embeds,
                         term_embeds=term_embeds)

    # With more embedding matrices added, the learning rate has to be lowered
    # for training to work properly.
    model = HybridDPCNNModel(lr=0.0004, **shared_kwargs)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second run: new model with the embedding/top flags switched.
    model = HybridDPCNNModel(train_embed=True,
                             train_top=False,
                             lr=0.001,
                             **shared_kwargs)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
コード例 #11
0
def train_main():
    """Train, validate or error-analyse HybridModel, selected by ``sys.argv[1]``.

    Modes:
      * no argument or ``train`` -- train on the training split;
      * ``val``   -- load weights and run ``model.test`` on the validation
        data, writing to ``val_conf.out_file``;
      * ``error`` -- load weights and run ``model.error_analysis`` on the
        test split.
    Any other argument builds the model and then does nothing further.

    The feature scaler is always re-fitted on the training features and
    pickled to ``tn_conf.feat_norm``.
    """
    print('load data')
    import data_utils, training_utils
    import sys
    from sklearn.preprocessing import MinMaxScaler

    embed_file = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'

    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)

    # Min-max normalise the features and persist the fitted scaler.
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, embed_file, dump_path='data/term_embed.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict, embed_file, dump_path='data/char_embed.pkl')
    print('load embed done.')

    # First split off the test part, then carve a validation part out of the rest.
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)

    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)
    print('feat shape', xfeat.shape)

    mode = sys.argv[1] if len(sys.argv) > 1 else 'train'

    if mode == 'train':
        print('train')
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    elif mode == 'val':
        val_conf = ValidConfigure()
        val_dict = data_utils.pickle_load(val_conf.char_file)
        y = to_categorical(val_dict['y'])
        val_x = val_dict['x']
        val_ids = val_dict['id']
        val_xterm = data_utils.pickle_load(val_conf.term_file)
        # Reuse the scaler fitted on the training features.
        val_xfeat = scaler.transform(data_utils.pickle_load(val_conf.feat_file))
        model.load_weights()
        model.test([val_x, val_xterm, val_xfeat], val_ids, val_conf.out_file)
    elif mode == 'error':
        # Offset of the test split: the train and validation rows come before it.
        start_index = y_tn.shape[0] + y_val.shape[0]
        texts = data_utils.load_all_text(tn_conf)
        model.load_weights()
        model.error_analysis(x_ts, y_ts, texts, start_index)
コード例 #12
0
 def on_batch_end(self, batch, logs=None):
     """Append the batch loss to ``self.losses`` and pickle the whole history.

     :param batch: index of the batch that just finished (unused).
     :param logs: metrics dict supplied by the training loop; ``None`` is
         treated as an empty dict, in which case ``None`` is recorded as
         the loss.
     """
     # Avoid a shared mutable default and tolerate callers passing None.
     logs = logs or {}
     self.losses.append(logs.get('loss'))
     # NOTE(review): the full history is rewritten after every batch.
     pickle_dump(self.losses, "loss_history.pkl")
コード例 #13
0
def stacking_main_condition():
    """Fit a 5-fold stacking ensemble of the four "condition" models and save it.

    Loads the training data, min-max scales the feature matrix (the fitted
    scaler is pickled to ``tn_conf.feat_norm``), builds the position inputs,
    registers ConditionConvModel, ConditionGatedConvModel,
    ConditionGatedDeepCNNModel and ConditionDPCNNModel with the stacker, fits
    it on the training part of the split, prints accuracy and the confusion
    matrix for the held-out part and finally calls ``stk_model.save()``.
    """
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    # Position inputs: every row is 0..599 (chars) / 0..299 (terms).
    n_samples = y.shape[0]
    xe = np.tile(np.arange(600), (n_samples, 1))
    xe_term = np.tile(np.arange(300), (n_samples, 1))
    xtopic = data_utils.pickle_load('data/lda_vec.pkl')

    print('loading embed ...')
    embed_file = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  embed_file,
                                                  dump_path='data/term_embed.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  embed_file,
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')

    # The earlier 3-fold settings ('model/stack_condition_model.pkl',
    # 'model/stack/') were dead stores overwritten by these values.
    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    stk_model = stacking(n_fold, name=name, is_condition=True)

    conf = conditionmodelbase.ModelConfigure()
    conf.PE = True
    # The DPCNN model gets its own configuration object. Previously the single
    # shared `conf` had its lr changed after being handed to the first three
    # models, so the lower rate could leak to them if the stacker builds the
    # models lazily from the stored kwargs.
    dpcnn_conf = conditionmodelbase.ModelConfigure()
    dpcnn_conf.PE = True
    dpcnn_conf.lr = 0.0005

    stk_model.add_model(ConditionConvModel,
                        {"conf": conf,
                         "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditionconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedConvModel,
                        {"conf": conf,
                         "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiongatedconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedDeepCNNModel,
                        {"conf": conf,
                         "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiongateddeepcnnmodel_PE.h5'})
    stk_model.add_model(ConditionDPCNNModel,
                        {"conf": dpcnn_conf,
                         "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiondpcnnmodel_PE.h5'})

    # For a quick test run, sample a small part of the data instead:
    # x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.005, shuffle=False)
    # x_tn, y_tn, x_ts, y_ts = training_utils.split(x_tn, y_tn, shuffle=False)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.95)
    stk_model.fit(x_tn, y_tn)
    y_pred = stk_model.predict(x_ts)

    # scikit-learn metrics take (y_true, y_pred); with the arguments swapped
    # the confusion matrix is transposed.
    true_labels = training_utils.convert_y(y_ts)
    pred_labels = training_utils.convert_y(y_pred)
    acc = accuracy_score(true_labels, pred_labels)
    print(acc)
    cnf_matrix = confusion_matrix(true_labels, pred_labels)
    print(cnf_matrix)
    stk_model.save()
コード例 #14
0
def predict(tn_conf, lda_file, val_conf, val_conf100, val_conf200):
    """Collect the outputs of every trained model on the data to predict on.

    (Original docstring: ensemble by probability.)

    :param tn_conf: training configuration; ``feat_file`` is used to fit the
        feature scaler (which is pickled to ``feat_norm``), ``term_dict`` and
        ``char_dict`` are used to load the embedding matrices.
    :param lda_file: pickle with the topic vectors passed to the models as
        their last input.
    :param val_conf: configuration of the data to predict on (``char_file``,
        ``term_file``, ``feat_file`` are read).
    :param val_conf100: configuration whose ``char_file`` / ``term_file`` feed
        the models built with ``MAX_LEN=100`` -- presumably the same samples
        at length 100; verify against the caller.
    :param val_conf200: kept for interface compatibility; the length-200
        models are disabled, so nothing is read from it.
    :return: ``np.array`` built from the per-model outputs, in the fixed order
        in which the models are run below.
    """
    print('load data')
    # Only the training features are needed: the scaler is fitted on them, as
    # the training functions do. The training char/term sequences were loaded
    # before but never used.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(data_utils.pickle_load(tn_conf.feat_file))
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)

    xtopic = data_utils.pickle_load(lda_file)

    print('loading embed ...')
    embed_file = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, embed_file, dump_path='data/term_embed.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict, embed_file, dump_path='data/char_embed.pkl')
    print('load embed done.')

    data_dict = data_utils.pickle_load(val_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = scaler.transform(data_utils.pickle_load(val_conf.feat_file))
    print('feat shape', xfeat.shape)

    # Position inputs must have one row per sample being predicted. They used
    # to be sized from the training labels, which only works when the
    # training and validation sets happen to have the same number of rows.
    n_samples = y.shape[0]
    xe = np.tile(np.arange(600), (n_samples, 1))
    xe_term = np.tile(np.arange(300), (n_samples, 1))

    data_dict100 = data_utils.pickle_load(val_conf100.char_file)
    x100 = data_dict100['x']
    xterm100 = data_utils.pickle_load(val_conf100.term_file)
    xe100 = np.tile(np.arange(100), (n_samples, 1))
    xe_term100 = np.tile(np.arange(100), (n_samples, 1))

    full_inputs = [x, xe, xterm, xe_term, xfeat, xtopic]
    inputs100 = [x100, xe100, xterm100, xe_term100, xfeat, xtopic]
    len100 = {'MAX_LEN': 100, 'MAX_LEN_TERM': 100}
    cond = {'lr': 0.001}

    def _predict_with(model_cls, inputs, **model_kwargs):
        # Build one model, load its weights and return its predictions; the
        # model object is released when this helper returns.
        model = model_cls(char_embed_matrix=char_embed_matrix,
                          term_embed_matrix=term_embed_matrix,
                          NUM_FEAT=8,
                          **model_kwargs)
        model.load_weights()
        return model.predict(inputs)

    ys = []

    def _run(specs):
        # Run each (class, weights file, extra kwargs, inputs, message) spec
        # in order and append its output to ``ys``.
        for model_cls, weights_name, extra, inputs, done_msg in specs:
            ys.append(
                _predict_with(model_cls,
                              inputs,
                              PE=True,
                              name=weights_name,
                              **extra))
            if done_msg is not None:
                print(done_msg)

    print('define model')
    # The order of the specs defines the order of the rows in the result.
    _run([
        (HybridDenseModel, 'hybriddensemodel_PE.h5', {}, full_inputs, None),
        (HybridDenseMAModel, 'hybriddensemodelma_PE.h5', {}, full_inputs,
         'dense model done.'),
        (HybridSEModel, 'hybridsemodel_PE.h5', {}, full_inputs,
         'se model done.'),
    ])

    print('start len 100 model')
    _run([
        (HybridConvModel, 'hybridconvmodel_n100.h5', len100, inputs100,
         'hybrid conv model done.'),
        (HybridGatedDeepCNNModel, 'hybridgateddeepcnnmodel_n100.h5', len100,
         inputs100, 'hybrid gated deep cnn model done.'),
        (HybridRCNNModel, 'hybridrcnnmodel_n100.h5', len100, inputs100,
         'hybrid RCNN model done.'),
    ])

    # Disabled in the original code and therefore not run here: the
    # length-200 variants (hybridconvmodel_n200.h5, hybriddpcnnmodel_n200.h5,
    # hybridgateddeepcnnmodel_n200.h5, hybridrcnnmodel_n200.h5) and
    # ConditionAttModel (conditionattmodel_PE.h5, noted as too slow).
    _run([
        (ConditionConvModel, 'conditionconvmodel_PE.h5', cond, full_inputs,
         'condition conv model done.'),
        (ConditionDPCNNModel, 'conditiondpcnnmodel_PE.h5', cond, full_inputs,
         'condition dpcnn model done.'),
        (ConditionGatedConvModel, 'conditiongatedconvmodel_PE.h5', cond,
         full_inputs, 'condition gated conv model done.'),
        (ConditionGatedDeepCNNModel, 'conditiongateddeepcnnmodel_PE.h5', cond,
         full_inputs, 'condition gated deepcnn model done.'),
        (HybridAttModel, 'hybridattmodel_PE.h5', {}, full_inputs,
         'hybrid att model done.'),
        (HybridConvModel, 'hybridconvmodel_PE.h5', {}, full_inputs,
         'hybrid conv model done.'),
        (HybridDPCNNModel, 'hybriddpcnnmodel_PE.h5', {}, full_inputs,
         'hybrid dpcnn model done.'),
        (HybridGatedDeepCNNModel, 'hybridgateddeepcnnmodel_PE.h5', {},
         full_inputs, 'hybrid gated deep cnn model done.'),
        (HybridRCNNModel, 'hybridrcnnmodel_PE.h5', {}, full_inputs,
         'hybrid rcnn model done.'),
        (HybridGatedConvTopicModel, 'hybridgatedconvtopicmodel_PE.h5', {},
         full_inputs, 'hybrid gated conv topic done.'),
    ])

    y = fasttextmodel.predict_char()
    ys.append(y)

    y = fasttextmodel.predict_term()
    ys.append(y)
    print(y.shape)
    print('fast text done.')

    # Plain hybrid model: no position or topic inputs.
    y = _predict_with(HybridModel, [x, xterm, xfeat])
    ys.append(y)
    print(y.shape)
    print('hybrid model done.')

    ys = np.array(ys)
    print(ys.shape)
    return ys
コード例 #15
0
def train_model_ftoe(model_conf,
                     model_name='hybridconvmodel_FTOE.h5',
                     ModelClass=HybridModelBase,
                     char_embed_file=None,
                     term_embed_file=None):
    """Train ``ModelClass`` in two runs using caller-supplied embedding files.

    The first run builds the model with ``train_embed=False, train_top=True``.
    The second run halves ``model_conf.lr``, builds the model with
    ``train_embed=True, train_top=False``, calls ``load_weights()`` and trains
    again. ``model_conf.lr`` is restored afterwards so that repeated calls with
    the same configuration object do not keep halving the learning rate.

    :param model_conf: configuration object passed to ``ModelClass``; must
        expose a numeric ``lr`` attribute.
    :param model_name: name passed to ``ModelClass`` as ``name``.
    :param ModelClass: model class to instantiate.
    :param char_embed_file: path of the char embedding file (required).
    :param term_embed_file: path of the term embedding file (required).
    :raises TypeError: if either embedding file is missing.
    """
    import os

    if char_embed_file is None or term_embed_file is None:
        # Fail before any data is loaded instead of crashing on None[5:] later.
        raise TypeError(
            'train_model_ftoe requires char_embed_file and term_embed_file')

    def _dump_path(embed_file):
        # Cache file for an embedding: 'data/<file below data/>.pkl'. Paths
        # outside 'data/' fall back to their base name instead of having
        # their first five characters chopped off.
        if embed_file.startswith('data/'):
            stem = embed_file[5:]
        else:
            stem = os.path.basename(embed_file)
        return 'data/{}.pkl'.format(stem)

    print(model_name)
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        char_embed_file,
        dump_path=_dump_path(char_embed_file))
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        term_embed_file,
        dump_path=_dump_path(term_embed_file))
    MAX_LEN_TERM = 300
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    # Position inputs: every row is 0..MAX_LEN-1 (resp. 0..MAX_LEN_TERM-1).
    n_samples = y.shape[0]
    xe = np.tile(np.arange(MAX_LEN), (n_samples, 1))
    xe_term = np.tile(np.arange(MAX_LEN_TERM), (n_samples, 1))

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, split_ratio=0.95, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                    y_tn,
                                                    split_ratio=0.95,
                                                    shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=False,
                       train_top=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second run at half the learning rate; the caller's configuration is
    # restored afterwards, even if training fails.
    original_lr = model_conf.lr
    model_conf.lr = original_lr * 0.5
    try:
        model = ModelClass(model_conf,
                           char_embed_matrix=char_embed_matrix,
                           term_embed_matrix=term_embed_matrix,
                           name=model_name,
                           train_embed=True,
                           train_top=False)
        model.load_weights()
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
        del model
    finally:
        model_conf.lr = original_lr