# NOTE: shared imports assumed by these snippets (each one comes from a larger
# module); numpy and Keras's to_categorical are used throughout.
import numpy as np
from keras.utils import to_categorical

def train_model(use_char=False):
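    """Train a DeepCNNModel on char-level (use_char=True) or term-level
    (use_char=False) inputs, using pretrained SGNS embedding vectors."""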
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    if use_char:
        vocab_dict = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                 dump_path='data/char_embed.pkl')

        MAX_LEN = 600
        name = 'deepcnn_model.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab_dict = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                 dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'deepcnn_model_term.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    x_tn, y_tn, x_ts, y_ts = training_utils.split(x, y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = DeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def load_condition_stacking_main():
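    """Build the stacking ensemble of condition models (conv, DPCNN, gated
    conv, gated deep CNN) and load its trained weights from disk."""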
    tn_conf = TrainConfigure()
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    # 3-fold setup, superseded by the 5-fold setup below:
    # name = 'model/stack_condition_model.pkl'
    # model_dir = 'model/stack/'
    # n_fold = 3
    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    conf = conditionmodelbase.ModelConfigure()
    stk_model = stacking(n_fold, name=name, is_condition=True)
    stk_model.add_model(ConditionConvModel, {"conf":conf,"char_embed_matrix": char_embed_matrix,
                                             "term_embed_matrix": term_embed_matrix,
                                             "name": model_dir + 'conditionconvmodel_PE.h5'})
    stk_model.add_model(ConditionDPCNNModel, {"conf":conf,"char_embed_matrix": char_embed_matrix,
                                              "term_embed_matrix": term_embed_matrix,
                                              "name": model_dir + 'conditiondpcnnmodel_PE.h5'})
    stk_model.add_model(ConditionGatedConvModel, {"conf":conf,"char_embed_matrix": char_embed_matrix,
                                                  "term_embed_matrix": term_embed_matrix,
                                                  "name": model_dir + 'conditiongatedconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedDeepCNNModel, {"conf":conf,"char_embed_matrix": char_embed_matrix,
                                                     "term_embed_matrix": term_embed_matrix,
                                                     "name": model_dir + 'conditiongateddeepcnnmodel_PE.h5'})
    stk_model.load()
    return stk_model
def train_model_pe():
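    """Train SSCharModel with position embeddings (PE): first with frozen
    embeddings, then reload the weights and fine-tune with train_embed=True."""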
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    y = to_categorical(data_dict['y'])

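    # Position-index input for the position-embedding (PE) branch:
    # each sample gets the fixed index sequence 0..599.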
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = SSCharModel(embed_matrix=embed_matrix,
                        name='sscharmodel_PE.h5',
                        PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = SSCharModel(embed_matrix=embed_matrix,
                        name='sscharmodel_PE.h5',
                        PE=True,
                        train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_pe(use_char=True):
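    """Train a ConditionModel with position embeddings on char-level or
    term-level inputs."""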
    print('define model')
    print('load data')
    import data_utils, training_utils
    tn_conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load('data.dict')
    y = to_categorical(data_dict['y'])
    if use_char:
        vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                      'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                      dump_path='data/char_embed.pkl')
        x = data_dict['x']
        MAX_LEN = 600
        name = 'conditionmodel_PE.h5'
    else:
        x = data_utils.pickle_load(tn_conf.term_file)
        vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                      'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                      dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'conditionmodel_term_PE.h5'
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')

    model = ConditionModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name, PE=True)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_main_pe():
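    """Train the HybridModel on char, term and handcrafted-feature inputs,
    with position embeddings for both the char and term branches."""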
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
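    # Fit the MinMaxScaler on the training features and persist it so the
    # same scaling can be applied to validation/test features later.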
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    xe_char = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe_char = np.array(xe_char)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xterm, xfeat, xe_char, xe_term], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8,
                        name='hybridmodel_PE.h5',
                        PE=True)  # +37
    print('feat shape', xfeat.shape)
    print('train')
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def load_data_cache(name, sep_sent=True, sample=False):
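    """Load a cached train/test split (sentence- or word-level), one-hot the
    labels, shuffle the training set, and optionally subsample both splits."""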
    if sep_sent:
        filename = './data/'+name+'_sent.pkl'
    else:
        filename = './data/'+name+'_word.pkl'
    r = data_utils.pickle_load(filename)
    x_tn = r['x_tn']
    y_tn = r['y_tn']
    x_ts = r['x_ts']
    y_ts = r['y_ts']
    embedding_matrix = r['embed_mat']
    y_tn = to_categorical(y_tn)
    y_ts = to_categorical(y_ts)
    indices = np.arange(y_tn.shape[0])
    np.random.shuffle(indices)
    x_tn = x_tn[indices]
    y_tn = y_tn[indices]
    if sample:
        sample_num = min(500000, y_tn.shape[0])
        # indices = np.arange( y_train.shape[0] )
        # np.random.shuffle( indices )
        # indices = indices[0:sample_num]
        x_tn = x_tn[0:sample_num]
        y_tn = y_tn[0:sample_num]
        sample_num = min(100000, y_ts.shape[0])
        indices = np.arange(y_ts.shape[0])
        np.random.shuffle(indices)
        indices = indices[0:sample_num]
        y_ts = y_ts[indices]
        x_ts = x_ts[indices]
    return x_tn, y_tn, x_ts, y_ts, embedding_matrix
def train_model_pe(use_char=False):
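    """Train a GatedDeepCNNModel with position embeddings; a second pass
    reloads the weights and fine-tunes the embedding layer."""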
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    if use_char:
        vocab_dict = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/char_embed.pkl')

        MAX_LEN = 600
        name = 'gateddeepcnnmodel_PE.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab_dict = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'gateddeepcnnmodel_term_PE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = GatedDeepCNNModel(embed_matrix=embed_matrix,
                              MAX_LEN=MAX_LEN,
                              name=name,
                              PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = GatedDeepCNNModel(embed_matrix=embed_matrix,
                              MAX_LEN=MAX_LEN,
                              name=name,
                              PE=True,
                              train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def prepare_tn_data(filename='data/topic_train.txt'):
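    """Dump the term-level training texts back to plain text, one document per
    line, by reversing the term vocabulary dictionary."""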
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    xterm = data_utils.pickle_load(tn_conf.term_file)
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    reverse_dict = dict()
    for k, v in term_vocab_dict.items():
        reverse_dict[v] = k
    N = xterm.shape[0]
    with open(filename, 'w') as fout:
        for i in range(N):
            xi = xterm[i]
            term_list = []
            for idx in xi:
                if idx != 0 and idx != 1 and idx in reverse_dict:
                    term_list.append(reverse_dict[idx])
            fout.write(' '.join(term_list) + '\n')
    print('prepare data done.')
def train_model():
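    """Train a plain TermModel on term-level inputs with pretrained embeddings."""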
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.term_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    print('load embed done.')
    x_tn, y_tn, x_ts, y_ts = training_utils.split(xterm, y, shuffle=True)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=True)
    print('train')
    print('define model')
    model = TermModel(embed_matrix=embed_matrix)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_n200(model_conf, name, ModelClass):
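    """Train a hybrid ModelClass on the length-200 data variant (char, term,
    position, handcrafted and LDA-topic inputs); a second pass reloads the
    weights and fine-tunes the embedding layers with the top layers frozen."""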
    print(name)
    import data_utils200 as data_utils
    import training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')

    MAX_LEN = conf.MAX_LEN
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = conf.MAX_LEN
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model(use_char=True):
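    """Train a ConditionConvModel on char and term inputs plus handcrafted and
    LDA-topic features; a second pass fine-tunes the embeddings at lr=0.001."""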
    print('train condition conv model.\nload data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'conditionconvmodel.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat, xt],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN,
                               NUM_FEAT=8,
                               name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN,
                               NUM_FEAT=8,
                               name=name,
                               train_embed=True,
                               train_top=False,
                               lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def get_vec(out_file, mode="tn"):
    vec_len = 20
    dictionary = corpora.Dictionary.load('./data/lda.dict')
    lda = gensim.models.LdaModel.load('data/LDA20.model')
    if mode == "train":
        print('train')
        tn_conf = TrainConfigure()
    elif mode == "val":
        print('val')
        tn_conf = data_utils.ValidConfigure()
    else:
        print("test")
        tn_conf = data_utils.TestConfigure()
    # data_dict = data_utils.pickle_load(tn_conf.char_file)
    xterm = data_utils.pickle_load(tn_conf.term_file)
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    reverse_dict = dict()
    for k, v in term_vocab_dict.items():
        reverse_dict[v] = k

    all_lda = []
    for xi in xterm:
        doc = []
        for idx in xi:
            if idx != 0 and idx != 1 and idx in reverse_dict:
                doc.append(reverse_dict[idx])
        doc_bow = dictionary.doc2bow(doc)
        lda_vec_tmp = lda[doc_bow]
        lda_vec = np.zeros(vec_len)
        for (topic_id, p) in lda_vec_tmp:
            lda_vec[topic_id] = p
        all_lda.append(lda_vec)
    data_utils.pickle_dump(np.array(all_lda), out_file)
    print('done.')
def train_model_cv(cv_index, cv_num):
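    """Train one cross-validation fold of the HybridDenseModel, then reload
    the weights and fine-tune the embedding layers."""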
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'hybriddensemodel_cv{}.h5'.format(cv_index)
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x, xe, xterm, xe_term, xfeat, xt], y, cv_index=cv_index, cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    def train(self, x, y, x_val, y_val, x_ts, y_ts):
        # NOTE: header reconstructed for this truncated snippet; `early_stop`,
        # `save_best` and `save_path` are assumed to be the usual Keras
        # EarlyStopping/ModelCheckpoint callbacks defined earlier in the method.
        self.model.fit(x,
                       y,
                       validation_data=[x_val, y_val],
                       batch_size=512,
                       epochs=20,
                       callbacks=[early_stop, save_best])
        metric = self.model.evaluate(x_ts, y_ts)
        print(metric)
        self.model.load_weights(save_path)
        metric = self.model.evaluate(x_ts, y_ts, batch_size=512)
        print(metric)
        y_pred = self.model.predict(x_ts, batch_size=512)

        cnf_matrix = confusion_matrix(convert_y(y_ts), convert_y(y_pred))
        print(cnf_matrix)


if __name__ == '__main__':
    print('define char model')
    model = HanModel(name='hanmodel.h5')
    print('load data')
    import data_utils, training_utils
    data_dict = data_utils.pickle_load('data_sent.dict')
    y = to_categorical(data_dict['y'])
    x_tn, y_tn, x_ts, y_ts = training_utils.split(data_dict['x'],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def predict():
    """
    Ensemble by averaging the models' predicted probabilities.
    :return:
    """
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')

    val_conf = ValidConfigure()
    data_dict = data_utils.pickle_load(val_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = data_utils.pickle_load(val_conf.feat_file)
    xfeat = scaler.transform(xfeat)
    print('feat shape', xfeat.shape)
    xtopic = data_utils.pickle_load('data/lda_vec_val.pkl')
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    import data_utils100
    val_conf100 = data_utils100.ValidConfigure()
    data_dict100 = data_utils.pickle_load(val_conf100.char_file)
    x100 = data_dict100['x']
    xterm100 = data_utils.pickle_load(val_conf100.term_file)
    xe100 = [[i for i in range(100)] for _ in range(y.shape[0])]
    xe100 = np.array(xe100)
    xe_term100 = [[i for i in range(100)] for _ in range(y.shape[0])]
    xe_term100 = np.array(xe_term100)

    import data_utils200
    val_conf200 = data_utils200.ValidConfigure()
    data_dict200 = data_utils.pickle_load(val_conf200.char_file)
    x200 = data_dict200['x']
    xterm200 = data_utils.pickle_load(val_conf200.term_file)
    xe200 = [[i for i in range(200)] for _ in range(y.shape[0])]
    xe200 = np.array(xe200)
    xe_term200 = [[i for i in range(200)] for _ in range(y.shape[0])]
    xe_term200 = np.array(xe_term200)

    ys = []
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddensemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model

    model = HybridDenseMAModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8,
                               PE=True,
                               name='hybriddensemodelma_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('dense model done.')

    model = HybridSEModel(char_embed_matrix=char_embed_matrix,
                          term_embed_matrix=term_embed_matrix,
                          NUM_FEAT=8,
                          PE=True,
                          name='hybridsemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('se model done.')

    # print('start len 100 model')
    # model = HybridConvModel(char_embed_matrix=char_embed_matrix,
    #                         term_embed_matrix=term_embed_matrix, MAX_LEN=100, MAX_LEN_TERM=100,NUM_FEAT=8,
    #                         PE=True, name='hybridconvmodel_n100.h5')
    # model.load_weights()
    # y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid conv model done.')
    #
    # model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
    #                                 term_embed_matrix=term_embed_matrix, MAX_LEN=100, MAX_LEN_TERM=100,NUM_FEAT=8,
    #                                 PE=True, name='hybridgateddeepcnnmodel_n100.h5')
    # model.load_weights()
    # y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid gated deep cnn model done.')
    #
    # model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
    #                         term_embed_matrix=term_embed_matrix, MAX_LEN=100, MAX_LEN_TERM=100,NUM_FEAT=8,
    #                         PE=True, name='hybridrcnnmodel_n100.h5')
    # model.load_weights()
    # y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid RCNN model done.')

    print('start len 200 model')
    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            MAX_LEN=200,
                            MAX_LEN_TERM=200,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridconvmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')

    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=200,
                             MAX_LEN_TERM=200,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddpcnnmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid dpcnn model done.')

    model = HybridGatedConvTopicModel(char_embed_matrix=char_embed_matrix,
                                      term_embed_matrix=term_embed_matrix,
                                      MAX_LEN=200,
                                      MAX_LEN_TERM=200,
                                      NUM_FEAT=8,
                                      PE=True,
                                      name='hybridgatedconvtopicmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated conv topic model done.')

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    MAX_LEN=200,
                                    MAX_LEN_TERM=200,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='hybridgateddeepcnnmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            MAX_LEN=200,
                            MAX_LEN_TERM=200,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridrcnnmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model

    # This model is too slow.
    model = ConditionAttModel(char_embed_matrix=char_embed_matrix,
                              term_embed_matrix=term_embed_matrix,
                              NUM_FEAT=8,
                              PE=True,
                              name='conditionattmodel_PE.h5',
                              lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    print('condition att model done.')

    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8,
                               PE=True,
                               name='conditionconvmodel_PE.h5',
                               lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition conv model done.')

    model = ConditionDPCNNModel(char_embed_matrix=char_embed_matrix,
                                term_embed_matrix=term_embed_matrix,
                                NUM_FEAT=8,
                                PE=True,
                                name='conditiondpcnnmodel_PE.h5',
                                lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition dpcnn model done.')

    model = ConditionGatedConvModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='conditiongatedconvmodel_PE.h5',
                                    lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated conv model done.')

    model = ConditionGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                       term_embed_matrix=term_embed_matrix,
                                       NUM_FEAT=8,
                                       PE=True,
                                       name='conditiongateddeepcnnmodel_PE.h5',
                                       lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated deepcnn model done.')

    model = HybridAttModel(char_embed_matrix=char_embed_matrix,
                           term_embed_matrix=term_embed_matrix,
                           NUM_FEAT=8,
                           PE=True,
                           name='hybridattmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    print('hybrid att model done.')

    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridconvmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')

    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddpcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid dpcnn model done.')

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='hybridgateddeepcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridrcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid rcnn model done.')

    model = HybridGatedConvTopicModel(char_embed_matrix=char_embed_matrix,
                                      term_embed_matrix=term_embed_matrix,
                                      NUM_FEAT=8,
                                      PE=True,
                                      name='hybridgatedconvtopicmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    print('hybrid gated conv topic done.')

    y = fasttextmodel.predict_char()
    ys.append(y)

    y = fasttextmodel.predict_term()
    ys.append(y)
    print(y.shape)
    print('fast text done.')

    #hybrid model
    # model = HybridModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix, NUM_FEAT=8)# + 37
    # model.load_weights()
    # y = model.predict([x, xterm, xfeat])
    # ys.append( y )
    # print(y.shape)
    # print('hybrid model done.')

    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']  # human author, auto summary, machine author, machine translation
    y_pred = np.mean(ys, axis=0)
    y_pred = convert_y(y_pred)
    out_file = 'result.csv'
    with open(out_file, 'w', encoding='utf-8') as fout:
        for id, yi in zip(ids, y_pred):
            label = labels[yi]
            fout.write('{},{}\n'.format(id, label))
    print('done.')
def predict2():
    """
    Ensemble by voting: average the models' one-hot classification results.
    :return:
    """
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    val_conf = ValidConfigure()
    data_dict = data_utils.pickle_load(val_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = data_utils.pickle_load(val_conf.feat_file)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)

    ys = []
    print('define model')
    #hybrid model
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # + 37
    print('feat shape', xfeat.shape)
    model.load_weights()
    y = model.predict([x, xterm, xfeat])
    ys.append(convert_onehot(y))
    print('hybrid model done.')
    #CNN model (char)
    model = CharModel(embed_matrix=char_embed_matrix)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))
    print('char model done.')

    model = CharModel(embed_matrix=char_embed_matrix,
                      name='charmodel_PE.h5',
                      PE=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))
    print('char model done.')

    model = CharModel(embed_matrix=char_embed_matrix,
                      name='charmodel_PE_OE.h5',
                      PE=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))
    print('char model done.')

    #CNN (term)
    model = TermModel(embed_matrix=term_embed_matrix)
    model.load_weights()
    y = model.predict(xterm)
    ys.append(convert_onehot(y))
    print('term model done.')

    model = DeepCNNModel(embed_matrix=char_embed_matrix)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))
    print('deep cnn done.')
    # # attention model (char)
    # model = AttModel(MAX_LEN=600, name='charattmodel.h5', embed_matrix=char_embed_matrix)
    # model.load_weights()
    # y = model.predict(x)
    # ys.append(convert_onehot(y))
    # # attention model (term)
    # model = AttModel(MAX_LEN=300, embed_matrix=term_embed_matrix)
    # model.load_weights()
    # y = model.predict(xterm)
    # ys.append(convert_onehot(y))
    #
    # model = ConditionModel(embed_matrix=char_embed_matrix)
    # model.load_weights()
    # y = model.predict(x)
    # ys.append(convert_onehot(y))

    model = SSCharModel(embed_matrix=char_embed_matrix,
                        name='sscharmodel_PE.h5',
                        PE=True,
                        train_embed=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))

    model = SSCharModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))

    model = GatedConvModel(embed_matrix=char_embed_matrix,
                           name='gatedconvmodel_PE.h5',
                           PE=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))

    model = GatedConvModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))

    model = GatedDeepCNNModel(embed_matrix=char_embed_matrix,
                              name='gateddeepcnnmodel_PE.h5',
                              PE=True,
                              train_embed=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))

    model = GatedDeepCNNModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))

    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']  # human author, auto summary, machine author, machine translation
    y_pred = np.mean(ys, axis=0)
    y_pred = convert_y(y_pred)
    out_file = 'result.csv'
    with open(out_file, 'w', encoding='utf-8') as fout:
        for id, yi in zip(ids, y_pred):
            label = labels[yi]
            fout.write('{},{}\n'.format(id, label))
    print('done.')
    def predict(self, x):
        y_pred = self.model.predict(x, batch_size=512)
        return y_pred


if __name__ == '__main__':
    import sys
    tn_conf = TrainConfigure()
    if len(sys.argv) > 1 and sys.argv[1] == 'char':
        if len(sys.argv) > 2 and sys.argv[2] == 'pe':
            print('define char model with position embedding')
            print('load data')
            import data_utils, training_utils

            data_dict = data_utils.pickle_load(tn_conf.char_file)
            y = to_categorical(data_dict['y'])
            char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
            char_embed_matrix = data_utils.load_embedding(
                char_vocab_dict,
                'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                dump_path='data/char_embed.pkl')
            xe = [[i for i in range(600)] for _ in range(y.shape[0])]
            xe = np.array(xe)
            x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe],
                                                          y,
                                                          shuffle=False)
            x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                            y_tn,
                                                            shuffle=False)
            print('train')
def train_model_peoe():
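    """Train a HybridDPCNNModel with position embeddings plus several extra
    char/term embedding matrices of our own (OE), trained with different
    window sizes; a second pass fine-tunes the embeddings."""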
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')

    char_embeds = []
    char_embed_matrix_oe = data_utils.load_our_embedding(vocab_dict)
    char_embeds.append(char_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        # for sg in [0,1]:
        embed_file = 'data/char_embed_{}_{}.model'.format(windows, sg)
        char_embed_tmp = data_utils.load_our_embedding(
            vocab_dict, model_file=embed_file,
            dump_path='data/our_char_embed_{}_{}.pkl'.format(windows, sg))
        char_embeds.append(char_embed_tmp)

    term_embeds = []
    term_embed_matrix_oe = data_utils.load_our_embedding(
        term_vocab_dict, model_file='data/term_embed.model',
        dump_path='data/our_term_embed.pkl')
    term_embeds.append(term_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        # for sg in [0,1]:
        embed_file = 'data/term_embed_{}_{}.model'.format(windows, sg)
        term_embed_tmp = data_utils.load_our_embedding(
            term_vocab_dict, model_file=embed_file,
            dump_path='data/our_term_embed_{}_{}.pkl'.format(windows, sg))
        term_embeds.append(term_embed_tmp)

    MAX_LEN_TERM = 300
    name = 'hybriddpcnnmodel_PEOE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # After adding more embedding matrices, the learning rate must be lowered
    # for training to converge.
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             char_embeds=char_embeds, term_embeds=term_embeds,
                             lr=0.0004)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001,
                             char_embeds=char_embeds, term_embeds=term_embeds)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def predict(tn_conf, lda_file, val_conf, val_conf100, val_conf200):
    """
    Ensemble by averaging the models' predicted probabilities.
    :return:
    """
    print('load data')
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    xtopic = data_utils.pickle_load(lda_file)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    data_dict = data_utils.pickle_load(val_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = data_utils.pickle_load(val_conf.feat_file)
    xfeat = scaler.transform(xfeat)
    print('feat shape', xfeat.shape)

    data_dict100 = data_utils.pickle_load(val_conf100.char_file)
    x100 = data_dict100['x']
    xterm100 = data_utils.pickle_load(val_conf100.term_file)
    xe100 = [[i for i in range(100)] for _ in range(y.shape[0])]
    xe100 = np.array(xe100)
    xe_term100 = [[i for i in range(100)] for _ in range(y.shape[0])]
    xe_term100 = np.array(xe_term100)

    data_dict200 = data_utils.pickle_load(val_conf200.char_file)
    x200 = data_dict200['x']
    xterm200 = data_utils.pickle_load(val_conf200.term_file)
    xe200 = [[i for i in range(200)] for _ in range(y.shape[0])]
    xe200 = np.array(xe200)
    xe_term200 = [[i for i in range(200)] for _ in range(y.shape[0])]
    xe_term200 = np.array(xe_term200)

    ys = []
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddensemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model

    model = HybridDenseMAModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8,
                               PE=True,
                               name='hybriddensemodelma_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('dense model done.')

    model = HybridSEModel(char_embed_matrix=char_embed_matrix,
                          term_embed_matrix=term_embed_matrix,
                          NUM_FEAT=8,
                          PE=True,
                          name='hybridsemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('se model done.')

    print('start len 100 model')
    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            MAX_LEN=100,
                            MAX_LEN_TERM=100,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridconvmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    MAX_LEN=100,
                                    MAX_LEN_TERM=100,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='hybridgateddeepcnnmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            MAX_LEN=100,
                            MAX_LEN_TERM=100,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridrcnnmodel_n100.h5')
    model.load_weights()
    y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid RCNN model done.')

    # print('start len 200 model')
    # model = HybridConvModel(char_embed_matrix=char_embed_matrix,
    #                         term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                         PE=True, name='hybridconvmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid conv model done.')
    #
    # model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
    #                          term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                          PE=True, name='hybriddpcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid dpcnn model done.')
    #
    # model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
    #                                 term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                                 PE=True, name='hybridgateddeepcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid gated deep cnn model done.')
    #
    # model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
    #                         term_embed_matrix=term_embed_matrix, MAX_LEN=200, MAX_LEN_TERM=200,NUM_FEAT=8,
    #                         PE=True, name='hybridrcnnmodel_n200.h5')
    # model.load_weights()
    # y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    # ys.append(y)
    # del model

    # This model is too slow, so it is left commented out.
    # model = ConditionAttModel(char_embed_matrix=char_embed_matrix,
    #                           term_embed_matrix=term_embed_matrix, NUM_FEAT=8, PE=True,
    #                           name='conditionattmodel_PE.h5', lr=0.001)
    # model.load_weights()
    # y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    # ys.append(y)
    # print('condition att model done.')

    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8,
                               PE=True,
                               name='conditionconvmodel_PE.h5',
                               lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition conv model done.')

    model = ConditionDPCNNModel(char_embed_matrix=char_embed_matrix,
                                term_embed_matrix=term_embed_matrix,
                                NUM_FEAT=8,
                                PE=True,
                                name='conditiondpcnnmodel_PE.h5',
                                lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition dpcnn model done.')

    model = ConditionGatedConvModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='conditiongatedconvmodel_PE.h5',
                                    lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated conv model done.')

    model = ConditionGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                       term_embed_matrix=term_embed_matrix,
                                       NUM_FEAT=8,
                                       PE=True,
                                       name='conditiongateddeepcnnmodel_PE.h5',
                                       lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated deepcnn model done.')

    model = HybridAttModel(char_embed_matrix=char_embed_matrix,
                           term_embed_matrix=term_embed_matrix,
                           NUM_FEAT=8,
                           PE=True,
                           name='hybridattmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid att model done.')

    model = HybridConvModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridconvmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')

    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8,
                             PE=True,
                             name='hybriddpcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid dpcnn model done.')

    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix,
                                    term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8,
                                    PE=True,
                                    name='hybridgateddeepcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')

    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix,
                            term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8,
                            PE=True,
                            name='hybridrcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid rcnn model done.')

    model = HybridGatedConvTopicModel(char_embed_matrix=char_embed_matrix,
                                      term_embed_matrix=term_embed_matrix,
                                      NUM_FEAT=8,
                                      PE=True,
                                      name='hybridgatedconvtopicmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated conv topic done.')

    y = fasttextmodel.predict_char()
    ys.append(y)

    y = fasttextmodel.predict_term()
    ys.append(y)
    print(y.shape)
    print('fast text done.')

    #hybrid model
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # + 37
    model.load_weights()
    y = model.predict([x, xterm, xfeat])
    ys.append(y)
    print(y.shape)
    print('hybrid model done.')
    ys = np.array(ys)
    print(ys.shape)
    return ys
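
# ys stacks one class-probability matrix per base model, presumably of shape
# (n_models, n_samples, n_classes). A minimal sketch of reducing such an
# ensemble to labels by unweighted averaging (average_ensemble is a
# hypothetical helper, not part of this repository):
import numpy as np

def average_ensemble(ys):
    mean_proba = np.asarray(ys).mean(axis=0)  # average over the model axis
    return np.argmax(mean_proba, axis=1)      # winning class per sample
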
def load_grid(num_psf):
    """
    Load grid
    """
    log.info("Load Grid data")
    return pickle_load('grid_{}.pickle.gz'.format(num_psf), compressed=True)
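
# pickle_load(..., compressed=True) implies a gzip-aware pickle reader. A
# minimal sketch of such a helper, assuming gzip compression (illustrative,
# not the repository's actual pickle_load):
import gzip
import pickle

def pickle_load_sketch(path, compressed=False):
    opener = gzip.open if compressed else open  # route through gzip if needed
    with opener(path, 'rb') as f:
        return pickle.load(f)
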
Exemple #21
0
def stacking_main_condition():
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
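    # Persist the fitted scaler so validation/inference reuse the training-set feature scaling.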
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
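    # Position-index inputs (0..599 per char sequence, 0..299 per term sequence),
    # presumably for the PE (positional-embedding) branch.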
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    xtopic = data_utils.pickle_load('data/lda_vec.pkl')

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')

    # 3-fold settings, superseded by the 5-fold configuration below:
    # name = 'model/stack_condition_model.pkl'
    # model_dir = 'model/stack/'
    # n_fold = 3
    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    stk_model = stacking(n_fold, name=name, is_condition=True)
    conf = conditionmodelbase.ModelConfigure()
    conf.PE = True
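    # PE=True: the base models take the position-index inputs (xe, xe_term) built above.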
    stk_model.add_model(ConditionConvModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                             "term_embed_matrix": term_embed_matrix,
                                             "name": model_dir + 'conditionconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedConvModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                                  "term_embed_matrix": term_embed_matrix,
                                                  "name": model_dir + 'conditiongatedconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedDeepCNNModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                                     "term_embed_matrix": term_embed_matrix,
                                                     "name": model_dir + 'conditiongateddeepcnnmodel_PE.h5'})
    conf.lr = 0.0005
    stk_model.add_model(ConditionDPCNNModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                              "term_embed_matrix": term_embed_matrix,
                                              "name": model_dir + 'conditiondpcnnmodel_PE.h5'})
    # Sample 0.1 of the data for testing
    # x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.005, shuffle=False)
    # x_tn, y_tn, x_ts, y_ts = training_utils.split(x_tn, y_tn, shuffle=False)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.95)
    stk_model.fit(x_tn, y_tn)
    # joblib.dump(stk_model, 'model/stack_model_3.pkl')
    y_pred = stk_model.predict(x_ts)
    acc = accuracy_score(training_utils.convert_y(y_pred), training_utils.convert_y(y_ts))
    print(acc)
    cnf_matrix = confusion_matrix(training_utils.convert_y(y_pred), training_utils.convert_y(y_ts))
    print(cnf_matrix)
    stk_model.save()
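
# For orientation: stacking(n_fold, ...) presumably follows the standard
# out-of-fold recipe, where each base model predicts the fold it never saw and
# a meta-model learns from the concatenated out-of-fold probabilities. A
# minimal sketch under those assumptions (oof_stacking and base_fits are
# illustrative names; the real inputs here are lists of arrays, simplified to
# a single feature matrix):
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

def oof_stacking(base_fits, X, y_labels, n_fold=5):
    # base_fits: callables (X_tn, y_tn, X_ts) -> (len(X_ts), n_classes) probabilities
    n_classes = len(np.unique(y_labels))
    oof = np.zeros((len(y_labels), len(base_fits) * n_classes))
    for tn_idx, ts_idx in KFold(n_splits=n_fold, shuffle=True, random_state=0).split(X):
        for j, fit in enumerate(base_fits):
            proba = fit(X[tn_idx], y_labels[tn_idx], X[ts_idx])
            oof[ts_idx, j * n_classes:(j + 1) * n_classes] = proba
    # The meta-model combines the base models' out-of-fold outputs.
    return LogisticRegression(max_iter=1000).fit(oof, y_labels)
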
Exemple #22
0
def train_model_ftoe(model_conf,
                     model_name='hybridconvmodel_FTOE.h5',
                     ModelClass=HybridModelBase,
                     char_embed_file=None,
                     term_embed_file=None):
    print(model_name)
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    # char_embed_matrix = data_utils.load_embedding(vocab_dict,
    #                                               'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed.pkl')
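    # The [5:] slices below strip the leading 'data/' prefix; both embed file
    # paths are assumed to start with 'data/'.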
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        char_embed_file,
        dump_path='data/{}.pkl'.format(char_embed_file[5:]))
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        term_embed_file,
        dump_path='data/{}.pkl'.format(term_embed_file[5:]))
    MAX_LEN_TERM = 300
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

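    # Position-index rows, one per sample, for the positional-embedding inputs.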
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, split_ratio=0.95, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                    y_tn,
                                                    split_ratio=0.95,
                                                    shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=False,
                       train_top=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model_conf.lr *= 0.5
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
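
# The two passes above implement a freeze-then-unfreeze schedule: phase one
# trains the top layers over frozen pretrained embeddings, phase two reloads
# the weights, unfreezes the embeddings, and continues at half the learning
# rate. A minimal Keras sketch of the same idea (build_sketch and its tiny
# architecture are illustrative, not the repository's model classes):
from tensorflow.keras import layers, models, optimizers

def build_sketch(embed_matrix, max_len, n_classes, train_embed, lr):
    inp = layers.Input(shape=(max_len,))
    h = layers.Embedding(embed_matrix.shape[0], embed_matrix.shape[1],
                         weights=[embed_matrix], trainable=train_embed)(inp)
    h = layers.GlobalAveragePooling1D()(h)
    out = layers.Dense(n_classes, activation='softmax')(h)
    m = models.Model(inp, out)
    m.compile(optimizer=optimizers.Adam(learning_rate=lr),
              loss='categorical_crossentropy', metrics=['accuracy'])
    return m

# Phase 1: build_sketch(..., train_embed=False, lr=lr), fit, save weights.
# Phase 2: rebuild with train_embed=True and lr / 2, load the phase-1 weights,
# and keep fitting.
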
Exemple #23
0
def train_main():
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # +37
    print('feat shape', xfeat.shape)
    import sys
    if len(sys.argv) <= 1 or sys.argv[1] == 'train':
        print('train')
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    if len(sys.argv) > 1 and sys.argv[1] == 'val':
        val_conf = ValidConfigure()
        data_dict = data_utils.pickle_load(val_conf.char_file)
        y = to_categorical(data_dict['y'])
        x = data_dict['x']
        ids = data_dict['id']
        xterm = data_utils.pickle_load(val_conf.term_file)
        xfeat = data_utils.pickle_load(val_conf.feat_file)
        xfeat = scaler.transform(xfeat)
        model.load_weights()
        model.test([x, xterm, xfeat], ids, val_conf.out_file)

    if len(sys.argv) > 1 and sys.argv[1] == 'error':
        start_index = y_tn.shape[0] + y_val.shape[0]
        texts = data_utils.load_all_text(tn_conf)
        model.load_weights()
        model.error_analysis(x_ts, y_ts, texts, start_index)
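
# train_main dispatches on the first command-line argument. Assuming it is the
# script's entrypoint (the file name below is hypothetical), it would be run as:
if __name__ == '__main__':
    train_main()

# python train.py train   -> train the hybrid model
# python train.py val     -> load weights, predict the validation set, write val_conf.out_file
# python train.py error   -> load weights, run error analysis on the held-out split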