Example #1
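All of the snippets on this page assume a few module-level imports that the functions themselves do not declare. A minimal preamble, assuming Keras-style utilities (the model classes come from this project's own modules, whose exact locations are not shown in these snippets):

import numpy as np
from keras.utils import to_categorical

# The model classes (ConditionModel, DeepCNNModel, SSCharModel, ConditionConvModel,
# HybridDenseModel, GatedDeepCNNModel, TermModel, HybridDPCNNModel, HybridModelBase)
# are project-specific and must be imported from wherever the project defines them.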
def train_model_pe(use_char=True):
    print('load data')
    import data_utils, training_utils
    tn_conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load('data.dict')
    y = to_categorical(data_dict['y'])
    if use_char:
        vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/char_embed.pkl')
        x = data_dict['x']
        MAX_LEN = 600
        name = 'conditionmodel_PE.h5'
    else:
        x = data_utils.pickle_load(tn_conf.term_file)
        vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'conditionmodel_term_PE.h5'
    # Position-encoding input: each row is [0, 1, ..., MAX_LEN - 1].
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')

    model = ConditionModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name, PE=True)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
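The xe array built above (and in several later examples) is a position-encoding input: row i is simply [0, 1, ..., MAX_LEN - 1], one copy per sample, which a model constructed with PE=True presumably feeds through a position-embedding layer. A vectorized equivalent of the nested comprehension, as a sketch:

import numpy as np

def position_indices(n_samples, max_len):
    # Same result as [[i for i in range(max_len)] for _ in range(n_samples)].
    return np.tile(np.arange(max_len), (n_samples, 1))

# position_indices(y.shape[0], 600).shape == (n_samples, 600)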
def train_model(use_char=False):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    if use_char:
        vocab_dict = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                 dump_path='data/char_embed.pkl')

        MAX_LEN = 600
        name = 'deepcnn_model.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab_dict = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                 dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'deepcnn_model_term.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    x_tn, y_tn, x_ts, y_ts = training_utils.split(x, y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = DeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
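training_utils.split itself is not shown anywhere on this page. From its call sites it takes either a single array or a list of parallel arrays, honours a shuffle flag (and, in Example #10, a split_ratio), and returns x_tn, y_tn, x_ts, y_ts. A plausible minimal implementation under those assumptions (the 0.8 default ratio is a guess):

import numpy as np

def split(x, y, split_ratio=0.8, shuffle=False):
    """Split x (an array or a list of parallel arrays) and y into train/test parts."""
    n = y.shape[0]
    idx = np.random.permutation(n) if shuffle else np.arange(n)
    cut = int(n * split_ratio)
    tn_idx, ts_idx = idx[:cut], idx[cut:]
    if isinstance(x, list):
        return ([xi[tn_idx] for xi in x], y[tn_idx],
                [xi[ts_idx] for xi in x], y[ts_idx])
    return x[tn_idx], y[tn_idx], x[ts_idx], y[ts_idx]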
def train_model_pe():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    y = to_categorical(data_dict['y'])

    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = SSCharModel(embed_matrix=embed_matrix,
                        name='sscharmodel_PE.h5',
                        PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = SSCharModel(embed_matrix=embed_matrix,
                        name='sscharmodel_PE.h5',
                        PE=True,
                        train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
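The pattern above recurs throughout this page: train once with the pretrained embeddings frozen, delete the model, rebuild it with train_embed=True (sometimes with train_top=False and a reduced learning rate), reload the saved weights, and fine-tune. A sketch of how the train_embed flag presumably maps onto a Keras Embedding layer (the helper below is illustrative, not the project's actual code):

from keras.layers import Embedding

def make_embedding(embed_matrix, max_len, train_embed=False):
    # Phase 1: trainable=False keeps the pretrained vectors fixed while the
    # classifier head trains. Phase 2: trainable=True fine-tunes them.
    return Embedding(input_dim=embed_matrix.shape[0],
                     output_dim=embed_matrix.shape[1],
                     weights=[embed_matrix],
                     input_length=max_len,
                     trainable=train_embed)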
def train_model(use_char=True):  # NOTE: use_char is accepted but unused in this variant
    print('train condition conv model.\nload data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'conditionconvmodel.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat, xt],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN,
                               NUM_FEAT=8,
                               name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN,
                               NUM_FEAT=8,
                               name=name,
                               train_embed=True,
                               train_top=False,
                               lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
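Note that the MinMaxScaler above is fit on the full feature matrix before the train/test split, so test statistics leak into the normalization; fitting on the training rows only would avoid that. The fitted scaler is pickled to conf.feat_norm so that inference can reuse the identical transform; the counterpart, as a sketch using the same helpers:

# At inference time, reload the fitted scaler rather than re-fitting it.
scaler = data_utils.pickle_load(conf.feat_norm)
xfeat = scaler.transform(xfeat)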
Example #5
def train_model_n200(model_conf, name, ModelClass):
    print(name)
    import data_utils200 as data_utils
    import training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')

    MAX_LEN = conf.MAX_LEN
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = conf.MAX_LEN
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
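A hypothetical invocation of train_model_n200; the configuration object and model class names below are placeholders, not names confirmed by these snippets:

# model_conf = ModelConf()                      # placeholder config object
# train_model_n200(model_conf, 'hybridconvmodel_n200.h5', HybridConvModel)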
def train_model_cv(cv_index, cv_num):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'hybriddensemodel_cv{}.h5'.format(cv_index)
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x, xe, xterm, xe_term, xfeat, xt], y, cv_index=cv_index, cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True,
                             name=name, train_embed=True, train_top=False, lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
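training_utils.split_cv is likewise not shown; from its call site it holds out fold cv_index of cv_num as the test set and returns the remaining rows for training. A plausible sketch, reusing the list-of-arrays convention assumed for split above:

import numpy as np

def split_cv(x, y, cv_index=0, cv_num=5):
    # Hold out fold cv_index of cv_num as the test set; the last fold
    # absorbs any remainder rows.
    n = y.shape[0]
    fold = n // cv_num
    end = n if cv_index == cv_num - 1 else (cv_index + 1) * fold
    ts_idx = np.arange(cv_index * fold, end)
    tn_idx = np.setdiff1d(np.arange(n), ts_idx)
    take = lambda xs, idx: [xi[idx] for xi in xs] if isinstance(xs, list) else xs[idx]
    return take(x, tn_idx), y[tn_idx], take(x, ts_idx), y[ts_idx]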
Example #7
def train_model_pe(use_char=False):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    if use_char:
        vocab_dict = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/char_embed.pkl')

        MAX_LEN = 600
        name = 'gateddeepcnnmodel_PE.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab_dict = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'gateddeepcnnmodel_term_PE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = GatedDeepCNNModel(embed_matrix=embed_matrix,
                              MAX_LEN=MAX_LEN,
                              name=name,
                              PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = GatedDeepCNNModel(embed_matrix=embed_matrix,
                              MAX_LEN=MAX_LEN,
                              name=name,
                              PE=True,
                              train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.term_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    print('load embed done.')
    x_tn, y_tn, x_ts, y_ts = training_utils.split(xterm, y, shuffle=True)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=True)
    print('train')
    print('define model')
    model = TermModel(embed_matrix=embed_matrix)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_peoe():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')

    char_embeds = []
    char_embed_matrix_oe = data_utils.load_our_embedding(vocab_dict)
    char_embeds.append(char_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        # for sg in [0,1]:
        embed_file = 'data/char_embed_{}_{}.model'.format(windows, sg)
        char_embed_tmp = data_utils.load_our_embedding(
            vocab_dict, model_file=embed_file,
            dump_path='data/our_char_embed_{}_{}.pkl'.format(windows, sg))
        char_embeds.append(char_embed_tmp)

    term_embeds = []
    term_embed_matrix_oe = data_utils.load_our_embedding(
        term_vocab_dict, model_file='data/term_embed.model',
        dump_path='data/our_term_embed.pkl')
    term_embeds.append(term_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        # for sg in [0,1]:
        embed_file = 'data/term_embed_{}_{}.model'.format(windows, sg)
        term_embed_tmp = data_utils.load_our_embedding(
            term_vocab_dict, model_file=embed_file,
            dump_path='data/our_term_embed_{}_{}.pkl'.format(windows, sg))
        term_embeds.append(term_embed_tmp)

    MAX_LEN_TERM = 300
    name = 'hybriddpcnnmodel_PEOE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # After adding more embedding models, the learning rate must be lowered for training to converge.
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             char_embeds=char_embeds, term_embeds=term_embeds,
                             lr=0.0004)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001,
                             char_embeds=char_embeds, term_embeds=term_embeds)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
Example #10
def train_model_ftoe(model_conf,
                     model_name='hybridconvmodel_FTOE.h5',
                     ModelClass=HybridModelBase,
                     char_embed_file=None,
                     term_embed_file=None):
    print(model_name)
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    # Unlike the other examples, the fixed sgns embedding path is replaced here
    # by the char_embed_file / term_embed_file arguments.

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    # The [5:] slice strips the leading 'data/' from the supplied embedding paths.
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        char_embed_file,
        dump_path='data/{}.pkl'.format(char_embed_file[5:]))
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        term_embed_file,
        dump_path='data/{}.pkl'.format(term_embed_file[5:]))
    MAX_LEN_TERM = 300
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, split_ratio=0.95, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                    y_tn,
                                                    split_ratio=0.95,
                                                    shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=False,
                       train_top=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model_conf.lr *= 0.5  # halve the learning rate for the fine-tuning phase
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
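A hypothetical call to train_model_ftoe; the embedding file names are placeholders (the [5:] slices above assume the paths begin with 'data/'), and HybridConvModel is an assumed class name:

# train_model_ftoe(model_conf,
#                  model_name='hybridconvmodel_FTOE.h5',
#                  ModelClass=HybridConvModel,
#                  char_embed_file='data/ft_char_embed.vec',
#                  term_embed_file='data/ft_term_embed.vec')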