These snippets reference np, to_categorical, and project-local helpers (data_utils, training_utils, and the various model classes) without importing them; they are excerpts, and presumably rely on module-level imports such as "import numpy as np" and "from keras.utils import to_categorical" in their source files.

Example #1
def train_model_pe(use_char=True):
    print('define model')
    print('load data')
    import data_utils, training_utils
    tn_conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load('data.dict')
    y = to_categorical(data_dict['y'])
    if use_char:
        vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                      'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                      dump_path='data/char_embed.pkl')
        x = data_dict['x']
        MAX_LEN = 600
        name = 'conditionmodel_PE.h5'
    else:
        x = data_utils.pickle_load(tn_conf.term_file)
        vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                      'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                      dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'conditionmodel_term_PE.h5'
    # position indices 0..MAX_LEN-1 for every sample: the extra input
    # consumed when PE (position embedding) is enabled
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')

    model = ConditionModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name, PE=True)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
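Example #2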
def train_model_pe():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    y = to_categorical(data_dict['y'])

    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = SSCharModel(embed_matrix=embed_matrix,
                        name='sscharmodel_PE.h5',
                        PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = SSCharModel(embed_matrix=embed_matrix,
                        name='sscharmodel_PE.h5',
                        PE=True,
                        train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
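Both functions above (and every later example) unpack training_utils.split as x_tn, y_tn, x_ts, y_ts. The helper itself is not shown on this page; a minimal sketch consistent with how it is called could look like this (the default ratio, the index handling, and the list-of-arrays support are assumptions):

import numpy as np

def split(x, y, split_ratio=0.9, shuffle=False):
    # Split x (one array, or a list of parallel arrays) and y, returning
    # (x_train, y_train, x_test, y_test) in the order unpacked above.
    n = y.shape[0]
    idx = np.random.permutation(n) if shuffle else np.arange(n)
    cut = int(n * split_ratio)
    tn, ts = idx[:cut], idx[cut:]
    if isinstance(x, list):
        return [a[tn] for a in x], y[tn], [a[ts] for a in x], y[ts]
    return x[tn], y[tn], x[ts], y[ts]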
Example #3
def train_model(use_char=False):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    if use_char:
        vocab_dict = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                 dump_path='data/char_embed.pkl')

        MAX_LEN = 600
        name = 'deepcnn_model.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab_dict = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(vocab_dict,
                                                 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                 dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'deepcnn_model_term.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    x_tn, y_tn, x_ts, y_ts = training_utils.split(x, y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = DeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
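Example #4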
def train_model(use_char=True):
    print('train condition conv model.\nload data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'conditionconvmodel.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat, xt],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN,
                               NUM_FEAT=8,
                               name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN,
                               NUM_FEAT=8,
                               name=name,
                               train_embed=True,
                               train_top=False,
                               lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
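The train / delete / rebuild pattern above is a two-phase fine-tuning schedule: phase one trains the network on top of frozen pretrained embeddings, and phase two reloads those weights, unfreezes the embedding (train_embed=True), freezes the top (train_top=False), and continues at a lower learning rate. In Keras the flags presumably toggle layer trainable attributes; a self-contained sketch of the idea (the layer layout and sizes are illustrative, not taken from ConditionConvModel):

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense

vocab_size, dim, n_classes = 5000, 300, 3        # illustrative sizes
embed_matrix = np.random.rand(vocab_size, dim)   # stands in for the loaded vectors

model = Sequential([
    Embedding(vocab_size, dim, weights=[embed_matrix], trainable=False),
    GlobalAveragePooling1D(),
    Dense(n_classes, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
# ... phase 1: fit() with the embedding frozen ...

model.layers[0].trainable = True    # train_embed=True
model.layers[-1].trainable = False  # train_top=False
model.compile(optimizer='adam', loss='categorical_crossentropy')
# ... phase 2: reload the phase-1 weights and fit() again with a lower lr ...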
Example #5
def train_model_n200(model_conf, name, ModelClass):
    print(name)
    import training_utils
    import data_utils200 as data_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')

    MAX_LEN = conf.MAX_LEN
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = conf.MAX_LEN
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
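The position-index inputs (xe, xe_term) are built with a nested list comprehension that recreates the row [0, 1, ..., MAX_LEN-1] once per sample; np.tile produces the same matrix directly:

import numpy as np

n_samples, MAX_LEN = 4, 200  # illustrative sizes
xe_loop = np.array([[i for i in range(MAX_LEN)] for _ in range(n_samples)])
xe_tile = np.tile(np.arange(MAX_LEN), (n_samples, 1))
assert (xe_loop == xe_tile).all()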
Example #6
def train_main_pe():
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    xe_char = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe_char = np.array(xe_char)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xterm, xfeat, xe_char, xe_term], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8,
                        name='hybridmodel_PE.h5',
                        PE=True)  # +37
    print('feat shape', xfeat.shape)
    print('train')
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
Example #7
def train_model_pe(use_char=False):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    if use_char:
        vocab_dict = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/char_embed.pkl')

        MAX_LEN = 600
        name = 'gateddeepcnnmodel_PE.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab_dict = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'gateddeepcnnmodel_term_PE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = GatedDeepCNNModel(embed_matrix=embed_matrix,
                              MAX_LEN=MAX_LEN,
                              name=name,
                              PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = GatedDeepCNNModel(embed_matrix=embed_matrix,
                              MAX_LEN=MAX_LEN,
                              name=name,
                              PE=True,
                              train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
Example #8
def train_model():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.term_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    print('load embed done.')
    x_tn, y_tn, x_ts, y_ts = training_utils.split(xterm, y, shuffle=True)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=True)
    print('train')
    print('define model')
    model = TermModel(embed_matrix=embed_matrix)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
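Example #9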
def train_model_cv(cv_index, cv_num):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'hybriddensemodel_cv{}.h5'.format(cv_index)
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x, xe, xterm, xe_term, xfeat, xt], y, cv_index=cv_index, cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
Example #10
def fit(self, X_train, y_train):
    self.X_train = X_train
    self.y_train = y_train
    y_test_list = []
    y_pred_list = []
    for i in range(self.n_fold):
        print('cross ', i + 1, '/', self.n_fold)
        X_train_s, y_train_s, X_test_s, y_test_s = training_utils.split_cv(
            X_train, y_train, cv_num=self.n_fold, cv_index=i)
        X_train_s, y_train_s, X_val_s, y_val_s = training_utils.split(
            X_train_s, y_train_s, split_ratio=0.95)
        y_pred_s = None
        for models in self.base_models:
            model = models[i]
            print(model.name)
            if self.is_condition:
                model.train_exp(X_train_s, y_train_s, X_val_s, y_val_s,
                                X_test_s, y_test_s)
            else:
                model.train(X_train_s, y_train_s, X_val_s, y_val_s,
                            X_test_s, y_test_s)
            # use the one-hot prediction as a stacking feature
            if y_pred_s is None:
                y_pred_s = model.predict(X_test_s)
            else:
                y_pred_s = np.hstack((y_pred_s, model.predict(X_test_s)))
        y_test_list.append(y_test_s)
        y_pred_list.append(y_pred_s)
    # train the top model, with y_pred_list as features and
    # y_test_list as targets
    X_top = np.vstack(y_pred_list)
    y_top = np.vstack(y_test_list)
    y_top = training_utils.convert_y(y_top)
    if self.is_condition:
        X_top = np.squeeze(X_top, axis=-1)
    print(X_top.shape, y_top.shape)
    self.top_model = LogisticRegression()
    self.top_model.fit(X_top, y_top)
    print(X_top)
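fit only trains the per-fold base models and the logistic-regression top model; the matching predict is not shown on this page. A plausible sketch (hypothetical, in particular the averaging over fold models) that rebuilds the same stacked feature layout:

def predict(self, X):
    # Hypothetical companion to fit(): concatenate each base model's
    # class probabilities (averaged over its per-fold models) in the
    # same column order fit() used, then apply the top-level classifier.
    feats = []
    for models in self.base_models:
        fold_preds = [m.predict(X) for m in models]
        feats.append(np.mean(fold_preds, axis=0))
    X_top = np.hstack(feats)
    if self.is_condition:
        X_top = np.squeeze(X_top, axis=-1)
    return self.top_model.predict(X_top)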
Example #11
def fit_tmp(self, X_train, y_train):
    self.X_train = X_train
    self.y_train = y_train
    y_test_list = []
    y_pred_list = []
    for i in range(self.n_fold):
        print('cross ', i + 1, '/', self.n_fold)
        X_train_s, y_train_s, X_test_s, y_test_s = training_utils.split_cv(
            X_train, y_train, cv_num=self.n_fold, cv_index=i)
        X_train_s, y_train_s, X_val_s, y_val_s = training_utils.split(
            X_train_s, y_train_s, split_ratio=0.9)
        y_pred_s = None
        for models in self.base_models:
            model = models[i]
            print(model.name)
            # training is deliberately skipped in this variant; the fold
            # models are assumed to be trained already:
            # if self.is_condition:
            #     model.train_exp(X_train_s, y_train_s, X_val_s, y_val_s, X_test_s, y_test_s)
            # else:
            #     model.train(X_train_s, y_train_s, X_val_s, y_val_s, X_test_s, y_test_s)
            # use the one-hot prediction as a stacking feature
            if y_pred_s is None:
                y_pred_s = model.predict(X_test_s)
            else:
                y_pred_s = np.hstack((y_pred_s, model.predict(X_test_s)))
        y_test_list.append(y_test_s)
        y_pred_list.append(y_pred_s)
    # train the top model, with y_pred_list as features and
    # y_test_list as targets
    X_top = np.vstack(y_pred_list)
    if self.is_condition:
        X_top = np.squeeze(X_top, axis=-1)
    y_top = np.vstack(y_test_list)
    y_top = training_utils.convert_y(y_top)
    print(X_top.shape, y_top.shape)
    self.top_model = LogisticRegression()
    self.top_model.fit(X_top, y_top)
    print(X_top)
    joblib.dump(self.top_model, self.name)
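Both fit variants call training_utils.split_cv(X, y, cv_num=..., cv_index=...) and unpack (X_train_s, y_train_s, X_test_s, y_test_s). A minimal sketch consistent with that contract (contiguous, deterministic folds are an assumption):

import numpy as np

def split_cv(x, y, cv_index, cv_num):
    # Hold out fold cv_index of cv_num contiguous folds as the test set;
    # everything else is training data. Handles the list-of-arrays
    # inputs used throughout these examples.
    n = y.shape[0]
    fold = n // cv_num
    lo = cv_index * fold
    hi = (cv_index + 1) * fold if cv_index < cv_num - 1 else n
    tn = np.concatenate([np.arange(0, lo), np.arange(hi, n)])
    ts = np.arange(lo, hi)
    take = lambda a, idx: [ai[idx] for ai in a] if isinstance(a, list) else a[idx]
    return take(x, tn), y[tn], take(x, ts), y[ts]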
Example #12
        if len(sys.argv) > 2 and sys.argv[2] == 'pe':
            print('define char model with position embedding')
            print('load data')
            import data_utils, training_utils

            data_dict = data_utils.pickle_load(tn_conf.char_file)
            y = to_categorical(data_dict['y'])
            char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
            char_embed_matrix = data_utils.load_embedding(
                char_vocab_dict,
                'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                dump_path='data/char_embed.pkl')
            xe = [[i for i in range(600)] for _ in range(y.shape[0])]
            xe = np.array(xe)
            x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe],
                                                          y,
                                                          shuffle=False)
            x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                            y_tn,
                                                            shuffle=False)
            print('train')
            model = RCNNModel(MAX_LEN=600,
                              name='RCNNmodel_char_PE.h5',
                              embed_matrix=char_embed_matrix,
                              PE=True)
            model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
        else:
            print('define char model')
            print('load data')
            import data_utils, training_utils
            data_dict = data_utils.pickle_load(tn_conf.char_file)
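Example #13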
def train_model_peoe():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')

    char_embeds = []
    char_embed_matrix_oe = data_utils.load_our_embedding(vocab_dict)
    char_embeds.append(char_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        # for sg in [0, 1]:
        embed_file = 'data/char_embed_{}_{}.model'.format(windows, sg)
        char_embed_tmp = data_utils.load_our_embedding(
            vocab_dict, model_file=embed_file,
            dump_path='data/our_char_embed_{}_{}.pkl'.format(windows, sg))
        char_embeds.append(char_embed_tmp)

    term_embeds = []
    term_embed_matrix_oe = data_utils.load_our_embedding(
        term_vocab_dict, model_file='data/term_embed.model',
        dump_path='data/our_term_embed.pkl')
    term_embeds.append(term_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        # for sg in [0, 1]:
        embed_file = 'data/term_embed_{}_{}.model'.format(windows, sg)
        term_embed_tmp = data_utils.load_our_embedding(
            term_vocab_dict, model_file=embed_file,
            dump_path='data/our_term_embed_{}_{}.pkl'.format(windows, sg))
        term_embeds.append(term_embed_tmp)

    MAX_LEN_TERM = 300
    name = 'hybriddpcnnmodel_PEOE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # with more embedding models added, the learning rate has to be
    # lowered for training to converge
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             char_embeds=char_embeds, term_embeds=term_embeds,
                             lr=0.0004)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001,
                             char_embeds=char_embeds, term_embeds=term_embeds)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
Example #14
def train_main():
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')

    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat],
                                                  y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # +37
    print('feat shape', xfeat.shape)
    import sys
    if len(sys.argv) <= 1 or sys.argv[1] == 'train':
        print('train')
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    if len(sys.argv) > 1 and sys.argv[1] == 'val':
        val_conf = ValidConfigure()
        data_dict = data_utils.pickle_load(val_conf.char_file)
        y = to_categorical(data_dict['y'])
        x = data_dict['x']
        ids = data_dict['id']
        xterm = data_utils.pickle_load(val_conf.term_file)
        xfeat = data_utils.pickle_load(val_conf.feat_file)
        xfeat = scaler.transform(xfeat)
        model.load_weights()
        model.test([x, xterm, xfeat], ids, val_conf.out_file)

    if len(sys.argv) > 1 and sys.argv[1] == 'error':
        start_index = y_tn.shape[0] + y_val.shape[0]
        texts = data_utils.load_all_text(tn_conf)
        model.load_weights()
        model.error_analysis(x_ts, y_ts, texts, start_index)
Example #15
def stacking_main_condition():
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    xtopic = data_utils.pickle_load('data/lda_vec.pkl')

    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')

    # the 3-fold configuration was superseded by the 5-fold one below
    # name = 'model/stack_condition_model.pkl'
    # model_dir = 'model/stack/'
    # n_fold = 3
    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    stk_model = stacking(n_fold, name=name, is_condition=True)
    conf = conditionmodelbase.ModelConfigure()
    conf.PE = True
    stk_model.add_model(ConditionConvModel,
                        {"conf": conf,
                         "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditionconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedConvModel,
                        {"conf": conf,
                         "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiongatedconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedDeepCNNModel,
                        {"conf": conf,
                         "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiongateddeepcnnmodel_PE.h5'})
    conf.lr = 0.0005
    stk_model.add_model(ConditionDPCNNModel,
                        {"conf": conf,
                         "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiondpcnnmodel_PE.h5'})
    # sample a small fraction of the data for quick testing
    # x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.005, shuffle=False)
    # x_tn, y_tn, x_ts, y_ts = training_utils.split(x_tn, y_tn, shuffle=False)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.95)
    stk_model.fit(x_tn, y_tn)
    # joblib.dump(stk_model, 'model/stack_model_3.pkl')
    y_pred = stk_model.predict(x_ts)
    acc = accuracy_score(training_utils.convert_y(y_pred),
                         training_utils.convert_y(y_ts))
    print(acc)
    cnf_matrix = confusion_matrix(training_utils.convert_y(y_pred),
                                  training_utils.convert_y(y_ts))
    print(cnf_matrix)
    stk_model.save()
Example #16
def train_model_ftoe(model_conf,
                     model_name='hybridconvmodel_FTOE.h5',
                     ModelClass=HybridModelBase,
                     char_embed_file=None,
                     term_embed_file=None):
    print(model_name)
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    # char_embed_matrix = data_utils.load_embedding(vocab_dict,
    #                                               'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed.pkl')
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        char_embed_file,
        # [5:] strips the leading 'data/' so the cache file keeps the
        # embedding file's own name
        dump_path='data/{}.pkl'.format(char_embed_file[5:]))
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        term_embed_file,
        dump_path='data/{}.pkl'.format(term_embed_file[5:]))
    MAX_LEN_TERM = 300
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, split_ratio=0.95, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                    y_tn,
                                                    split_ratio=0.95,
                                                    shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=False,
                       train_top=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model_conf.lr *= 0.5
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=model_name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
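Every example funnels its pretrained vectors through data_utils.load_embedding(vocab_dict, embed_file, dump_path=...). The implementation is not included on this page; a minimal sketch of what such a cached loader might do (the word2vec text format, the random initialization of out-of-vocabulary rows, and the 300-dimension default are all assumptions):

import os
import pickle
import numpy as np

def load_embedding(vocab_dict, embed_file, dump_path, dim=300):
    # Return a (vocab_size, dim) matrix, built once from a word2vec-style
    # text file and cached as a pickle at dump_path.
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            return pickle.load(f)
    matrix = np.random.uniform(-0.05, 0.05, (len(vocab_dict), dim))
    with open(embed_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != dim + 1:
                continue  # skip the "<vocab> <dim>" header line
            word = parts[0]
            if word in vocab_dict:
                matrix[vocab_dict[word]] = np.asarray(parts[1:], dtype='float32')
    with open(dump_path, 'wb') as f:
        pickle.dump(matrix, f)
    return matrix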