def train_model_cv(cv_index, cv_num):
    """Train ``HybridDenseModel`` on one cross-validation fold.

    Loads the character-level and term-level inputs with their embedding
    matrices, the vectors stored in ``data/lda_vec.pkl`` and the feature
    matrix from ``conf.feat_file``, splits everything with
    ``training_utils.split_cv`` / ``training_utils.split`` and then trains in
    two passes: once with the model's default settings and once more, after
    ``load_weights()``, with ``train_embed=True, train_top=False, lr=0.001``.

    Side effects: prints progress messages and pickles the fitted
    ``MinMaxScaler`` to ``conf.feat_norm``.

    Args:
        cv_index: Fold index forwarded to ``training_utils.split_cv``.
        cv_num: Number of folds forwarded to ``training_utils.split_cv``.
    """
    print('load data')
    import data_utils
    import training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)

    # Both the char and the term matrix are read from the same vector file.
    embed_path = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'

    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(vocab_dict,
                                                  embed_path,
                                                  dump_path='data/char_embed.pkl')

    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  embed_path,
                                                  dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'hybriddensemodel_cv{}.h5'.format(cv_index)
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')

    xfeat = data_utils.pickle_load(conf.feat_file)
    # Min-max normalization; the fitted scaler is persisted to conf.feat_norm.
    # NOTE(review): the scaler is fit on all rows before the CV split below,
    # so the held-out rows influence the scaling — confirm this is intended.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    # One row [0, 1, ..., MAX_LEN - 1] per sample (and likewise for terms),
    # built directly in numpy instead of via nested Python lists.
    n_samples = y.shape[0]
    xe = np.tile(np.arange(MAX_LEN), (n_samples, 1))
    xe_term = np.tile(np.arange(MAX_LEN_TERM), (n_samples, 1))

    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x, xe, xterm, xe_term, xfeat, xt], y, cv_index=cv_index, cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')

    # Constructor arguments shared by both training passes.
    # NOTE(review): NUM_FEAT=8 is hard-coded; presumably it must match the
    # number of columns in xfeat — confirm.
    shared_kwargs = dict(char_embed_matrix=char_embed_matrix,
                         term_embed_matrix=term_embed_matrix,
                         MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name)

    # Pass 1: default training settings.
    model = HybridDenseModel(**shared_kwargs)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    # Drop the reference before the second model is built.
    del model

    # Pass 2: same architecture with train_embed=True / train_top=False and an
    # explicit learning rate. load_weights() takes no arguments here;
    # presumably it restores what pass 1 saved under `name` — confirm.
    model = HybridDenseModel(train_embed=True, train_top=False, lr=0.001,
                             **shared_kwargs)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
Ejemplo n.º 2
0
 def fit(self, X_train, y_train):
     """Train the base models fold by fold, then fit the top-level model.

     For every fold ``i`` in ``range(self.n_fold)`` the data is split with
     ``training_utils.split_cv`` and ``training_utils.split``. Each entry of
     ``self.base_models`` supplies its ``i``-th model, which is trained
     (``train_exp`` when ``self.is_condition`` is truthy, ``train``
     otherwise) and then predicts on the fold's test part. Predictions of
     all models are concatenated with ``np.hstack`` per fold and stacked
     with ``np.vstack`` across folds; a ``LogisticRegression`` is fit on the
     result and stored in ``self.top_model``.

     Args:
         X_train: Inputs passed to ``training_utils.split_cv``; also stored
             on ``self.X_train``.
         y_train: Targets passed to ``training_utils.split_cv``; also stored
             on ``self.y_train``.
     """
     self.X_train = X_train
     self.y_train = y_train
     y_test_list = []
     y_pred_list = []
     for i in range(self.n_fold):
         print('cross ', i+1, '/', self.n_fold)
         X_train_s, y_train_s, X_test_s, y_test_s = training_utils.split_cv(
             X_train, y_train, cv_num=self.n_fold, cv_index=i)
         X_train_s, y_train_s, X_val_s, y_val_s = training_utils.split(
             X_train_s, y_train_s, split_ratio=0.95)
         fold_preds = []
         for models in self.base_models:
             model = models[i]
             print(model.name)
             if self.is_condition:
                 model.train_exp(X_train_s, y_train_s, X_val_s, y_val_s, X_test_s, y_test_s)
             else:
                 model.train(X_train_s, y_train_s, X_val_s, y_val_s, X_test_s, y_test_s)
             # Use the one-hot representation when predictions serve as features.
             fold_preds.append(model.predict(X_test_s))
         y_test_list.append(y_test_s)
         # Concatenate all base-model predictions for this fold in one call.
         y_pred_list.append(np.hstack(fold_preds))
     # Use y_pred_list as features and y_test_list as targets to train the
     # second-level model.
     X_top = np.vstack(y_pred_list)
     y_top = np.vstack(y_test_list)
     # presumably converts one-hot targets to class labels — confirm in
     # training_utils.convert_y
     y_top = training_utils.convert_y(y_top)
     if self.is_condition:
         # Drop the trailing length-1 axis of the stacked predictions.
         X_top = np.squeeze(X_top, axis=-1)
     print(X_top.shape, y_top.shape)
     self.top_model = LogisticRegression()
     self.top_model.fit(X_top, y_top)
     print(X_top)
Ejemplo n.º 3
0
 def fit_tmp(self, X_train, y_train):
     """Fit only the top-level model from base-model predictions and save it.

     Unlike a full fit, no ``train``/``train_exp`` call is made on the base
     models here; each fold's ``i``-th models are only asked to predict on
     the test part returned by ``training_utils.split_cv``. The predictions
     are concatenated with ``np.hstack`` per fold and stacked with
     ``np.vstack`` across folds, a ``LogisticRegression`` is fit on them,
     stored in ``self.top_model`` and dumped with ``joblib`` to
     ``self.name``.

     Args:
         X_train: Inputs passed to ``training_utils.split_cv``; also stored
             on ``self.X_train``.
         y_train: Targets passed to ``training_utils.split_cv``; also stored
             on ``self.y_train``.
     """
     self.X_train = X_train
     self.y_train = y_train
     y_test_list = []
     y_pred_list = []
     for i in range(self.n_fold):
         print('cross ', i+1, '/', self.n_fold)
         X_train_s, y_train_s, X_test_s, y_test_s = training_utils.split_cv(
             X_train, y_train, cv_num=self.n_fold, cv_index=i)
         # The train/validation parts are not consumed below; the call is
         # kept in case split() has side effects (e.g. shuffling) — TODO
         # confirm and drop it if it is pure.
         training_utils.split(X_train_s, y_train_s, split_ratio=0.9)
         fold_preds = []
         for models in self.base_models:
             model = models[i]
             print(model.name)
             # Use the one-hot representation when predictions serve as features.
             fold_preds.append(model.predict(X_test_s))
         y_test_list.append(y_test_s)
         # Concatenate all base-model predictions for this fold in one call.
         y_pred_list.append(np.hstack(fold_preds))
     # Use y_pred_list as features and y_test_list as targets to train the
     # second-level model.
     X_top = np.vstack(y_pred_list)
     if self.is_condition:
         # Drop the trailing length-1 axis of the stacked predictions.
         X_top = np.squeeze(X_top, axis=-1)
     y_top = np.vstack(y_test_list)
     # presumably converts one-hot targets to class labels — confirm in
     # training_utils.convert_y
     y_top = training_utils.convert_y(y_top)
     print(X_top.shape, y_top.shape)
     self.top_model = LogisticRegression()
     self.top_model.fit(X_top, y_top)
     print(X_top)
     joblib.dump(self.top_model, self.name)
Ejemplo n.º 4
0
import numpy as np
import  training_utils
# Smoke check for training_utils.split_cv: feed it two random input arrays
# plus a random target vector and print the resulting shapes for every fold.
N, dim = 100, 10
cv_num = 5

# Draw order is x1, x2, then y, so a seeded RNG yields the same data.
x1, x2 = (np.random.rand(N, dim) for _ in range(2))
y = np.random.rand(N)

for cv_index in range(cv_num):
    # NOTE(review): the fold arguments are passed positionally as
    # (cv_num, cv_index) — confirm this matches split_cv's parameter order.
    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x1, x2], y, cv_num, cv_index
    )
    # Only the first input array of each split is inspected.
    print(*(arr.shape for arr in (x_tn[0], y_tn, x_ts[0], y_ts)))