def train_model_pe(use_char=True):
    print('load data')
    import data_utils, training_utils
    tn_conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load('data.dict')
    y = to_categorical(data_dict['y'])
    if use_char:
        vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/char_embed.pkl')
        x = data_dict['x']
        MAX_LEN = 600
        name = 'conditionmodel_PE.h5'
    else:
        x = data_utils.pickle_load(tn_conf.term_file)
        vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'conditionmodel_term_PE.h5'
    # position indices for the position-embedding (PE) input
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = ConditionModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name, PE=True)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    print('train')
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
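# NOTE: every train_* function here rebuilds the position-index matrix for the
# position-embedding (PE) input with a nested list comprehension. A vectorized
# equivalent is sketched below; `make_position_input` is an illustrative helper,
# not part of the original code.
def make_position_input(num_samples, max_len):
    """Return a (num_samples, max_len) int array whose rows are 0..max_len-1."""
    return np.tile(np.arange(max_len), (num_samples, 1))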
def train_model_pe():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = SSCharModel(embed_matrix=embed_matrix, name='sscharmodel_PE.h5', PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = SSCharModel(embed_matrix=embed_matrix, name='sscharmodel_PE.h5', PE=True,
                        train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model(use_char=False):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    if use_char:
        vocab_dict = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/char_embed.pkl')
        MAX_LEN = 600
        name = 'deepcnn_model.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab_dict = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'deepcnn_model_term.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    x_tn, y_tn, x_ts, y_ts = training_utils.split(x, y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = DeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model(use_char=True):
    print('train condition conv model.\nload data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'conditionconvmodel.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN, NUM_FEAT=8, name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN, NUM_FEAT=8, name=name,
                               train_embed=True, train_top=False, lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
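# NOTE: the train_* functions fit the MinMaxScaler on the full feature matrix
# before splitting into train/val/test, so the scaler sees test statistics.
# A leak-free sketch (illustrative only; `normalize_feats` is not in the
# original code) fits on the training split and reuses the scaler elsewhere:
def normalize_feats(xfeat_train, xfeat_test):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler().fit(xfeat_train)  # statistics from training rows only
    return scaler.transform(xfeat_train), scaler.transform(xfeat_test), scaler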
def train_model_n200(model_conf, name, ModelClass):
    print(name)
    import data_utils200 as data_utils
    import training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    MAX_LEN = conf.MAX_LEN
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = conf.MAX_LEN
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    # second stage: reload weights, fine-tune embeddings with the top layers frozen
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=name,
                       train_embed=True, train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_main_pe():
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #     'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #     dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    xe_char = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe_char = np.array(xe_char)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xterm, xfeat, xe_char, xe_term], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8,  # +37
                        name='hybridmodel_PE.h5', PE=True)
    print('feat shape', xfeat.shape)
    print('train')
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_pe(use_char=False):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    if use_char:
        vocab_dict = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/char_embed.pkl')
        MAX_LEN = 600
        name = 'gateddeepcnnmodel_PE.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab_dict = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(
            vocab_dict,
            'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
            dump_path='data/term_embed.pkl')
        MAX_LEN = 300
        name = 'gateddeepcnnmodel_term_PE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = GatedDeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name, PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = GatedDeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=MAX_LEN, name=name,
                              PE=True, train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.term_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    print('load embed done.')
    x_tn, y_tn, x_ts, y_ts = training_utils.split(xterm, y, shuffle=True)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=True)
    print('train')
    print('define model')
    model = TermModel(embed_matrix=embed_matrix)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_cv(cv_index, cv_num):
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'hybriddensemodel_cv{}.h5'.format(cv_index)
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x, xe, xterm, xe_term, xfeat, xt], y, cv_index=cv_index, cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
def fit(self, X_train, y_train):
    self.X_train = X_train
    self.y_train = y_train
    y_test_list = []
    y_pred_list = []
    for i in range(self.n_fold):
        print('cross ', i + 1, '/', self.n_fold)
        X_train_s, y_train_s, X_test_s, y_test_s = training_utils.split_cv(
            X_train, y_train, cv_num=self.n_fold, cv_index=i)
        X_train_s, y_train_s, X_val_s, y_val_s = training_utils.split(
            X_train_s, y_train_s, split_ratio=0.95)
        y_pred_s = None
        for models in self.base_models:
            model = models[i]
            print(model.name)
            if self.is_condition:
                model.train_exp(X_train_s, y_train_s, X_val_s, y_val_s, X_test_s, y_test_s)
            else:
                model.train(X_train_s, y_train_s, X_val_s, y_val_s, X_test_s, y_test_s)
            # use the one-hot prediction as stacking features
            if y_pred_s is None:
                y_pred_s = model.predict(X_test_s)
            else:
                y_pred_s = np.hstack((y_pred_s, model.predict(X_test_s)))
        y_test_list.append(y_test_s)
        y_pred_list.append(y_pred_s)
    # train the top model again, with y_pred_list as features and y_test_list as targets
    X_top = np.vstack(y_pred_list)
    y_top = np.vstack(y_test_list)
    y_top = training_utils.convert_y(y_top)
    if self.is_condition:
        X_top = np.squeeze(X_top, axis=-1)
    print(X_top.shape, y_top.shape)
    self.top_model = LogisticRegression()
    self.top_model.fit(X_top, y_top)
def fit_tmp(self, X_train, y_train):
    # same as fit(), but assumes the base models are already trained: it only
    # collects their out-of-fold predictions and refits the top model
    self.X_train = X_train
    self.y_train = y_train
    y_test_list = []
    y_pred_list = []
    for i in range(self.n_fold):
        print('cross ', i + 1, '/', self.n_fold)
        X_train_s, y_train_s, X_test_s, y_test_s = training_utils.split_cv(
            X_train, y_train, cv_num=self.n_fold, cv_index=i)
        X_train_s, y_train_s, X_val_s, y_val_s = training_utils.split(
            X_train_s, y_train_s, split_ratio=0.9)
        y_pred_s = None
        for models in self.base_models:
            model = models[i]
            print(model.name)
            # use the one-hot prediction as stacking features
            if y_pred_s is None:
                y_pred_s = model.predict(X_test_s)
            else:
                y_pred_s = np.hstack((y_pred_s, model.predict(X_test_s)))
        y_test_list.append(y_test_s)
        y_pred_list.append(y_pred_s)
    # train the top model again, with y_pred_list as features and y_test_list as targets
    X_top = np.vstack(y_pred_list)
    if self.is_condition:
        X_top = np.squeeze(X_top, axis=-1)
    y_top = np.vstack(y_test_list)
    y_top = training_utils.convert_y(y_top)
    print(X_top.shape, y_top.shape)
    self.top_model = LogisticRegression()
    self.top_model.fit(X_top, y_top)
    joblib.dump(self.top_model, self.name)
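# NOTE: a minimal sketch of the inference step that pairs with fit() above.
# It assumes each base-model family's n_fold copies are averaged before
# stacking and that the LogisticRegression top model returns class
# probabilities; the real predict() in this class may differ in both respects.
def predict_sketch(self, X):
    feats = None
    for models in self.base_models:
        # average this family's per-fold predictions
        y_pred = np.mean([m.predict(X) for m in models], axis=0)
        feats = y_pred if feats is None else np.hstack((feats, y_pred))
    if self.is_condition:
        feats = np.squeeze(feats, axis=-1)  # drop the trailing axis, as in fit()
    return self.top_model.predict_proba(feats)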
if len(sys.argv) > 2 and sys.argv[2] == 'pe':
    print('define char model with position embedding')
    print('load data')
    import data_utils, training_utils
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    model = RCNNModel(MAX_LEN=600, name='RCNNmodel_char_PE.h5',
                      embed_matrix=char_embed_matrix, PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
else:
    print('define char model')
    print('load data')
    import data_utils, training_utils
    data_dict = data_utils.pickle_load(tn_conf.char_file)
def train_model_peoe():
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # our own char embeddings, trained with several window sizes (skip-gram only)
    char_embeds = []
    char_embed_matrix_oe = data_utils.load_our_embedding(vocab_dict)
    char_embeds.append(char_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        embed_file = 'data/char_embed_{}_{}.model'.format(windows, sg)
        char_embed_tmp = data_utils.load_our_embedding(
            vocab_dict, model_file=embed_file,
            dump_path='data/our_char_embed_{}_{}.pkl'.format(windows, sg))
        char_embeds.append(char_embed_tmp)
    # our own term embeddings, same setup
    term_embeds = []
    term_embed_matrix_oe = data_utils.load_our_embedding(
        term_vocab_dict, model_file='data/term_embed.model',
        dump_path='data/our_term_embed.pkl')
    term_embeds.append(term_embed_matrix_oe)
    for windows in [3, 5, 8]:
        sg = 1
        embed_file = 'data/term_embed_{}_{}.model'.format(windows, sg)
        term_embed_tmp = data_utils.load_our_embedding(
            term_vocab_dict, model_file=embed_file,
            dump_path='data/our_term_embed_{}_{}.pkl'.format(windows, sg))
        term_embeds.append(term_embed_tmp)
    MAX_LEN_TERM = 300
    name = 'hybriddpcnnmodel_PEOE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # with the extra embedding models, the learning rate has to be lowered
    # for training to converge
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             char_embeds=char_embeds, term_embeds=term_embeds,
                             lr=0.0004)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001,
                             char_embeds=char_embeds, term_embeds=term_embeds)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_main():
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #     'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #     dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # +37
    print('feat shape', xfeat.shape)
    import sys
    if len(sys.argv) <= 1 or sys.argv[1] == 'train':
        print('train')
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    if len(sys.argv) > 1 and sys.argv[1] == 'val':
        val_conf = ValidConfigure()
        data_dict = data_utils.pickle_load(val_conf.char_file)
        y = to_categorical(data_dict['y'])
        x = data_dict['x']
        ids = data_dict['id']
        xterm = data_utils.pickle_load(val_conf.term_file)
        xfeat = data_utils.pickle_load(val_conf.feat_file)
        xfeat = scaler.transform(xfeat)
        model.load_weights()
        model.test([x, xterm, xfeat], ids, val_conf.out_file)
    if len(sys.argv) > 1 and sys.argv[1] == 'error':
        start_index = y_tn.shape[0] + y_val.shape[0]
        texts = data_utils.load_all_text(tn_conf)
        model.load_weights()
        model.error_analysis(x_ts, y_ts, texts, start_index)
def stacking_main_condition():
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(600)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(300)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    xtopic = data_utils.pickle_load('data/lda_vec.pkl')
    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #     'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #     dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    # 3-fold variant:
    # name = 'model/stack_condition_model.pkl'
    # model_dir = 'model/stack/'
    # n_fold = 3
    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    stk_model = stacking(n_fold, name=name, is_condition=True)
    conf = conditionmodelbase.ModelConfigure()
    conf.PE = True
    stk_model.add_model(ConditionConvModel,
                        {"conf": conf, "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditionconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedConvModel,
                        {"conf": conf, "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiongatedconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedDeepCNNModel,
                        {"conf": conf, "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiongateddeepcnnmodel_PE.h5'})
    conf.lr = 0.0005
    stk_model.add_model(ConditionDPCNNModel,
                        {"conf": conf, "char_embed_matrix": char_embed_matrix,
                         "term_embed_matrix": term_embed_matrix,
                         "name": model_dir + 'conditiondpcnnmodel_PE.h5'})
    # subsample a small fraction for a quick test:
    # x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic],
    #                                               y, split_ratio=0.005, shuffle=False)
    # x_tn, y_tn, x_ts, y_ts = training_utils.split(x_tn, y_tn, shuffle=False)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.95)
    stk_model.fit(x_tn, y_tn)
    y_pred = stk_model.predict(x_ts)
    acc = accuracy_score(training_utils.convert_y(y_pred), training_utils.convert_y(y_ts))
    print(acc)
    cnf_matrix = confusion_matrix(training_utils.convert_y(y_pred),
                                  training_utils.convert_y(y_ts))
    print(cnf_matrix)
    stk_model.save()
def train_model_ftoe(model_conf, model_name='hybridconvmodel_FTOE.h5',
                     ModelClass=HybridModelBase,
                     char_embed_file=None, term_embed_file=None):
    print(model_name)
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    # load the caller-supplied embedding files; the dump path strips the
    # leading 'data/' from the embedding file name
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict, char_embed_file,
        dump_path='data/{}.pkl'.format(char_embed_file[5:]))
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, term_embed_file,
        dump_path='data/{}.pkl'.format(term_embed_file[5:]))
    MAX_LEN_TERM = 300
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, split_ratio=0.95, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, split_ratio=0.95,
                                                    shuffle=False)
    print('train')
    print('define model')
    # first stage: train the top layers with the embeddings frozen
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=model_name,
                       train_embed=False, train_top=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    # second stage: halve the learning rate and fine-tune the embeddings
    model_conf.lr *= 0.5
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=model_name,
                       train_embed=True, train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
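# NOTE: a hedged usage sketch for train_model_ftoe. The config class and the
# embedding file names below are illustrative assumptions, not taken from the
# original code; only the 'data/' prefix is required, since the function
# slices it off when building the dump path.
if __name__ == '__main__':
    conf = ModelConfigure()  # assumed: the configure class used by ModelClass
    conf.PE = True
    train_model_ftoe(conf,
                     model_name='hybridconvmodel_FTOE.h5',
                     ModelClass=HybridModelBase,
                     char_embed_file='data/ft_char_embed.vec',  # hypothetical file
                     term_embed_file='data/ft_term_embed.vec')  # hypothetical file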