def train_model_pe(use_char=True):
    """Train a ConditionModel with position-encoding (PE) inputs.

    use_char selects char-level (True) vs term-level (False) inputs;
    pretrained sgns embeddings are loaded, a position-index matrix is
    built, and the model is trained on a train/val/test split.
    """
    print('define model')
    print('load data')
    import data_utils, training_utils
    tn_conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load('data.dict')
    y = to_categorical(data_dict['y'])
    embed_src = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    if use_char:
        vocab = data_utils.pickle_load(tn_conf.char_dict)
        embed_matrix = data_utils.load_embedding(vocab, embed_src,
                                                 dump_path='data/char_embed.pkl')
        x = data_dict['x']
        seq_len = 600
        weights_name = 'conditionmodel_PE.h5'
    else:
        x = data_utils.pickle_load(tn_conf.term_file)
        vocab = data_utils.pickle_load(tn_conf.term_dict)
        embed_matrix = data_utils.load_embedding(vocab, embed_src,
                                                 dump_path='data/term_embed.pkl')
        seq_len = 300
        weights_name = 'conditionmodel_term_PE.h5'
    # position-index matrix: one row [0..seq_len) per sample
    xe = np.array([list(range(seq_len)) for _ in range(y.shape[0])])
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    model = ConditionModel(embed_matrix=embed_matrix, MAX_LEN=seq_len,
                           name=weights_name, PE=True)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model(use_char=False):
    """Train a DeepCNNModel on char-level (use_char=True) or term-level inputs."""
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    embed_src = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    if use_char:
        vocab = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(vocab, embed_src,
                                                 dump_path='data/char_embed.pkl')
        seq_len = 600
        weights_name = 'deepcnn_model.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(vocab, embed_src,
                                                 dump_path='data/term_embed.pkl')
        seq_len = 300
        weights_name = 'deepcnn_model_term.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    x_tn, y_tn, x_ts, y_ts = training_utils.split(x, y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = DeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=seq_len,
                         name=weights_name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_pe():
    """Train SSCharModel (char-level) with position encoding in two phases:
    first with frozen embeddings, then fine-tuning them from saved weights.
    """
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab = data_utils.pickle_load(conf.char_dict)
    embed_matrix = data_utils.load_embedding(
        vocab,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    # position indices 0..599 for every sample
    xe = np.array([list(range(600)) for _ in range(y.shape[0])])
    x_tn, y_tn, x_ts, y_ts = training_utils.split([data_dict['x'], xe], y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # phase 1: embeddings frozen
    model = SSCharModel(embed_matrix=embed_matrix, name='sscharmodel_PE.h5',
                        PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    # phase 2: reload best weights and fine-tune the embedding layer
    model = SSCharModel(embed_matrix=embed_matrix, name='sscharmodel_PE.h5',
                        PE=True, train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model(use_char=True):
    """Train ConditionConvModel on char + term + handcrafted-feature + LDA inputs.

    Two phases: (1) train with embeddings frozen; (2) reload the saved weights
    and fine-tune the embedding layers with the top frozen (lr=0.001).

    Note: `use_char` is kept for interface compatibility but is not read by
    this routine — both char and term inputs are always loaded.
    (Fix: removed the unused local MAX_LEN_TERM.)
    """
    print('train condition conv model.\nload data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    name = 'conditionconvmodel.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')  # LDA topic vectors
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalize the handcrafted features and persist the scaler for inference
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat, xt], y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # phase 1: frozen embeddings
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN, NUM_FEAT=8, name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    # phase 2: fine-tune embeddings, top frozen, reduced learning rate
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix,
                               term_embed_matrix=term_embed_matrix,
                               MAX_LEN=MAX_LEN, NUM_FEAT=8, name=name,
                               train_embed=True, train_top=False, lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_n200(model_conf, name, ModelClass):
    """Train ModelClass on the n200 data configuration (data_utils200).

    Uses conf.MAX_LEN for both the char and term position encodings.
    Two phases: frozen embeddings first, then fine-tune the embeddings
    with the top frozen, resuming from the saved weights.
    """
    print(name)
    import data_utils200 as data_utils
    # BUG FIX: training_utils is used below but was never imported here.
    # Sibling train_* functions import it locally, so it is presumably not
    # available at module level — TODO confirm against the file header.
    import training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/char_embed.pkl')
    MAX_LEN = conf.MAX_LEN
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = conf.MAX_LEN
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')  # LDA topic vectors
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalize the handcrafted features and persist the scaler for inference
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    # position-index matrices for the PE inputs
    xe = [[i for i in range(MAX_LEN)] for _ in range(y.shape[0])]
    xe = np.array(xe)
    xe_term = [[i for i in range(MAX_LEN_TERM)] for _ in range(y.shape[0])]
    xe_term = np.array(xe_term)
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # phase 1: frozen embeddings
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=name)
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    # phase 2: fine-tune embeddings with the top frozen
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=name,
                       train_embed=True, train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_cv(cv_index, cv_num):
    """Train HybridDenseModel on one cross-validation fold (cv_index of cv_num)."""
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    embed_src = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    char_embed_matrix = data_utils.load_embedding(vocab_dict, embed_src,
                                                  dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict, embed_src,
                                                  dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    name = 'hybriddensemodel_cv{}.h5'.format(cv_index)
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalize the handcrafted features; keep the scaler for inference
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    # position-index matrices for the PE inputs
    xe = np.array([list(range(MAX_LEN)) for _ in range(y.shape[0])])
    xe_term = np.array([list(range(MAX_LEN_TERM)) for _ in range(y.shape[0])])
    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x, xe, xterm, xe_term, xfeat, xt], y,
        cv_index=cv_index, cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # phase 1: frozen embeddings
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    # phase 2: fine-tune embeddings, top frozen, reduced learning rate
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
def train_model_pe(use_char=False):
    """Train GatedDeepCNNModel with position encoding, two phases:
    frozen embeddings first, then fine-tuning from the saved weights.

    use_char selects char-level (True) vs term-level (False) inputs.
    """
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    embed_src = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    if use_char:
        vocab = data_utils.pickle_load(conf.char_dict)
        embed_matrix = data_utils.load_embedding(vocab, embed_src,
                                                 dump_path='data/char_embed.pkl')
        seq_len = 600
        weights_name = 'gateddeepcnnmodel_PE.h5'
        x = data_dict['x']
    else:
        x = data_utils.pickle_load(conf.term_file)
        vocab = data_utils.pickle_load(conf.term_dict)
        embed_matrix = data_utils.load_embedding(vocab, embed_src,
                                                 dump_path='data/term_embed.pkl')
        seq_len = 300
        weights_name = 'gateddeepcnnmodel_term_PE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    # position-index matrix: one row [0..seq_len) per sample
    xe = np.array([list(range(seq_len)) for _ in range(y.shape[0])])
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    model = GatedDeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=seq_len,
                              name=weights_name, PE=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    model = GatedDeepCNNModel(embed_matrix=embed_matrix, MAX_LEN=seq_len,
                              name=weights_name, PE=True, train_embed=True)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model():
    """Train TermModel on term-level inputs with shuffled train/val/test splits.

    Labels come from the char-file pickle; inputs are the term sequences.
    (Fix: removed the dead local `x = data_dict['x']` — the char sequences
    were loaded but never used.)
    """
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    y = to_categorical(data_dict['y'])
    xterm = data_utils.pickle_load(conf.term_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.term_dict)
    embed_matrix = data_utils.load_embedding(
        vocab_dict,
        'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
        dump_path='data/term_embed.pkl')
    print('load embed done.')
    x_tn, y_tn, x_ts, y_ts = training_utils.split(xterm, y, shuffle=True)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=True)
    print('train')
    print('define model')
    model = TermModel(embed_matrix=embed_matrix)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_peoe():
    """Train HybridDPCNNModel with position encoding (PE) plus multiple
    self-trained "our embedding" (OE) matrices for both chars and terms.

    Two phases: frozen embeddings with a reduced learning rate, then
    fine-tuning the embeddings with the top frozen.
    """
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    embed_src = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    char_embed_matrix = data_utils.load_embedding(vocab_dict, embed_src,
                                                  dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict, embed_src,
                                                  dump_path='data/term_embed.pkl')
    # self-trained char embeddings: default model plus window sizes 3/5/8
    # (sg=1 only; the sg in {0,1} sweep was abandoned)
    sg = 1
    char_embeds = [data_utils.load_our_embedding(vocab_dict)]
    for windows in [3, 5, 8]:
        char_embeds.append(data_utils.load_our_embedding(
            vocab_dict,
            model_file='data/char_embed_{}_{}.model'.format(windows, sg),
            dump_path='data/our_char_embed_{}_{}.pkl'.format(windows, sg)))
    # self-trained term embeddings, same scheme
    term_embeds = [data_utils.load_our_embedding(
        term_vocab_dict, model_file='data/term_embed.model',
        dump_path='data/our_term_embed.pkl')]
    for windows in [3, 5, 8]:
        term_embeds.append(data_utils.load_our_embedding(
            term_vocab_dict,
            model_file='data/term_embed_{}_{}.model'.format(windows, sg),
            dump_path='data/our_term_embed_{}_{}.pkl'.format(windows, sg)))
    MAX_LEN_TERM = 300
    name = 'hybriddpcnnmodel_PEOE.h5'
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalize the handcrafted features; keep the scaler for inference
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    # position-index matrices for the PE inputs
    xe = np.array([list(range(MAX_LEN)) for _ in range(y.shape[0])])
    xe_term = np.array([list(range(MAX_LEN_TERM)) for _ in range(y.shape[0])])
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('train')
    print('define model')
    # with the extra embedding models the learning rate must be lowered
    # for training to converge
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             char_embeds=char_embeds, term_embeds=term_embeds,
                             lr=0.0004)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    # phase 2: fine-tune embeddings, top frozen
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix,
                             term_embed_matrix=term_embed_matrix,
                             MAX_LEN=MAX_LEN, NUM_FEAT=8, PE=True, name=name,
                             train_embed=True, train_top=False, lr=0.001,
                             char_embeds=char_embeds, term_embeds=term_embeds)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_ftoe(model_conf, model_name='hybridconvmodel_FTOE.h5',
                     ModelClass=HybridModelBase,
                     char_embed_file=None, term_embed_file=None):
    """Train ModelClass with caller-supplied embedding files (FTOE variant).

    NOTE(review): char_embed_file / term_embed_file are sliced with [5:] to
    build the dump path, which assumes they start with 'data/'; the None
    defaults would raise here — confirm all callers pass real paths.
    Uses a 0.95 split ratio and trains in two phases: frozen embeddings
    first, then fine-tuning the embeddings at half the learning rate with
    the top frozen.
    """
    print(model_name)
    print('load data')
    import data_utils, training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)
    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict, char_embed_file,
        dump_path='data/{}.pkl'.format(char_embed_file[5:]))
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, term_embed_file,
        dump_path='data/{}.pkl'.format(term_embed_file[5:]))
    MAX_LEN_TERM = 300
    print('load embed done.')
    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    # normalize the handcrafted features; keep the scaler for inference
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    # position-index matrices for the PE inputs
    xe = np.array([list(range(MAX_LEN)) for _ in range(y.shape[0])])
    xe_term = np.array([list(range(MAX_LEN_TERM)) for _ in range(y.shape[0])])
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, split_ratio=0.95, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn,
                                                    split_ratio=0.95,
                                                    shuffle=False)
    print('train')
    print('define model')
    # phase 1: frozen embeddings, trainable top
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=model_name,
                       train_embed=False, train_top=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model
    # phase 2: halve the lr, fine-tune embeddings with the top frozen
    model_conf.lr *= 0.5
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=model_name,
                       train_embed=True, train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model