def train_model_peoe():
    """Train ``HybridDPCNNModel`` with position inputs and extra embeddings.

    Steps performed, in order:

    * load the pickled char/term inputs and the pretrained SGNS embedding
      matrices;
    * load four additional char and four additional term embedding matrices
      through ``data_utils.load_our_embedding`` (one default model plus
      windows 3, 5 and 8 with ``sg = 1``);
    * min-max scale the feature matrix and pickle the fitted scaler to
      ``conf.feat_norm``;
    * build position-index matrices (``0 .. MAX_LEN - 1`` per sample);
    * split train/test and then train/validation without shuffling;
    * train the model with ``lr=0.0004``, then build a second instance with
      ``train_embed=True, train_top=False, lr=0.001``, call
      ``load_weights()`` on it and train again.
    """
    print('load data')
    import data_utils
    import training_utils
    cfg = data_utils.TrainConfigure()
    char_data = data_utils.pickle_load(cfg.char_file)

    print('loading embed ...')
    sgns_path = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    char_vocab = data_utils.pickle_load(cfg.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab, sgns_path, dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = char_data['x']
    xterm = data_utils.pickle_load(cfg.term_file)
    term_vocab = data_utils.pickle_load(cfg.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab, sgns_path, dump_path='data/term_embed.pkl')

    # Extra embeddings: the default model first, then one per window size.
    sg = 1
    window_sizes = (3, 5, 8)
    char_embeds = [data_utils.load_our_embedding(char_vocab)]
    char_embeds.extend(
        data_utils.load_our_embedding(
            char_vocab,
            model_file='data/char_embed_{}_{}.model'.format(win, sg),
            dump_path='data/our_char_embed_{}_{}.pkl'.format(win, sg))
        for win in window_sizes)

    term_embeds = [
        data_utils.load_our_embedding(
            term_vocab,
            model_file='data/term_embed.model',
            dump_path='data/our_term_embed.pkl')
    ]
    term_embeds.extend(
        data_utils.load_our_embedding(
            term_vocab,
            model_file='data/term_embed_{}_{}.model'.format(win, sg),
            dump_path='data/our_term_embed_{}_{}.pkl'.format(win, sg))
        for win in window_sizes)

    MAX_LEN_TERM = 300
    weights_name = 'hybriddpcnnmodel_PEOE.h5'
    print('load embed done.')

    y = to_categorical(char_data['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(cfg.feat_file)

    # Normalise the features and persist the fitted scaler.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, cfg.feat_norm)
    xfeat = scaler.transform(xfeat)

    # One row of position indices per sample.
    num_samples = y.shape[0]
    xe = np.array([list(range(MAX_LEN)) for _ in range(num_samples)])
    xe_term = np.array([list(range(MAX_LEN_TERM)) for _ in range(num_samples)])

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)

    print('train')
    print('define model')
    shared_kwargs = dict(
        char_embed_matrix=char_embed_matrix,
        term_embed_matrix=term_embed_matrix,
        MAX_LEN=MAX_LEN,
        NUM_FEAT=8,
        PE=True,
        name=weights_name,
        char_embeds=char_embeds,
        term_embeds=term_embeds,
    )
    # With more embedding models added, the learning rate has to be lowered
    # for training to work properly.
    model = HybridDPCNNModel(lr=0.0004, **shared_kwargs)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second stage: new instance with train_embed=True / train_top=False.
    model = HybridDPCNNModel(train_embed=True, train_top=False, lr=0.001,
                             **shared_kwargs)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
if __name__ == '__main__':
    # Script entry: ``<script> char pe`` trains the character-level RCNN
    # model with position embeddings.
    import sys
    tn_conf = TrainConfigure()
    if sys.argv[1:2] == ['char']:
        if sys.argv[2:3] == ['pe']:
            print('define char model with position embedding')
            print('load data')
            import data_utils
            import training_utils
            data_dict = data_utils.pickle_load(tn_conf.char_file)
            y = to_categorical(data_dict['y'])
            char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
            char_embed_matrix = data_utils.load_embedding(
                char_vocab_dict,
                'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                dump_path='data/char_embed.pkl')
            # One row of position indices 0..599 per sample.
            xe = np.array([list(range(600)) for _ in range(y.shape[0])])
            # Hold out the test split first, then a validation split; no shuffling.
            x_tn, y_tn, x_ts, y_ts = training_utils.split(
                [data_dict['x'], xe], y, shuffle=False)
            x_tn, y_tn, x_val, y_val = training_utils.split(
                x_tn, y_tn, shuffle=False)
            print('train')
            model = RCNNModel(MAX_LEN=600,
                              name='RCNNmodel_char_PE.h5',
                              embed_matrix=char_embed_matrix,
                              PE=True)
            model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
# Sequence lengths for the train / validation / test token lists.
train_sq_len, valid_sq_len, test_sq_len = (
    get_sequence_length(split_tokens)
    for split_tokens in (train_tokens, valid_tokens, test_tokens))

# Build the vocabulary from the training tokens only.
wordlist = itertools.chain.from_iterable(train_tokens)
word_index, _ = build_vocab(wordlist)

# Dependency-based embedding.
dep_embedding_path = "dep_embedding/deps.contexts"
dep_embedding_index = load_embedding(dep_embedding_path)
dep_embedding_matrix = get_embedding_matrix(
    word_index, dep_embedding_index, FLAGS.word_embedding_size)
print("finish loading dep embedding")

# fastText embedding.
fast_embedding_path = "fast-text/wiki.simple.vec"
fast_embedding_index = load_embedding(fast_embedding_path)
fast_embedding_matrix = get_embedding_matrix(
    word_index, fast_embedding_index, FLAGS.word_embedding_size)
print("finish loading fast embedding")

# GloVe embedding; the file name depends on the configured dimension.
embedding_path = f"glove.6B/glove.6B.{FLAGS.word_embedding_size}d.txt"
embedding_index = load_embedding(embedding_path)
embedding_matrix = get_embedding_matrix(
    word_index, embedding_index, FLAGS.word_embedding_size)
print("finish loading linear embedding")

vocab_size = len(word_index)
def main(args):
    """Train and/or test the community-QA model described by ``args``.

    The function loads the dev and train data, builds the word/POS/char
    vocabularies (their sizes are written back onto ``args``), loads the
    pretrained embeddings, pads the datasets and creates a ``QASystem``
    inside a ``tf.Session``.

    * ``args.bool_train``: run ``args.num_epochs`` epochs over the training
      batches, printing loss/accuracy/APK per batch and saving the model
      while ``args.model_count`` is non-zero.
    * ``args.bool_test``: score every entry of the test set one sample at a
      time and report the maximum and minimum APK over all keys.

    Fix over the previous version: ``min_apk`` was never initialised and
    ``max_apk`` was only initialised by the training branch, so the test
    phase raised ``NameError`` (always for ``min_apk``).  Both are now
    initialised when testing starts, which also keeps the training maximum
    out of the test report.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting below is reconstructed - in particular the
    placement of the ``model.save()`` check inside the batch loop should be
    confirmed.
    """
    # dev data
    result_dev = data_utils.load_data(args.valid_data_file)
    print('result dev size : %s ' % len(result_dev))
    datasets_dev = data_utils.connect_metadata(result_dev)
    print('datasets dev size : %s ' % len(datasets_dev))
    datasets_pos_dev = data_utils.sperate_data(datasets_dev)

    # train data
    result = data_utils.load_data(args.train_data_file)
    print('result size : %s ' % len(result))
    datasets = data_utils.connect_metadata(result)
    print('dataset size : %s ' % len(datasets))
    datasets_pos = data_utils.sperate_data(datasets)

    word_vocab_list, pos_vocab_list, char_vocab_list = data_utils.creat_vocab(
        args)
    args.word_vocab_size = len(word_vocab_list)
    print('word size: %s' % len(word_vocab_list))
    args.pos_vocab_size = len(pos_vocab_list)
    print('pos size: %s ' % len(pos_vocab_list))
    args.char_vocab_size = len(char_vocab_list)
    print('char size: %s ' % len(char_vocab_list))

    word2id = data_utils.get_vocab2id(word_vocab_list)
    pos2id = data_utils.get_vocab2id(pos_vocab_list)
    char2id = data_utils.get_vocab2id(char_vocab_list)

    embed_word2id = data_utils.get_embed_word(args)
    print('embedding_word size: %s' % len(embed_word2id.keys()))
    pretrained_embedding = data_utils.load_embedding(args, word2id)
    pretrained_char_embedding = data_utils.load_embedding(args, char2id)

    train_datasets = data_utils.make_pading(datasets_pos, word2id, pos2id,
                                            char2id, args.max_length,
                                            args.max_topic_length,
                                            args.max_word_length)
    dev_datasets = data_utils.make_pading(datasets_pos_dev, word2id, pos2id,
                                          char2id, args.max_length,
                                          args.max_topic_length,
                                          args.max_word_length)

    with tf.Session() as sess:
        model = Triangulation_Approach_Community_QA.QASystem(
            sess,
            pretrained_embedding,
            pretrained_char_embedding,
            init_word_embed=True,
            args=args)
        sess.run(tf.global_variables_initializer())

        if args.bool_train:
            print("-" * 20 +
                  " Start Training: %s " % datetime.datetime.now() + "-" * 20)
            start_time = time.time()
            max_apk = 0
            dev_datasets_batchs = data_utils.batch_iter(
                dev_datasets, args.batch_size)
            print('the dev data sets batch is : %s' %
                  len(dev_datasets_batchs))
            for epoch in range(args.num_epochs):
                for batch in data_utils.batch_iter(train_datasets,
                                                   args.batch_size):
                    # Batch layout: eight groups of five slots
                    # (tokens, pos, char, length, char_length) in the order
                    # question_topic (0-4), question (5-9),
                    # question_1_topic (10-14), question_1 (15-19),
                    # comment_1 (20-24), question_2_topic (25-29),
                    # question_2 (30-34), comment_2 (35-39); slot 40 is the
                    # label.  Only the slots the model consumes are read.
                    question_topic = batch[0]
                    (question, question_pos, question_char, question_length,
                     question_char_length) = (batch[5], batch[6], batch[7],
                                              batch[8], batch[9])
                    question_1_topic = batch[10]
                    (question_1, question_1_pos, question_1_char,
                     question_1_length, question_1_char_length) = (
                         batch[15], batch[16], batch[17], batch[18],
                         batch[19])
                    (comment_1, comment_1_pos, comment_1_char,
                     comment_1_length, comment_1_char_length) = (
                         batch[20], batch[21], batch[22], batch[23],
                         batch[24])
                    question_2_topic = batch[25]
                    (question_2, question_2_pos, question_2_char,
                     question_2_length, question_2_char_length) = (
                         batch[30], batch[31], batch[32], batch[33],
                         batch[34])
                    (comment_2, comment_2_pos, comment_2_char,
                     comment_2_length, comment_2_char_length) = (
                         batch[35], batch[36], batch[37], batch[38],
                         batch[39])
                    label = batch[40]

                    # Argument order follows
                    # train(q, q_pos, q_char, q_topic, q_l, q_char_l,
                    #       q1..., c1..., q2..., c2..., label, keep_prob).
                    # Each comment is fed the topic of its own question.
                    loss, acc, global_step, pred, logits = model.train(
                        np.array(question), np.array(question_pos),
                        np.array(question_char), np.array(question_topic),
                        question_length, question_char_length,
                        np.array(question_1), np.array(question_1_pos),
                        np.array(question_1_char),
                        np.array(question_1_topic), question_1_length,
                        question_1_char_length,
                        np.array(comment_1), np.array(comment_1_pos),
                        np.array(comment_1_char),
                        np.array(question_1_topic), comment_1_length,
                        comment_1_char_length,
                        np.array(question_2), np.array(question_2_pos),
                        np.array(question_2_char),
                        np.array(question_2_topic), question_2_length,
                        question_2_char_length,
                        np.array(comment_2), np.array(comment_2_pos),
                        np.array(comment_2_char),
                        np.array(question_2_topic), comment_2_length,
                        comment_2_char_length,
                        label, args.keep_prob)
                    print(
                        "| Epoch: {:2d}".format(epoch),
                        "| Step: {:4d}".format(global_step),
                        "| Time: {:3d}s".format(
                            int(time.time() - start_time)),
                        "| Train Loss: {:.4f}".format(loss),
                        "| Train Acc: {:.4f}".format(acc))

                    # Print labels and predictions for a quick visual check.
                    pred = list(pred)
                    print("Label: {}".format(label))
                    print("Pred: {}".format(pred))
                    pred, label = merge_sort_part(logits, pred, label)
                    score = apk(label, pred)
                    if score > max_apk:
                        max_apk = score
                    print('the score is : %s , and the max apk is : %s ' %
                          (score, max_apk))
                    print('-' * 20)

                    # Save at most args.model_count checkpoints, only after
                    # epoch 50 and above the accuracy threshold.
                    if (acc > args.beach_mark and args.model_count
                            and epoch > 50):
                        model.save()
                        args.model_count -= 1
            print("-" * 20 +
                  " Finished Training. %s " % datetime.datetime.now() +
                  "-" * 20)

        if args.bool_test:
            test_data_result = data_utils.load_data(args.test_data_file_1)
            print('test data result size : %s' % len(test_data_result))
            test_data_datasets_pos = data_utils.sperate_test_data(
                test_data_result)
            test_datasets = data_utils.make_test_padding(
                test_data_datasets_pos, word2id, pos2id, char2id,
                args.max_length, args.max_topic_length,
                args.max_word_length)
            print("-" * 20 +
                  " Start Testing: %s " % datetime.datetime.now() + "-" * 20)
            start_time = time.time()

            # Test-phase statistics start fresh so they neither depend on
            # nor inherit anything from the training branch.
            max_apk = 0
            min_apk = float('inf')

            # All-zero stand-ins for the second (question, comment) pair,
            # which test samples do not provide.
            NEG = [[0] * args.max_length]
            NEG_char = [[0] * args.max_length * 10]
            NEG_topic = [[0] * args.max_topic_length]

            for key, samples in test_datasets.items():
                length = len(samples)
                logits_list, pred_list, label_list = [], [], []
                for sample in samples:
                    # Sample layout matches the first 25 training slots;
                    # slot 25 is the label.  Each field is wrapped in a list
                    # to form a batch of one.
                    question_topic = [sample[0]]
                    question = [sample[5]]
                    question_pos = [sample[6]]
                    question_char = [sample[7]]
                    question_length = [sample[8]]
                    question_char_length = [sample[9]]
                    question_1_topic = [sample[10]]
                    question_1 = [sample[15]]
                    question_1_pos = [sample[16]]
                    question_1_char = [sample[17]]
                    question_1_length = [sample[18]]
                    question_1_char_length = [sample[19]]
                    comment_1 = [sample[20]]
                    comment_1_pos = [sample[21]]
                    comment_1_char = [sample[22]]
                    comment_1_length = [sample[23]]
                    comment_1_char_length = [sample[24]]
                    label = [sample[25]]

                    loss, acc, pred, logits = model.test(
                        np.array(question), np.array(question_pos),
                        np.array(question_char), np.array(question_topic),
                        question_length, question_char_length,
                        np.array(question_1), np.array(question_1_pos),
                        np.array(question_1_char),
                        np.array(question_1_topic), question_1_length,
                        question_1_char_length,
                        np.array(comment_1), np.array(comment_1_pos),
                        np.array(comment_1_char),
                        np.array(question_1_topic), comment_1_length,
                        comment_1_char_length,
                        # question_2 and comment_2 are zero padding.
                        np.array(NEG), np.array(NEG), np.array(NEG_char),
                        np.array(NEG_topic), [0], [0],
                        np.array(NEG), np.array(NEG), np.array(NEG_char),
                        np.array(NEG_topic), [0], [0],
                        label, args.keep_prob)
                    print(
                        "| Time: {:3d}s".format(
                            int(time.time() - start_time)),
                        "| Test Loss: {:.4f}".format(loss),
                        "| Test Acc: {:.4f}".format(acc))

                    logits_list.extend(logits)
                    pred_list.extend(pred)
                    label_list.extend(label)
                    print("-" * 20)

                pred, label = merge_sort_part(logits_list, pred_list,
                                              label_list)
                score = apk(label, pred, length)
                if score > max_apk:
                    max_apk = score
                if score < min_apk:
                    min_apk = score
                print('the max apk is : %s and the min apk is : %s ' %
                      (max_apk, min_apk))
            print("-" * 20 +
                  " End Testing: %s " % datetime.datetime.now() + "-" * 20)
            print('max_apk:{}'.format(max_apk), 'min_apk:{}'.format(min_apk))
def train_main():
    """Train, validate or error-analyse ``HybridModel`` depending on argv.

    The training data, embeddings and model are always prepared.  The first
    command-line argument then selects the action:

    * missing or ``'train'``: train the model;
    * ``'val'``: load the validation set, scale its features with the
      scaler fitted on the training features, call ``load_weights()`` and
      write predictions via ``model.test``;
    * ``'error'``: call ``load_weights()`` and run ``model.error_analysis``
      on the held-out test split.
    """
    print('load data')
    import data_utils
    import training_utils
    tn_conf = TrainConfigure()
    train_data = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(train_data['y'])
    x = train_data['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)

    # Normalise the features and persist the fitted scaler.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    print('loading embed ...')
    sgns_path = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    term_vocab = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab, sgns_path, dump_path='data/term_embed.pkl')
    char_vocab = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab, sgns_path, dump_path='data/char_embed.pkl')
    print('load embed done.')

    # Hold out the test split first, then a validation split; no shuffling.
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat], y,
                                                  shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn,
                                                    shuffle=False)

    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix,
                        term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # +37
    print('feat shape', xfeat.shape)

    import sys
    # No argument behaves like 'train'.
    mode = sys.argv[1] if len(sys.argv) > 1 else 'train'

    if mode == 'train':
        print('train')
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    elif mode == 'val':
        val_conf = ValidConfigure()
        val_data = data_utils.pickle_load(val_conf.char_file)
        y = to_categorical(val_data['y'])
        val_x = val_data['x']
        ids = val_data['id']
        val_xterm = data_utils.pickle_load(val_conf.term_file)
        # Reuse the scaler fitted on the training features.
        val_xfeat = scaler.transform(
            data_utils.pickle_load(val_conf.feat_file))
        model.load_weights()
        model.test([val_x, val_xterm, val_xfeat], ids, val_conf.out_file)
    elif mode == 'error':
        # Offset of the test split: it follows the train and validation rows.
        start_index = y_tn.shape[0] + y_val.shape[0]
        texts = data_utils.load_all_text(tn_conf)
        model.load_weights()
        model.error_analysis(x_ts, y_ts, texts, start_index)
def train_model_cv(cv_index, cv_num):
    """Train ``ConditionConvModel`` with position inputs on one CV fold.

    ``cv_index`` and ``cv_num`` are forwarded to
    ``training_utils.split_cv`` to select the held-out fold; ``cv_index``
    is also embedded in the weight file name
    (``conditionconvmodel_cv<cv_index>.h5``).  The remaining data is split
    again (unshuffled) into train and validation parts, both are passed
    through ``model.gen_train`` and the model is trained twice: first with
    the default settings, then as a fresh instance with
    ``train_embed=True, train_top=False, lr=0.001`` after ``load_weights()``.
    """
    print('train condition conv model with PE\n load data')
    import data_utils
    import training_utils
    cfg = data_utils.TrainConfigure()
    char_data = data_utils.pickle_load(cfg.char_file)

    print('loading embed ...')
    sgns_path = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    char_vocab = data_utils.pickle_load(cfg.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab, sgns_path, dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = char_data['x']
    xterm = data_utils.pickle_load(cfg.term_file)
    term_vocab = data_utils.pickle_load(cfg.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab, sgns_path, dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    weights_name = 'conditionconvmodel_cv{}.h5'.format(cv_index)
    print('load embed done.')

    y = to_categorical(char_data['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(cfg.feat_file)

    # Normalise the features and persist the fitted scaler.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, cfg.feat_norm)
    xfeat = scaler.transform(xfeat)

    # One row of position indices per sample.
    num_samples = y.shape[0]
    xe = np.array([list(range(MAX_LEN)) for _ in range(num_samples)])
    xe_term = np.array([list(range(MAX_LEN_TERM)) for _ in range(num_samples)])

    x_tn, y_tn, x_ts, y_ts = training_utils.split_cv(
        [x, xe, xterm, xe_term, xfeat, xt],
        y,
        cv_index=cv_index,
        cv_num=cv_num)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)

    print('train')
    print('define model')
    shared_kwargs = dict(
        char_embed_matrix=char_embed_matrix,
        term_embed_matrix=term_embed_matrix,
        MAX_LEN=MAX_LEN,
        NUM_FEAT=8,
        PE=True,
        name=weights_name,
    )
    model = ConditionConvModel(**shared_kwargs)
    # gen_train output is reused unchanged for the second stage below.
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second stage: new instance with train_embed=True / train_top=False.
    model = ConditionConvModel(train_embed=True, train_top=False, lr=0.001,
                               **shared_kwargs)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def train_model_ftoe(model_conf,
                     name,
                     ModelClass,
                     char_embed_file=None,
                     term_embed_file=None):
    """Train ``ModelClass`` with caller-supplied char/term embedding files.

    Args:
        model_conf: configuration object passed as the first positional
            argument to ``ModelClass``.
        name: model name; printed and forwarded to ``ModelClass(name=...)``.
        ModelClass: model class to instantiate.  It must provide
            ``gen_train``, ``train`` and ``load_weights``.
        char_embed_file: path of the character embedding file.  Required.
        term_embed_file: path of the term embedding file.  Required.

    Raises:
        ValueError: if ``char_embed_file`` or ``term_embed_file`` is
            ``None``.  Both parameters default to ``None`` for signature
            compatibility, but the function cannot run without them;
            previously this surfaced as an opaque ``TypeError`` from
            slicing ``None``.

    The data is split 95/5 (unshuffled) into train/test and then again into
    train/validation.  The model is trained twice: first with the default
    settings, then as a fresh instance with ``train_embed=True,
    train_top=False`` after ``load_weights()``.
    """
    print(name)
    if char_embed_file is None or term_embed_file is None:
        raise ValueError(
            'train_model_ftoe requires both char_embed_file and '
            'term_embed_file')

    import data_utils
    import training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)

    print('loading embed ...')
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    # The cache file name drops the first five characters of the embedding
    # path - presumably a leading 'data/'; confirm against callers.
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict,
        char_embed_file,
        dump_path='data/{}.pkl'.format(char_embed_file[5:]))
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict,
        term_embed_file,
        dump_path='data/{}.pkl'.format(term_embed_file[5:]))
    MAX_LEN_TERM = 300
    print('load embed done.')

    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)

    # Normalise the features and persist the fitted scaler.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    # One row of position indices per sample.
    num_samples = y.shape[0]
    xe = np.array([list(range(MAX_LEN)) for _ in range(num_samples)])
    xe_term = np.array([list(range(MAX_LEN_TERM)) for _ in range(num_samples)])

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt],
        y,
        split_ratio=0.95,
        shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn,
                                                    y_tn,
                                                    split_ratio=0.95,
                                                    shuffle=False)

    print('train')
    print('define model')
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name)
    # gen_train output is reused unchanged for the second stage below.
    x_tn, y_tn = model.gen_train(x_tn, y_tn)
    x_val, y_val = model.gen_train(x_val, y_val)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second stage: new instance with train_embed=True / train_top=False.
    model = ModelClass(model_conf,
                       char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix,
                       name=name,
                       train_embed=True,
                       train_top=False)
    model.load_weights()
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def __init__(self,
             session,
             batch,
             vocab,
             load_embeddings=False,
             sentence_len=30,
             calculate_loss=True):
    """
    Sets up the graph of our RNN model.

    session         Tensorflow session object; only forwarded to
                    data_utils.load_embedding when load_embeddings is truthy
    batch           The batch of sentences (token ids); stored as
                    self.word_inputs and indexed as [:, i] below
    vocab           A dictionary mapping token strings to vocabulary IDs;
                    only forwarded to data_utils.load_embedding
    load_embeddings When truthy, data_utils.load_embedding is called for the
                    embedding matrix with FLAGS.path_embeddings; otherwise
                    the matrix keeps its random-uniform initial value
                    (default: False)
    sentence_len    The length of one sentence (default: 30)
    calculate_loss  True if the model is in training phase (default: True).
                    When True, per-step losses, next-word probabilities and
                    self.accuracy are built; when False one extra time step
                    is unrolled instead.

    NOTE(review): this block was recovered from a source whose indentation
    had been lost; the nesting of the name/variable scopes below is a
    reconstruction and should be confirmed against saved checkpoints.
    """
    self.sentence_len = sentence_len
    self.calculate_loss = calculate_loss
    self.word_inputs = batch
    # The initializer factory is stored but not called anywhere in this method.
    self.initializer = tf.contrib.layers.xavier_initializer

    if FLAGS.lstm_size > FLAGS.default_lstm_size:
        print(
            f"Running with downsize layer from {FLAGS.lstm_size} to {FLAGS.default_lstm_size}!"
        )

    with tf.name_scope("embedding"):
        # Trainable embedding matrix, initialised uniformly in [-1, 1).
        self.embedding_matrix = tf.get_variable(
            "embedding_matrix",
            initializer=tf.random_uniform(
                [FLAGS.vocab_size, FLAGS.embedding_dim], -1.0, 1.0),
            dtype=tf.float32,
            trainable=True)

        if load_embeddings:
            data_utils.load_embedding(session, vocab, self.embedding_matrix,
                                      FLAGS.path_embeddings,
                                      FLAGS.embedding_dim, FLAGS.vocab_size)

        self.embedded_words = tf.nn.embedding_lookup(
            self.embedding_matrix, self.word_inputs
        )  # DIM [batch_size, sentence_len, embedding_dim]

    with tf.name_scope("rnn"):
        # Stacked LSTM layers architecture, with 2 layers
        lstms = (tf.contrib.rnn.LSTMBlockCell(num_units=FLAGS.lstm_size,
                                              dtype=tf.float32),
                 tf.contrib.rnn.LSTMBlockCell(num_units=FLAGS.lstm_size,
                                              dtype=tf.float32))

        with tf.variable_scope('hidden_state'):
            # Placeholders holding the value of the hidden states
            # (default: zero matrix of shape [batch_size, lstm_size]).
            # They can be fed to continue from a previous state.
            self.lstm_c1 = tf.placeholder_with_default(
                np.zeros((FLAGS.batch_size, FLAGS.lstm_size),
                         dtype=np.float32),
                shape=[None, lstms[0].state_size.c],
                name='c1_in')
            self.lstm_h1 = tf.placeholder_with_default(
                np.zeros((FLAGS.batch_size, FLAGS.lstm_size),
                         dtype=np.float32),
                shape=[None, lstms[0].state_size.h],
                name='h1_in')
            self.lstm_c2 = tf.placeholder_with_default(
                np.zeros((FLAGS.batch_size, FLAGS.lstm_size),
                         dtype=np.float32),
                shape=[None, lstms[1].state_size.c],
                name='c2_in')
            self.lstm_h2 = tf.placeholder_with_default(
                np.zeros((FLAGS.batch_size, FLAGS.lstm_size),
                         dtype=np.float32),
                shape=[None, lstms[1].state_size.h],
                name='h2_in')

        state_in1 = tf.contrib.rnn.LSTMStateTuple(
            self.lstm_c1, self.lstm_h1)
        state_in2 = tf.contrib.rnn.LSTMStateTuple(
            self.lstm_c2, self.lstm_h2)
        # self.states[k] is overwritten at every time step with layer k's
        # newest state, so after construction it refers to the final state.
        self.states = [state_in1, state_in2]

        # Add a down size matrix if necessary
        if FLAGS.lstm_size > FLAGS.default_lstm_size:
            down_size = tf.get_variable(
                "down_size", [FLAGS.lstm_size, FLAGS.default_lstm_size])

        # Output projection to the vocabulary.
        # NOTE(review): W_h has default_lstm_size rows, but the projection
        # above only exists when lstm_size > default_lstm_size - confirm the
        # flags never set lstm_size < default_lstm_size.
        self.W_h = tf.get_variable(
            "W_h", [FLAGS.default_lstm_size, FLAGS.vocab_size],
            tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.b_h = tf.get_variable("b_h", [FLAGS.vocab_size],
                                   tf.float32,
                                   initializer=tf.zeros_initializer())

        self.predictions = []
        self.next_words_probs = []
        self.loss = 0.0

        # In prediction mode (calculate_loss == False) one more step is
        # unrolled, i.e. sentence_len steps instead of sentence_len - 1.
        extra_count_for_predict = int(self.calculate_loss == False)
        for i in range(self.sentence_len - 1 + extra_count_for_predict):
            # words = [batch_size, embeddings]
            # next_words_index = [batch_size]
            words = self.embedded_words[:, i, :]

            # First layer: input word is the actual word
            output, self.states[0] = lstms[0](
                words,
                self.states[0])  # presumably [batch_size, lstm_size] - TODO confirm

            # add dropout layer
            if FLAGS.enable_dropout:
                output = tf.nn.dropout(output, rate=1 - FLAGS.keep_prob)

            # Second layer: input is the output of the first layer
            output, self.states[1] = lstms[1](output, self.states[1])

            # add second dropout layer
            if FLAGS.enable_dropout:
                output = tf.nn.dropout(output, rate=1 - FLAGS.keep_prob)

            # Down-project if necessary
            if FLAGS.lstm_size > FLAGS.default_lstm_size:
                output = tf.matmul(output, down_size)

            # The output comes from the second layer.
            # self.logits is overwritten each step; after the loop it holds
            # the logits of the last time step only.
            self.logits = tf.matmul(
                output,
                self.W_h) + self.b_h  # logits = [batch_size, VOCABULARY_LEN]
            probabilities = tf.nn.softmax(
                self.logits, name="softmax_probs"
            )  # size = [batch_size, VOCABULARY_LENGTH]

            if self.calculate_loss:
                next_words_index = self.word_inputs[:, i + 1]
                # [0, word i + 1 from B0], ... , [63, word i + 1 from B63]
                # Uses FLAGS.batch_size, so the fed batch must have exactly
                # that many rows in this mode.
                indices_of_next_words = tf.stack(
                    [tf.range(FLAGS.batch_size), next_words_index], axis=1)
                # The probas of the words that should have been predicted
                next_word_prob = tf.gather_nd(probabilities,
                                              indices_of_next_words)
                self.next_words_probs.append(next_word_prob)

                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=next_words_index)
                # The total loss is the sum over time steps of the
                # batch-mean cross entropy.
                self.loss = tf.math.add(self.loss,
                                        tf.reduce_mean(losses),
                                        name="cross_entropy_loss")

            # Greedy prediction for this step; self.predicted_words is
            # overwritten each step, self.predictions keeps all of them.
            self.predicted_words = tf.argmax(self.logits,
                                             axis=1,
                                             name="predicted_words")
            self.predictions.append(self.predicted_words)

        if self.calculate_loss:
            self.next_words_probs = tf.stack(
                self.next_words_probs, axis=1,
                name="probs")  # [batch_size, sentence_length]

        # Not used afterwards in this method.
        words = self.word_inputs[:, 1:]  # Skipping <bos>

    # Calculating accuracy
    with tf.name_scope("accuracy"):
        # self.accuracy only exists when calculate_loss is True.
        if self.calculate_loss:
            # self.predictions is a list with one [batch] tensor per step;
            # the targets are transposed to the same step-major layout.
            correct_predictions = tf.equal(
                self.predictions,
                tf.transpose(tf.cast(self.word_inputs[:, 1:], tf.int64)))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   tf.float64),
                                           name="accuracy")
def stacking_main_condition():
    """Fit, evaluate and save the 5-fold stacking ensemble of condition models.

    Loads the training data and embeddings, registers four condition models
    (conv, gated conv, gated deep CNN and DPCNN, all with ``PE = True``) on
    a ``stacking`` instance, fits it on 95% of the data, prints accuracy and
    the confusion matrix on the remaining 5%, and saves the ensemble.

    Changes over the previous version:

    * The DPCNN model gets its own ``ModelConfigure`` instead of mutating
      the one already handed to the other three models.  The kwargs dicts
      hold the config by reference, so if ``stacking`` instantiates models
      after ``add_model`` returns, the old code would have applied
      ``lr = 0.0005`` to all four models.
    * ``accuracy_score`` / ``confusion_matrix`` are called as
      ``(y_true, y_pred)``, the scikit-learn convention, so the rows of the
      printed confusion matrix are the true classes.
    * The dead 3-fold ``name`` / ``model_dir`` / ``n_fold`` assignments that
      were immediately overwritten have been removed.
    """
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)

    # Normalise the features and persist the fitted scaler.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    # One row of position indices per sample (600 chars, 300 terms).
    num_samples = y.shape[0]
    xe = np.array([list(range(600)) for _ in range(num_samples)])
    xe_term = np.array([list(range(300)) for _ in range(num_samples)])
    xtopic = data_utils.pickle_load('data/lda_vec.pkl')

    print('loading embed ...')
    sgns_path = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, sgns_path, dump_path='data/term_embed.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict, sgns_path, dump_path='data/char_embed.pkl')
    print('load embed done.')

    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    stk_model = stacking(n_fold, name=name, is_condition=True)

    # Config shared by the three models that use the default learning rate.
    conf = conditionmodelbase.ModelConfigure()
    conf.PE = True
    for model_class, weights_file in (
            (ConditionConvModel, 'conditionconvmodel_PE.h5'),
            (ConditionGatedConvModel, 'conditiongatedconvmodel_PE.h5'),
            (ConditionGatedDeepCNNModel, 'conditiongateddeepcnnmodel_PE.h5'),
    ):
        stk_model.add_model(model_class, {
            "conf": conf,
            "char_embed_matrix": char_embed_matrix,
            "term_embed_matrix": term_embed_matrix,
            "name": model_dir + weights_file,
        })

    # Separate config for DPCNN so its lower learning rate cannot leak into
    # the models registered above.
    dpcnn_conf = conditionmodelbase.ModelConfigure()
    dpcnn_conf.PE = True
    dpcnn_conf.lr = 0.0005
    stk_model.add_model(ConditionDPCNNModel, {
        "conf": dpcnn_conf,
        "char_embed_matrix": char_embed_matrix,
        "term_embed_matrix": term_embed_matrix,
        "name": model_dir + 'conditiondpcnnmodel_PE.h5',
    })

    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xtopic], y, split_ratio=0.95)
    stk_model.fit(x_tn, y_tn)

    y_pred = stk_model.predict(x_ts)
    true_labels = training_utils.convert_y(y_ts)
    pred_labels = training_utils.convert_y(y_pred)
    acc = accuracy_score(true_labels, pred_labels)
    print(acc)
    cnf_matrix = confusion_matrix(true_labels, pred_labels)
    print(cnf_matrix)
    stk_model.save()
def predict(tn_conf, lda_file, val_conf, val_conf100, val_conf200):
    """Run every ensemble member on one data set and return their outputs.

    Original note: ensemble by probability.

    Args:
        tn_conf: Training configuration. Its ``feat_file`` is used to fit the
            feature scaler (which is pickled to ``feat_norm``); its
            ``term_dict`` / ``char_dict`` vocabularies are passed to
            ``data_utils.load_embedding``.
        lda_file: Path of the pickled topic vectors given to the models as
            their last input.
        val_conf: Configuration of the data to predict (``char_file``,
            ``term_file``, ``feat_file``).
        val_conf100: Configuration whose ``char_file`` / ``term_file`` feed
            the models built with ``MAX_LEN=100``.
        val_conf200: Currently unused, because every length-200 model is
            disabled. Kept so existing callers keep working.

    Returns:
        ``np.array`` of the per-model outputs, in the order the models ran.
    """
    print('load data')
    # Normalization: the scaler is fitted on the training features, pickled
    # to tn_conf.feat_norm, and later applied to the features being predicted.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(data_utils.pickle_load(tn_conf.feat_file))
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)

    xtopic = data_utils.pickle_load(lda_file)

    print('loading embed ...')
    # Disabled alternative for the term embedding: the word-word vectors
    # 'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5', dumped to
    # 'data/term_embed_ww.pkl'.
    embed_source = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, embed_source, dump_path='data/term_embed.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict, embed_source, dump_path='data/char_embed.pkl')
    print('load embed done.')

    data_dict = data_utils.pickle_load(val_conf.char_file)
    x = data_dict['x']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = scaler.transform(data_utils.pickle_load(val_conf.feat_file))
    print('feat shape', xfeat.shape)

    data_dict100 = data_utils.pickle_load(val_conf100.char_file)
    x100 = data_dict100['x']
    xterm100 = data_utils.pickle_load(val_conf100.term_file)

    # Position-index inputs: one row 0..L-1 per sample. They must have as
    # many rows as the data being predicted; the previous code sized the
    # 600/300 matrices from the training set instead.
    n_samples = len(x)

    def positions(length):
        return np.tile(np.arange(length), (n_samples, 1))

    xe, xe_term = positions(600), positions(300)
    xe100, xe_term100 = positions(100), positions(100)

    full_inputs = [x, xe, xterm, xe_term, xfeat, xtopic]
    short_inputs = [x100, xe100, xterm100, xe_term100, xfeat, xtopic]

    hybrid_kwargs = dict(char_embed_matrix=char_embed_matrix,
                         term_embed_matrix=term_embed_matrix,
                         NUM_FEAT=8)
    pe_kwargs = dict(hybrid_kwargs, PE=True)

    ys = []

    def run_group(specs, inputs, **kwargs):
        """Build, load and run each (class, weights file, message) entry."""
        for model_cls, weights_name, done_message in specs:
            model = model_cls(name=weights_name, **kwargs)
            model.load_weights()
            # Each model receives its own list object, as before.
            ys.append(model.predict(list(inputs)))
            # Drop the reference before the next model is built.
            del model
            if done_message:
                print(done_message)

    print('define model')
    run_group([
        (HybridDenseModel, 'hybriddensemodel_PE.h5', None),
        (HybridDenseMAModel, 'hybriddensemodelma_PE.h5', 'dense model done.'),
        (HybridSEModel, 'hybridsemodel_PE.h5', 'se model done.'),
    ], full_inputs, **pe_kwargs)

    print('start len 100 model')
    run_group([
        (HybridConvModel, 'hybridconvmodel_n100.h5',
         'hybrid conv model done.'),
        (HybridGatedDeepCNNModel, 'hybridgateddeepcnnmodel_n100.h5',
         'hybrid gated deep cnn model done.'),
        (HybridRCNNModel, 'hybridrcnnmodel_n100.h5',
         'hybrid RCNN model done.'),
    ], short_inputs, MAX_LEN=100, MAX_LEN_TERM=100, **pe_kwargs)

    # Disabled members, kept for reference: the MAX_LEN=200 variants
    # ('hybridconvmodel_n200.h5', 'hybriddpcnnmodel_n200.h5',
    # 'hybridgateddeepcnnmodel_n200.h5', 'hybridrcnnmodel_n200.h5') and
    # ConditionAttModel ('conditionattmodel_PE.h5', lr=0.001), which the
    # original author marked as too slow.

    run_group([
        (ConditionConvModel, 'conditionconvmodel_PE.h5',
         'condition conv model done.'),
        (ConditionDPCNNModel, 'conditiondpcnnmodel_PE.h5',
         'condition dpcnn model done.'),
        (ConditionGatedConvModel, 'conditiongatedconvmodel_PE.h5',
         'condition gated conv model done.'),
        (ConditionGatedDeepCNNModel, 'conditiongateddeepcnnmodel_PE.h5',
         'condition gated deepcnn model done.'),
    ], full_inputs, lr=0.001, **pe_kwargs)

    run_group([
        (HybridAttModel, 'hybridattmodel_PE.h5',
         'hybrid att model done.'),
        (HybridConvModel, 'hybridconvmodel_PE.h5',
         'hybrid conv model done.'),
        (HybridDPCNNModel, 'hybriddpcnnmodel_PE.h5',
         'hybrid dpcnn model done.'),
        (HybridGatedDeepCNNModel, 'hybridgateddeepcnnmodel_PE.h5',
         'hybrid gated deep cnn model done.'),
        (HybridRCNNModel, 'hybridrcnnmodel_PE.h5',
         'hybrid rcnn model done.'),
        (HybridGatedConvTopicModel, 'hybridgatedconvtopicmodel_PE.h5',
         'hybrid gated conv topic done.'),
    ], full_inputs, **pe_kwargs)

    # The fastText members take no arguments here.
    ys.append(fasttextmodel.predict_char())
    y = fasttextmodel.predict_term()
    ys.append(y)
    print(y.shape)
    print('fast text done.')

    # Hybrid model with default weights name and no position inputs.
    # (The original carried an unexplained "+ 37" note next to NUM_FEAT=8.)
    model = HybridModel(**hybrid_kwargs)
    model.load_weights()
    y = model.predict([x, xterm, xfeat])
    ys.append(y)
    print(y.shape)
    print('hybrid model done.')

    ys = np.array(ys)
    print(ys.shape)
    return ys
def predict_all():
    """Predict the validation set with the full ensemble; write result.csv.

    Original note: ensemble by probability.

    The outputs of all members are averaged with ``np.mean(..., axis=0)``,
    turned into label indices by ``convert_y`` and written to ``result.csv``
    as ``id,label`` lines. Takes no arguments and returns nothing.

    Side effects: pickles the fitted feature scaler to ``tn_conf.feat_norm``
    and overwrites ``result.csv`` in the working directory.
    """
    print('load data')
    tn_conf = TrainConfigure()

    # Normalization: the scaler is fitted on the training features, pickled
    # to tn_conf.feat_norm, and later applied to the validation features.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(data_utils.pickle_load(tn_conf.feat_file))
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)

    xtopic = data_utils.pickle_load('data/lda_vec_val.pkl')

    print('loading embed ...')
    # Disabled alternative for the term embedding: the word-word vectors
    # 'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5', dumped to
    # 'data/term_embed_ww.pkl'.
    embed_source = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, embed_source, dump_path='data/term_embed.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        char_vocab_dict, embed_source, dump_path='data/char_embed.pkl')
    print('load embed done.')

    val_conf = ValidConfigure()
    data_dict = data_utils.pickle_load(val_conf.char_file)
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = scaler.transform(data_utils.pickle_load(val_conf.feat_file))
    print('feat shape', xfeat.shape)

    import data_utils100
    val_conf100 = data_utils100.ValidConfigure()
    data_dict100 = data_utils.pickle_load(val_conf100.char_file)
    x100 = data_dict100['x']
    xterm100 = data_utils.pickle_load(val_conf100.term_file)

    # Position-index inputs: one row 0..L-1 per sample. They must have as
    # many rows as the validation data; the previous code sized the 600/300
    # matrices from the training set instead.
    n_samples = len(x)

    def positions(length):
        return np.tile(np.arange(length), (n_samples, 1))

    xe, xe_term = positions(600), positions(300)
    xe100, xe_term100 = positions(100), positions(100)

    full_inputs = [x, xe, xterm, xe_term, xfeat, xtopic]
    plain_inputs = [x, xterm, xfeat, xtopic]
    short_inputs = [x100, xe100, xterm100, xe_term100, xfeat, xtopic]

    hybrid_kwargs = dict(char_embed_matrix=char_embed_matrix,
                         term_embed_matrix=term_embed_matrix,
                         NUM_FEAT=8)
    pe_kwargs = dict(hybrid_kwargs, PE=True)

    ys = []

    def run(model_cls, inputs, **kwargs):
        """Build one model, load its weights and return its prediction.

        The model is local to this helper, so the reference is dropped on
        return, before the next model is built.
        """
        model = model_cls(**kwargs)
        model.load_weights()
        if isinstance(inputs, list):
            # Each model receives its own list object, as before.
            inputs = list(inputs)
        return model.predict(inputs)

    def add(model_cls, inputs, **kwargs):
        """Run one model and record its output in ``ys``."""
        ys.append(run(model_cls, inputs, **kwargs))

    def add_group(specs, inputs, **kwargs):
        """Run each (class, weights file, message) entry on ``inputs``."""
        for model_cls, weights_name, done_message in specs:
            add(model_cls, inputs, name=weights_name, **kwargs)
            if done_message:
                print(done_message)

    print('define model')
    add_group([
        (HybridDenseModel, 'hybriddensemodel_PE.h5', None),
        (HybridDenseMAModel, 'hybriddensemodelma_PE.h5', 'dense model done.'),
        (HybridSEModel, 'hybridsemodel_PE.h5', 'se model done.'),
    ], full_inputs, **pe_kwargs)

    print('start len 100 model')
    # Disabled: HybridDPCNNModel with 'hybriddpcnnmodel_n100.h5'.
    add_group([
        (HybridConvModel, 'hybridconvmodel_n100.h5',
         'hybrid conv model done.'),
        (HybridGatedDeepCNNModel, 'hybridgateddeepcnnmodel_n100.h5',
         'hybrid gated deep cnn model done.'),
        (HybridRCNNModel, 'hybridrcnnmodel_n100.h5',
         'hybrid RCNN model done.'),
    ], short_inputs, MAX_LEN=100, MAX_LEN_TERM=100, **pe_kwargs)

    # Disabled members, kept for reference: the MAX_LEN=200 variants
    # ('hybridconvmodel_n200.h5', 'hybriddpcnnmodel_n200.h5',
    # 'hybridgateddeepcnnmodel_n200.h5', 'hybridrcnnmodel_n200.h5'), plus
    # ConditionAttModel ('conditionattmodel_PE.h5') and ConditionRCNNModel
    # ('conditionrcnnmodel_PE.h5'), both marked as too slow by the original
    # author.

    add_group([
        (ConditionConvModel, 'conditionconvmodel_PE.h5',
         'condition conv model done.'),
        (ConditionDPCNNModel, 'conditiondpcnnmodel_PE.h5',
         'condition dpcnn model done.'),
        (ConditionGatedConvModel, 'conditiongatedconvmodel_PE.h5',
         'condition gated conv model done.'),
        (ConditionGatedDeepCNNModel, 'conditiongateddeepcnnmodel_PE.h5',
         'condition gated deepcnn model done.'),
    ], full_inputs, lr=0.001, **pe_kwargs)

    # Each hybrid architecture runs twice: with position inputs (PE weights)
    # and without them (plain weights).
    for model_cls, pe_weights, plain_weights, done_message in [
        (HybridAttModel, 'hybridattmodel_PE.h5', 'hybridattmodel.h5',
         'hybrid att model done.'),
        (HybridConvModel, 'hybridconvmodel_PE.h5', 'hybridconvmodel.h5',
         'hybrid conv model done.'),
        (HybridDPCNNModel, 'hybriddpcnnmodel_PE.h5', 'hybriddpcnnmodel.h5',
         'hybrid dpcnn model done.'),
        (HybridGatedDeepCNNModel, 'hybridgateddeepcnnmodel_PE.h5',
         'hybridgateddeepcnnmodel.h5', 'hybrid gated deep cnn model done.'),
        (HybridRCNNModel, 'hybridrcnnmodel_PE.h5', 'hybridrcnnmodel.h5',
         'hybrid rcnn model done.'),
    ]:
        add(model_cls, full_inputs, name=pe_weights, **pe_kwargs)
        add(model_cls, plain_inputs, name=plain_weights, **hybrid_kwargs)
        print(done_message)

    add(ConditionModel, x, embed_matrix=char_embed_matrix)
    add(ConditionModel, [x, xe], embed_matrix=char_embed_matrix, PE=True,
        name='conditionmodel_PE.h5')
    add(ConditionModel, xterm, embed_matrix=term_embed_matrix, MAX_LEN=300,
        name='conditionmodel_term.h5')
    add(ConditionModel, [xterm, xe_term], embed_matrix=term_embed_matrix,
        MAX_LEN=300, PE=True, name='conditionmodel_term_PE.h5')
    print('condition model done.')

    add(GatedConvTopicModel, [x, xe, xtopic], embed_matrix=char_embed_matrix,
        PE=True, name='gatedconvtopicmodel_PE.h5')
    print('gated conv topic done.')

    add(HybridGatedConvTopicModel, full_inputs,
        name='hybridgatedconvtopicmodel_PE.h5', **pe_kwargs)
    add(HybridGatedConvTopicModel, plain_inputs,
        name='hybridgatedconvtopicmodel.h5', **hybrid_kwargs)
    print('hybrid gated conv topic done.')

    add(RCNNModel, xterm, MAX_LEN=300, embed_matrix=term_embed_matrix,
        name='RCNNmodel.h5')
    print('RCNN done.')

    # The fastText members take no arguments here.
    ys.append(fasttextmodel.predict_char())
    y = fasttextmodel.predict_term()
    ys.append(y)
    print(y.shape)
    print('fast text done.')

    # Hybrid model with default weights name and no position inputs.
    # (The original carried an unexplained "+ 37" note next to NUM_FEAT=8.)
    y = run(HybridModel, [x, xterm, xfeat], **hybrid_kwargs)
    ys.append(y)
    print(y.shape)
    print('hybrid model done.')

    # CNN model (char)
    add(CharModel, x, embed_matrix=char_embed_matrix)
    add(CharModel, [x, xe], embed_matrix=char_embed_matrix,
        name='charmodel_PE.h5', PE=True)
    add(CharModel, [x, xe], embed_matrix=char_embed_matrix,
        name='charmodel_PE_OE.h5', PE=True)
    print('char model done.')

    # CNN model (term)
    add(TermModel, xterm, embed_matrix=term_embed_matrix)
    print('term model done.')

    add(DeepCNNModel, x, embed_matrix=char_embed_matrix)
    print('deep cnn done.')

    # Attention model (char)
    add(AttModel, x, MAX_LEN=600, name='charattmodel.h5',
        embed_matrix=char_embed_matrix)
    print('att char done.')

    # Attention model (term)
    add(AttModel, xterm, MAX_LEN=300, embed_matrix=term_embed_matrix)
    print('att term done.')

    add(SSCharModel, [x, xe], embed_matrix=char_embed_matrix,
        name='sscharmodel_PE.h5', PE=True, train_embed=True)
    add(SSCharModel, x, embed_matrix=char_embed_matrix, train_embed=True)
    print('conv model with second learning passes done.')

    # NOTE(review): this is the only member whose output goes through
    # convert_onehot before averaging -- confirm that this is intentional.
    ys.append(convert_onehot(
        run(GatedConvModel, [x, xe], embed_matrix=char_embed_matrix,
            name='gatedconvmodel_PE.h5', PE=True)))
    add(GatedConvModel, x, embed_matrix=char_embed_matrix, train_embed=True)
    print('gated conv done.')

    add(GatedDeepCNNModel, [x, xe], embed_matrix=char_embed_matrix,
        name='gateddeepcnnmodel_PE.h5', PE=True, train_embed=True)
    add(GatedDeepCNNModel, x, embed_matrix=char_embed_matrix,
        train_embed=True)
    print('gated deep cnn done.')

    # Output labels, indexed by the values convert_y returns.
    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']
    y_pred = convert_y(np.mean(ys, axis=0))
    out_file = 'result.csv'
    with open(out_file, 'w', encoding='utf-8') as fout:
        fout.writelines('{},{}\n'.format(sample_id, labels[yi])
                        for sample_id, yi in zip(ids, y_pred))
    print('done.')
def train_model_tfidf(model_conf, model_name='hybridconvmodel_tfidf.h5', ModelClass=HybridModelBase):
    """Train ``ModelClass`` with TF-IDF inputs in two passes.

    Pass 1 builds the model with ``train_embed=False, train_top=True`` and
    trains it. Pass 2 rebuilds it with ``train_embed=True, train_top=False``
    and half the learning rate, reloads the saved weights and trains again.

    Args:
        model_conf: Model configuration passed as the first constructor
            argument; its ``lr`` attribute is read here. It is halved only
            for the second pass and restored before returning, so the
            caller's object is left unchanged.
        model_name: Weights file name passed to the model as ``name``.
        ModelClass: Model class to instantiate.

    Side effects: pickles the fitted feature scaler to ``conf.feat_norm``.
    Returns nothing.
    """
    print(model_name)
    print('load data')
    import data_utils
    import training_utils
    conf = data_utils.TrainConfigure()
    data_dict = data_utils.pickle_load(conf.char_file)

    print('loading embed ...')
    embed_source = 'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5'
    vocab_dict = data_utils.pickle_load(conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(
        vocab_dict, embed_source, dump_path='data/char_embed.pkl')
    MAX_LEN = 600
    x = data_dict['x']
    xterm = data_utils.pickle_load(conf.term_file)
    term_vocab_dict = data_utils.pickle_load(conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(
        term_vocab_dict, embed_source, dump_path='data/term_embed.pkl')
    MAX_LEN_TERM = 300
    print('load embed done.')

    y = to_categorical(data_dict['y'])
    xt = data_utils.pickle_load('data/lda_vec.pkl')
    xfeat = data_utils.pickle_load(conf.feat_file)
    x_tfidf, xterm_tfidf = data_utils.pickle_load(conf.tfidf_file)
    print('tfidf shape', x_tfidf.shape, xterm_tfidf.shape)

    # Normalization: the fitted scaler is pickled to conf.feat_norm.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, conf.feat_norm)
    xfeat = scaler.transform(xfeat)

    # Position-index inputs: one row 0..L-1 per sample.
    n_samples = y.shape[0]
    xe = np.tile(np.arange(MAX_LEN), (n_samples, 1))
    xe_term = np.tile(np.arange(MAX_LEN_TERM), (n_samples, 1))

    # Two unshuffled splits: first into train/test, then train/validation.
    x_tn, y_tn, x_ts, y_ts = training_utils.split(
        [x, xe, xterm, xe_term, xfeat, xt, x_tfidf, xterm_tfidf],
        y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)

    print('train')
    print('define model')
    model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                       term_embed_matrix=term_embed_matrix, name=model_name,
                       train_embed=False, train_top=True)
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    del model

    # Second pass at half the learning rate. The rate is restored afterwards
    # so that repeated calls with the same configuration object do not
    # compound the halving.
    base_lr = model_conf.lr
    model_conf.lr = base_lr * 0.5
    try:
        model = ModelClass(model_conf, char_embed_matrix=char_embed_matrix,
                           term_embed_matrix=term_embed_matrix,
                           name=model_name, train_embed=True,
                           train_top=False)
        model.load_weights()
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
        del model
    finally:
        model_conf.lr = base_lr
# Build the word vocabulary from the training tokens themselves.
wordlist = itertools.chain.from_iterable(train_tokens)
word_index, _ = build_vocab(wordlist)

# Load the dependency embedding.
dep_embedding_path = "dep_embedding/deps.contexts"
# An earlier version used load_embedding(dep_embedding_path) here.
dep_embedding_index = load_dep_embedding(dep_embedding_path)
dep_embedding_matrix = get_embedding_matrix(
    word_index, dep_embedding_index, FLAGS.word_embedding_size)
print("finish loading dependency embedding")
print(dep_embedding_index.get("the"))
# NOTE(review): this exit() ends the script here, so nothing below runs
# while it stays in place. It looks like a debugging stop -- confirm.
exit()

# Load the GloVe vectors whose dimension matches the configured size.
embedding_path = "glove.6B/glove.6B.{}d.txt".format(FLAGS.word_embedding_size)
embedding_index = load_embedding(embedding_path)
embedding_matrix = get_embedding_matrix(
    word_index, embedding_index, FLAGS.word_embedding_size)
print("finish loading Glove embedding")
print(len(dep_embedding_matrix[5]))
print(len(embedding_matrix[5]))
# NOTE(review): second exit(), apparently also left over from debugging.
exit()

vocab_size = len(word_index)

# Convert words to indices, including padding and cutting to MAXLEN.
train_x, valid_x, test_x = (
    tokens_to_indices(word_index, tokens, MAXLEN)
    for tokens in (train_tokens, valid_tokens, test_tokens)
)

# Get the topic sequence.
max_topic = 5
train_topic = tokens_to_indices(topic_index, train_topic, max_topic)