def exec_(param):
    '''initialize vocab for specified problem'''
    write_log("--------------------------------------------------------\n")
    global time_point
    time_point = time.clock()

    # common config and user-control parameter loading
    qa_data_mode = param['data_mode'].qa_data
    wdim = cfg.ModelConfig.WORD_DIM_DICT[qa_data_mode]
    train_wv = param["preprocess_mode"].train_wv
    clean_corpus = param["preprocess_mode"].clean_corpus
    corpus_mode = cfg.PreProcessConfig.CORPUS_MODE
    ling_unit = cfg.PreProcessConfig.LING_UNIT  # linguistic unit mode
    assert ling_unit in ("WORD", "CHAR")
    s_w_rmvl = cfg.PreProcessConfig.STOP_WORD_REMOVAL  # stop-word removal mode
    train_set = param['model_mode'].train_set
    eval_set = param['model_mode'].eval_set
    use_saved_4_training = param['model_mode'].use_saved_4_training
    use_saved_4_testing = param['model_mode'].use_saved_4_testing

    wv_path, qa_data_path_t, qa_data_path_e, model_weight_path, score_path = \
        generate_model_paths(qa_data_mode, ling_unit, s_w_rmvl, train_set, eval_set)

    vocab = du.Vocab(wv_path)  # initialize a vocab instance
    if train_wv:
        # train new word vectors from the corpus, cleaning its text first
        corpus_path = cfg.DirConfig.CORPUS_PATH_DICT[corpus_mode]
        clean_corpus_path = cfg.DirConfig.CLEAN_CORPUS_PATH_DICT[corpus_mode]
        if clean_corpus:
            print("---starting to clean corpus text at running time %f---"
                  % (time.clock() - time_point))
            write_log("started to clean corpus text at running time %f\n"
                      % (time.clock() - time_point))
            text_cleaner = du.TextCleaner(corpus_path)
            text_cleaner.clean_chn_corpus_2file(clean_corpus_path)
        # copy cleaned text to the word2vec directory
        clean_corpus_filename = cfg.DirConfig.CLEAN_CORPUS_FILENAME_DICT[corpus_mode]
        dst_corpus_path = cfg.DirConfig.W2V_DIR + clean_corpus_filename
        copyfile(clean_corpus_path, dst_corpus_path)
        print("---starting to build vocab database at running time %f---"
              % (time.clock() - time_point))
        write_log("started to build vocab database at running time %f\n"
                  % (time.clock() - time_point))
        vocab.build_vocab_database(qa_data_mode, clean_corpus_filename)
    else:
        print("---starting to load vocab from database at running time %f---"
              % (time.clock() - time_point))
        write_log("started to load vocab from database at running time %f\n"
                  % (time.clock() - time_point))
        vocab.load_wv_from_db(qa_data_mode)

    # NOTE: initializing the data stream and starting training directly here
    # was canceled; the data stream used to initiate unknown-word mapping
    # (also canceled), so beware of deadlocks in database operations.

    if train_set != 'NAH' and eval_set != 'NAH':
        # run one evaluation pass before training for comparison
        print("evaluation before training:")
        write_log("evaluation before training:\n")
        print("---starting to prepare evaluation data stream at running time %f---"
              % (time.clock() - time_point))
        write_log("started to prepare evaluation data stream at running time %f\n"
                  % (time.clock() - time_point))
        predicted_score_lst = []
        score_file = open(score_path, 'wb')
        data_stream = du.SentenceDataStream(qa_data_path_e, vocab, (qa_data_mode, 'e'))
        print("---starting to initialize model for evaluation at running time %f---"
              % (time.clock() - time_point))
        write_log("started to initialize model for evaluation at running time %f\n"
                  % (time.clock() - time_point))
        model_graph = ConvQAModelGraph(wdim)
        model_in = model_graph.get_model_inputs()
        model_out = model_graph.get_model_outputs()
        my_model = Model(inputs=model_in, outputs=model_out)
        loss_func = cfg.ModelConfig.LOSS_FUNC
        optm = cfg.ModelConfig.OPT
        my_model.compile(optimizer=optm, loss=loss_func, metrics=['accuracy'])
        if use_saved_4_training:  # kept consistent with the training setup below
            try:
                my_model.load_weights(model_weight_path)
            except Exception as e:
                print("%s" % e)
                write_log("%s" % e)

        batch_size = data_stream.get_batch_size()
        g = data_stream.get_batch()
        while True:
            try:
                q_batch, a_batch, add_feat_batch = next(g)
            except StopIteration:
                break
            x = [q_batch, a_batch, add_feat_batch]
            predicted_batch = list(my_model.predict(x, batch_size))
            predicted_score_lst.extend(predicted_batch)
        for sc in predicted_score_lst:
            score_file.write((str(sc[0]) + '\n').encode('utf-8'))
        score_file.close()
        res = eval_in_model(qa_data_path_e, score_path, '')
        write_log(res + '\n')
        write_log("before-training evaluation ends\n")
        write_log("----------------------------\n")

    if train_set != 'NAH':
        # training
        print("---starting to prepare training data stream at running time %f---"
              % (time.clock() - time_point))
        write_log("started to prepare training data stream at running time %f\n"
                  % (time.clock() - time_point))
        data_stream = du.SentenceDataStream(qa_data_path_t, vocab, (qa_data_mode, 't'))
        print("---starting to initialize model for training at running time %f---"
              % (time.clock() - time_point))
        write_log("started to initialize model for training at running time %f\n"
                  % (time.clock() - time_point))
        model_graph = ConvQAModelGraph(wdim)
        model_in = model_graph.get_model_inputs()
        model_out = model_graph.get_model_outputs()
        my_model = Model(inputs=model_in, outputs=model_out)

        # some hyper-parameters
        batch_size = data_stream.get_batch_size()
        train_epoch = cfg.ModelConfig.TRAIN_EPOCH
        loss_func = cfg.ModelConfig.LOSS_FUNC
        optm = cfg.ModelConfig.OPT

        # model initialization
        my_model.compile(optimizer=optm, loss=loss_func, metrics=['accuracy'])
        if use_saved_4_training:
            try:
                my_model.load_weights(model_weight_path)
            except Exception as e:
                print("%s" % e)
                write_log("%s" % e)

        # start training
        print("---starting to feed model at running time %f---"
              % (time.clock() - time_point))
        write_log("started to feed model at running time %f\n"
                  % (time.clock() - time_point))
        for _ in range(train_epoch):
            g = data_stream.get_batch()
            while True:
                try:
                    q_batch, a_batch, label_batch, add_feat_batch = next(g)
                except StopIteration:
                    break
                x = [q_batch, a_batch, add_feat_batch]
                y = [label_batch]
                my_model.fit(x, y, batch_size=batch_size)
        my_model.save_weights(model_weight_path)

        # unknown-word statistics for the training pass
        print(vocab._unk_num)
        print(len(vocab.kn_set))
        write_log("In %s set, %d known words and %d unknown words found.\n"
                  % (train_set, len(vocab.kn_set), vocab._unk_num))

    if eval_set != 'NAH':
        # evaluation
        print("---starting to prepare evaluation data stream at running time %f---"
              % (time.clock() - time_point))
        write_log("started to prepare evaluation data stream at running time %f\n"
                  % (time.clock() - time_point))
        predicted_score_lst = []
        score_file = open(score_path, 'wb')
        data_stream = du.SentenceDataStream(qa_data_path_e, vocab, (qa_data_mode, 'e'))
        print("---starting to initialize model for evaluation at running time %f---"
              % (time.clock() - time_point))
        write_log("started to initialize model for evaluation at running time %f\n"
                  % (time.clock() - time_point))
        model_graph = ConvQAModelGraph(wdim)
        model_in = model_graph.get_model_inputs()
        model_out = model_graph.get_model_outputs()
        my_model = Model(inputs=model_in, outputs=model_out)
        loss_func = cfg.ModelConfig.LOSS_FUNC
        optm = cfg.ModelConfig.OPT
        my_model.compile(optimizer=optm, loss=loss_func, metrics=['accuracy'])
        if use_saved_4_testing:
            try:
                my_model.load_weights(model_weight_path)
            except Exception as e:
                print("%s" % e)
                write_log("%s" % e)

        batch_size = data_stream.get_batch_size()
        g = data_stream.get_batch()
        while True:
            try:
                q_batch, a_batch, add_feat_batch = next(g)
            except StopIteration:
                break
            x = [q_batch, a_batch, add_feat_batch]
            predicted_batch = list(my_model.predict(x, batch_size))
            predicted_score_lst.extend(predicted_batch)
        for sc in predicted_score_lst:
            score_file.write((str(sc[0]) + '\n').encode('utf-8'))
        score_file.close()

        # unknown-word statistics for the evaluation pass
        print(vocab._unk_num)
        print(len(vocab.kn_set))
        write_log("In %s set, %d known words and %d unknown words found.\n"
                  % (eval_set, len(vocab.kn_set), vocab._unk_num))
        res = eval_in_model(qa_data_path_e, score_path, '')
        write_log(res + '\n')

    write_log("Finished at time %f.\n" % (time.clock() - time_point))
    write_log("--------------------------------------------------------\n\n")
    time_point = time.clock()
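
# Hedged usage sketch (not part of the original source): exec_ only requires
# that param['data_mode'], param['preprocess_mode'] and param['model_mode']
# expose the attributes read above, so SimpleNamespace objects suffice for a
# smoke test. The 'DEFAULT' mode key is a hypothetical placeholder; 'NAH'
# values skip the training and evaluation branches, as the code above shows.
if __name__ == '__main__':
    from types import SimpleNamespace
    demo_param = {
        'data_mode': SimpleNamespace(qa_data='DEFAULT'),
        'preprocess_mode': SimpleNamespace(train_wv=False, clean_corpus=False),
        'model_mode': SimpleNamespace(train_set='NAH', eval_set='NAH',
                                      use_saved_4_training=False,
                                      use_saved_4_testing=False),
    }
    exec_(demo_param)  # loads the vocab from the database and exits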
import tensorflow as tf
import numpy as np
import data_util
import os
from text_cnn import TextCNN
from config import Config
import csv

test_data_file = "data/test/tokenized_reviews.txt"
test_label_file = "data/test/labels.txt"
vocab_file = "data/vocab"
checkpoint_dir = "./save/checkpoints/cnn"
result_file = "./data/cnn_result.csv"
checkpoint_prefix = os.path.join(checkpoint_dir, "cnn")
max_vocab_size = int(5e5)  # keep integral: used as a vocabulary-size limit

vocab = data_util.Vocab(vocab_file, max_vocab_size)

# load test data
test_docs, seq_len, max_len, test_labels = data_util.load_data(
    test_data_file, test_label_file, vocab)

config = Config(max_vocab_size, max_len)
model = TextCNN(config)
model.build()

sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True
sess_config.gpu_options.per_process_gpu_memory_fraction = 0.9
sess = tf.Session(config=sess_config)
init = tf.global_variables_initializer()
sess.run(init)

ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:  # no need to re-query the checkpoint state
    model.restore(sess, ckpt.model_checkpoint_path)
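
# Hedged sketch of the inference step this script appears to be missing
# (result_file and the csv import are otherwise unused). The attribute names
# model.input_x, model.dropout_keep_prob and model.predictions are assumptions
# about the project-local TextCNN class, not confirmed API.
predictions = sess.run(model.predictions,
                       feed_dict={model.input_x: test_docs,
                                  model.dropout_keep_prob: 1.0})
with open(result_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "label"])
    for i, pred in enumerate(predictions):
        writer.writerow([i, int(pred)])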
dropout = 1.0
zoneout = 0.0
filter_width = 3
embedding_size = 300
num_layers = 1
summary_len = 100
attention_hidden_size = 100
beam_depth = 5
state_size = 120
mode = "test"

doc_file = "data/test_article.txt"
sum_file = "data/test_abstract.txt"
vocab_file = "data/vocab"
checkpoint_dir = "./save/baseline/checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "baseline")
summary_file = "result/summaries.txt"

vocab = data_util.Vocab(vocab_file, max_vocab_size)
docs = data_util.load_test_data(doc_file, vocab, max_num_tokens)

with tf.Graph().as_default():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    sess = tf.Session(config=config)  # pass the GPU options built above
    log_writer = tf.summary.FileWriter(checkpoint_dir, graph=sess.graph)
    model = DenseQuasiGRU(vocab_size=max_vocab_size,
                          embedding_size=embedding_size,
                          num_layers=num_layers,
                          state_size=state_size,
                          decoder_vocab_size=max_vocab_size,
                          filter_width=filter_width,
                          zoneout=zoneout,