def test_mask_dataset():
    """Smoke test: load the masked dataset and print one sample per split."""
    logging.info("test_mask_dataset")
    dataset = mask_dataset(data_dir, set_names=['train', 'val'],
                           suffixes=['context', 'question'])
    print dataset.keys()
    for k, v in dataset.iteritems():
        print k, len(v), v[0]

def main():
    logging.basicConfig(level=logging.INFO)
    file_handler = logging.FileHandler('log_test.txt')
    logging.getLogger().addHandler(file_handler)

    answer = read_answers(data_dir)
    dataset = mask_dataset(data_dir, set_names=set_names, suffixes=suffixes)
    embed_path = pjoin(data_dir, "glove.trimmed.100.npz")
    embedding = np.load(embed_path)['glove']

    test_model(100, dataset['train_context'][:100], dataset['train_question'][:100],
               embedding, answer['train_answer'][:100])

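# NOTE: main() above assumes a trimmed GloVe matrix stored in an .npz archive under
# the key 'glove'. A minimal sketch of writing such a file, with hypothetical names
# (the real preprocessing script lives elsewhere in the project):
def save_trimmed_glove(path, trimmed_matrix):
    # np.load(path)['glove'] then recovers the embedding matrix.
    np.savez_compressed(path, glove=trimmed_matrix)
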
def rnn_test():
    embed_path = pjoin(data_dir, "glove.trimmed.100.npz")
    embedding = np.load(embed_path)['glove']
    dataset = mask_dataset(data_dir, set_names=set_names, suffixes=suffixes)
    test_data = dataset['train_context'][:2]
    inputs = [x[0] for x in test_data]
    masks = [x[1] for x in test_data]
    inputs = np.array(inputs)
    print('shape of inputs {}'.format(inputs.shape))
    masks = np.array(masks)
    print('shape of masks {}'.format(masks.shape))

    with tf.Graph().as_default():
        # embedding_tf = tf.Variable(embedding)
        x = tf.placeholder(tf.int32, (None, 400))
        x_m = tf.placeholder(tf.bool, (None, 400))
        l_x = tf.placeholder(tf.int32, (None,))
        print(x)
        print(x_m)
        print(l_x)
        embed = tf.nn.embedding_lookup(embedding, x)
        # x_in = tf.boolean_mask(embed, x_m)
        print('shape of embed {}'.format(embed.shape))
        # print('shape of x_in {}'.format(x_in.shape))
        num_hidden = 5
        lstm_fw_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
        lstm_bw_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
        outputs, outputs_states = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell, lstm_bw_cell, embed,
            sequence_length=sequence_length(x_m), dtype=tf.float64)
        outputs = tf.concat(outputs, axis=2)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            outp, outps = sess.run([outputs, outputs_states],
                                   feed_dict={x: inputs, x_m: masks})
            # print('shape of input embeddings is : {}'.format(xin.shape))
            print("shape of output is :{}".format(outp.shape))
            assert outp.shape == (2, 400, 2 * num_hidden), \
                'the shape of outp should be {} but it is {}'.format(
                    (2, 400, 2 * num_hidden), outp.shape)
            print(outp)

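# NOTE: rnn_test() relies on a sequence_length() helper that is defined elsewhere in
# the project. A minimal sketch of what it might look like, assuming the mask is a
# boolean tensor of shape (batch, max_len) with True at real-token positions and that
# tensorflow is imported as tf as in the rest of this file (an assumption, not
# necessarily the project's implementation):
def sequence_length(mask):
    # Count the unmasked positions in each row to get per-example lengths.
    return tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
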
def main(_):
    '''Check Config.py to set up the model paths to be ensembled.'''
    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)
    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)
            init = tf.global_variables_initializer()
            sess.run(init)
            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            # Collect the start/end predictions of each ensembled model.
            ts, te, vs, ve = qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                # Aggregate the per-model predictions with bin_count, then re-evaluate.
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval)

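# NOTE: bin_count() is a project helper not shown in this file. A minimal sketch of a
# majority-vote version, assuming each row of `preds` holds the index predicted by
# every ensembled model for one example (an assumption about the real helper, which
# may aggregate differently):
def bin_count(preds):
    # For each example, pick the index predicted most often across the models.
    return np.array([np.bincount(row).argmax() for row in preds], dtype=np.int32)
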
def main(_):
    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes

    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embed_path = pjoin(data_dir, "glove.trimmed." + str(cfg.embed_size) + ".npz")
    # logging.info('embed size: {} for path {}'.format(cfg.embed_size, embed_path))
    # embedding = np.load(embed_path)['glove']

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(cfg.train_dir)
        qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                 raw_answers=raw_answers,
                 # debug_num=1000,
                 rev_vocab=rev_vocab)
        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True, training=True, sample=4000)

def main(_):
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    num_hidden = cfg.lstm_num_hidden
    data_dir = cfg.DATA_DIR
    embed_path = pjoin(data_dir, "glove.trimmed." + str(cfg.embed_size) + ".npz")
    vocab_path = pjoin(data_dir, cfg.vocab_file)

    dataset = mask_dataset(data_dir, set_names=set_names, suffixes=suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    encoder = Encoder(size=2 * num_hidden)
    decoder = Decoder(output_size=2 * num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        tic = time.time()
        qa.train(sess, dataset, answers, save_train_dir, raw_answers, rev_vocab,
                 # debug_num=1000
                 )
        # qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
        #                    log=True, training=True, sample=4000)
        toc = time.time()
        logging.info("Total training process took {} hours".format(
            (toc - tic) / 3600.))

def main(_):
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    args = parse_arg()
    update_config(args, c_time)
    # pprint.pprint(cfg)
    logging.info(cfg)
    if args.test:
        pdb.set_trace()

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embed_path = pjoin(data_dir, "glove.trimmed." + str(cfg.embed_size) + ".npz")
    # logging.info('embed size: {} for path {}'.format(cfg.embed_size, embed_path))
    # embedding = np.load(embed_path)['glove']

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)
    file_handler = logging.FileHandler(pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        if args.test:
            # Quick debug run on a small subset of the data.
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     debug_num=100,
                     rev_vocab=rev_vocab)
        else:
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     rev_vocab=rev_vocab)

        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True, training=True, sample=4000)
