def main(_):
    """Entry point for answering the dev set: load vocab, GloVe, and the dev data,
    rebuild the QA model, and write predicted answers to dev-prediction.json."""
    global FLAGS
    print("FLAGS:", vars(FLAGS))

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)
    glove = embeddings['glove']  # np array

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
    #     json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load the dataset in your own way.
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, raw_context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    # Preprocess the dev data: parse the id strings and truncate long paragraphs.
    p = map(lambda line: map(int, line.strip().split(' ')), context_data)
    q = map(lambda line: map(int, line.strip().split(' ')), question_data)
    raw_context_data = map(lambda line: line.strip().split(' '), raw_context_data)
    max_len_p = min(max(map(len, p)), FLAGS.output_size)
    max_len_q = max(map(len, q))
    dataset = (p, raw_context_data, q, question_uuid_data)

    # Also load the training set so we can report train F1/EM below.
    train_p, raw_train_p, train_q, train_ans = \
        load_dataset("train", FLAGS.data_dir)
    train_padded_p, train_mask_p, train_padded_q, train_mask_q, train_ans = \
        preprocess_data((train_p, train_q, train_ans), "train", max_len_p, max_len_q)
    train_dataset = zip(train_padded_p, train_mask_p,
                        train_padded_q, train_mask_q, train_ans)

    # Print the flags again for reference.
    print("loaded flags", vars(FLAGS))

    # ========= Model-specific =========
    # You must change the following code to adjust to your model.
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      flags=FLAGS, max_len_p=max_len_p, max_len_q=max_len_q)
    decoder = Decoder(output_size=FLAGS.output_size, flags=FLAGS)
    qa = QASystem(encoder, decoder, glove, max_len_p, max_len_q, FLAGS)

    # Create saver for restoring checkpoints.
    qa.saver = tf.train.Saver()

    # Evaluate on the training set.
    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        qa.raw_train = raw_train_p
        f1, em = qa.evaluate_answer(sess, train_dataset)
        logging.info("train total f1 {}, em {}".format(f1, em))

    # Generate dev-set answers and write them as JSON to the repo root.
    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
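# Note: initialize_model(sess, model, train_dir) is called above but not defined in
# this file. A minimal sketch of the usual TF1 restore-or-initialize pattern is shown
# below, assuming model.saver is a tf.train.Saver and train_dir holds TF1-style
# checkpoints; the project's real helper may differ.
def initialize_model_sketch(session, model, train_dir):
    ckpt = tf.train.get_checkpoint_state(train_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        # Resume from an existing checkpoint.
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        # Otherwise start from freshly initialized parameters.
        session.run(tf.global_variables_initializer())
    return model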
def main(_):
    """Entry point for training: load and pad the train/val splits, build the
    encoder/decoder QA model, train it, and report F1/EM on the training set."""
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    FLAGS.sessname = "{:%Y%m%d_%H%M%S}".format(datetime.now())
    file_handler = logging.FileHandler(
        pjoin(FLAGS.log_dir, "log{}.txt".format(FLAGS.sessname)))
    logging.getLogger().addHandler(file_handler)

    # Do what you need to load datasets from FLAGS.data_dir.
    dataset = None
    train_p, raw_train_p, train_q, train_ans = \
        load_dataset("train", FLAGS.data_dir)
    val_p, raw_val_p, val_q, val_ans = \
        load_dataset("val", FLAGS.data_dir)

    max_len_p = max(max(map(len, train_p)), max(map(len, val_p)))
    max_len_p = FLAGS.output_size  # truncate paragraphs to the model's output size
    max_len_q = max(max(map(len, train_q)), max(map(len, val_q)))
    max_len_q = 60  # truncate questions in case things go awry
    max_len_ans = max(map(len, train_ans))  # 2

    train_padded_p, train_mask_p, train_padded_q, train_mask_q, train_ans = \
        preprocess_data((train_p, train_q, train_ans), "train", max_len_p, max_len_q)
    val_padded_p, val_mask_p, val_padded_q, val_mask_q, val_ans = \
        preprocess_data((val_p, val_q, val_ans), "val", max_len_p, max_len_q)

    t_len = FLAGS.train_set_size
    if t_len != -1:
        # Small subset of the training set, to check that the model can overfit.
        train_dataset = zip(train_padded_p[:t_len], train_mask_p[:t_len],
                            train_padded_q[:t_len], train_mask_q[:t_len],
                            train_ans[:t_len])
    else:
        # Regular version: the full training set.
        train_dataset = zip(train_padded_p, train_mask_p,
                            train_padded_q, train_mask_q, train_ans)
    FLAGS.num_iters = len(train_dataset)
    val_dataset = zip(val_padded_p, val_mask_p, val_padded_q, val_mask_q, val_ans)
    raw_dataset = (raw_train_p, raw_val_p)
    dataset = (train_dataset, val_dataset, raw_dataset)

    logger.info("Sanity check on lengths: min %s, max %s" %
                (lambda x: (min(x), max(x)))(map(len, train_padded_p)))

    logger.info("Loading glove embeddings...")
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embeddings = np.load(embed_path)
    glove = embeddings['glove']  # np array
    logger.info("glove dims {}".format(glove.shape))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      flags=FLAGS, max_len_p=max_len_p, max_len_q=max_len_q)
    decoder = Decoder(output_size=FLAGS.output_size, flags=FLAGS)
    qa = QASystem(encoder, decoder, glove, max_len_p, max_len_q, FLAGS)

    # Create saver for checkpointing.
    qa.saver = tf.train.Saver()

    logger.info("{}".format(vars(FLAGS)))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        f1, em = qa.evaluate_answer(sess, train_dataset, log=True)
        logger.info("final evaluation: F1: {}, EM: {}".format(f1, em))
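# Note: preprocess_data is also defined outside this file. Judging from the call
# sites above, it pads/truncates paragraphs and questions to fixed lengths and
# returns boolean masks alongside them. A hypothetical sketch follows; the function
# name, the pad_id default, and the mask convention are assumptions, not the
# project's actual implementation.
def preprocess_data_sketch(data, name, max_len_p, max_len_q, pad_id=0):
    p, q, ans = data

    def pad_and_mask(seqs, max_len):
        padded, masks = [], []
        for seq in seqs:
            seq = seq[:max_len]  # truncate sequences longer than max_len
            masks.append([True] * len(seq) + [False] * (max_len - len(seq)))
            padded.append(seq + [pad_id] * (max_len - len(seq)))
        return padded, masks

    padded_p, mask_p = pad_and_mask(p, max_len_p)
    padded_q, mask_q = pad_and_mask(q, max_len_q)
    return padded_p, mask_p, padded_q, mask_q, ans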