def main(_):
    """Build the QA system, train it, then score it on the validation split.

    The positional argument is ignored; every setting is read from the
    module-level ``FLAGS`` object.
    """
    train_examples = initialize_datasets(FLAGS.data_dir, 'train')
    val_examples = initialize_datasets(FLAGS.data_dir, 'val')

    # An explicit flag value wins; otherwise fall back to the default paths.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    # Only the reverse vocabulary is needed below.
    _, rev_vocab = initialize_vocab(vocab_path)

    qa = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.output_size),
        embed_path,
        rev_vocab,
        FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    # Keep a copy of the flag values: once on stdout, once as JSON.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_examples, checkpoint_dir)
        qa.evaluate_answer(sess, val_examples, sample=1000, log=True)
def main(args):
    """Load embeddings and SQuAD data, then train and evaluate the QA system.

    Args:
        args: Positional arguments forwarded by the caller. They are not used.

    When ``FLAGS.verify_only`` is truthy, samples are printed through
    ``print_samples`` and the function returns before any model is built.
    """
    embed_path = FLAGS.embed_path or "data/squad/glove.trimmed.{}.npz".format(
        FLAGS.embedding_size)
    # The context manager closes the archive even if the 'glove' entry is
    # missing or unreadable.
    with np.load(embed_path) as archive:
        embeddings = archive['glove']
    assert embeddings.shape[1] == FLAGS.embedding_size
    vocab_len = embeddings.shape[0]

    max_vocab = vocab_len if FLAGS.check_embeddings else 0
    train = load_squad(FLAGS.data_dir, "train", max_vocab=max_vocab,
                       max_samples=FLAGS.max_train_samples)
    val = load_squad(FLAGS.data_dir, "val", max_vocab=max_vocab,
                     max_samples=FLAGS.max_val_samples)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if FLAGS.verify_only:
        print_samples(train, FLAGS.verify_only, rev_vocab)
        return

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize
    # the file paths saved in the checkpoint. This allows the model to be
    # reloaded even if the location of the checkpoint files has moved,
    # allowing usage with CodaLab. This must be done on both train.py and
    # qa_answer.py in order to work.
    #
    # lexists() rather than exists(): exists() follows the link and reports
    # False for a dangling symlink, which would leave the stale link in place
    # and make os.symlink() below fail because the name is already taken.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    qa = QASystem(train_dir, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        qa.train(sess, train)
        qa.evaluate_answer(sess, qa.preprocess_sequence_data(val), log=True)
def main(FLAGS):
    """Build the QA system, train it, and evaluate it, printing progress.

    Args:
        FLAGS: Settings object. This function reads ``embed_path``,
            ``embedding_size``, ``vocab_path``, ``data_dir``, ``output_size``,
            ``log_dir``, ``load_train_dir``, ``train_dir`` and ``evaluate``
            from it, and dumps ``FLAGS.__flags`` to ``flags.json``.
    """
    banner = 80 * "="
    print(banner)
    print("INITIALIZING")
    print(banner)

    if not os.path.exists('./data/weights/'):
        os.makedirs('./data/weights/')

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    print("Loading Embedding Matrix")
    # Close the .npz archive once the matrix has been read; the previous
    # np.load(...)['glove'] form left the underlying file handle open.
    with np.load(embed_path) as archive:
        embeddings = archive['glove']

    # NOTE(review): the encoder size comes from FLAGS.output_size here, not a
    # state-size flag. Kept as in the original; confirm this is intended.
    encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        print("Building Network ... ")
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        print("Load Training Data")
        dataset = initialize_datasets(FLAGS.data_dir, dataset='train',
                                      debugMode=True)
        print(banner)
        print("Training")
        print(banner)
        qa.train(sess, dataset, save_train_dir)
        print("Finished Training")

        print("Load Validation Data")
        dataset = initialize_datasets(FLAGS.data_dir, dataset='val',
                                      debugMode=True)
        print(banner)
        print("Evaluation")
        print(banner)
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def _read_id_rows(path):
    """Return one list of ints per line of a whitespace-separated file."""
    with open(path) as f:
        # Real lists, not map(): on Python 3 map() returns a one-shot iterator,
        # so each row would be empty after its first traversal.
        return [[int(tok) for tok in line.split()]
                for line in f.read().splitlines()]


def main(_):
    """Load the train/val id files, build the QA system, train and evaluate.

    The positional argument is ignored. ``dataset`` maps ``'train'`` and
    ``'val'`` to a ``(contexts, questions, spans)`` tuple in which every
    element is a list of integer lists read from ``FLAGS.data_dir``.
    """
    dataset = dict()
    for dataset_type in ['train', 'val']:
        data_context = _read_id_rows(
            os.path.join(FLAGS.data_dir, "%s.ids.context" % dataset_type))
        data_question = _read_id_rows(
            os.path.join(FLAGS.data_dir, "%s.ids.question" % dataset_type))
        data_span = _read_id_rows(
            os.path.join(FLAGS.data_dir, "%s.span" % dataset_type))
        dataset[dataset_type] = (data_context, data_question, data_span)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # NOTE(review): `config` is not defined inside this function; presumably a
    # module-level object - confirm it exists where this file is used.
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      config=config)
    decoder = Decoder(output_size=FLAGS.output_size, config=config)
    qa = QASystem(encoder, decoder, config=config)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
def main(_):
    """Build the QA system from hyper-parameters, then train and evaluate it.

    The positional argument is ignored. ``dataset`` is a tuple of three file
    paths (context ids, question ids, spans) for the training split; the
    files themselves are not opened here.
    """
    # TODO maybe pass as loaded dataset abstraction instead of file_paths?
    default_hparams = create_hparams(FLAGS)

    # pjoin instead of string concatenation, matching the other paths below and
    # avoiding a doubled separator when data_dir ends with one.
    context_file_path = pjoin(FLAGS.data_dir, 'train.ids.context')
    question_file_path = pjoin(FLAGS.data_dir, 'train.ids.question')
    span_file_path = pjoin(FLAGS.data_dir, 'train.span')
    dataset = (context_file_path, question_file_path, span_file_path)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    default_hparams.add_hparam('vocab_size', len(vocab))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)
    qa = QASystem(encoder, decoder, default_hparams)

    # Setup embeddings. The archive is closed as soon as the matrix has been
    # converted; the previous np.load(...)['glove'] form leaked the handle.
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    with np.load(embed_path) as archive:
        np_embeddings = np.float32(archive['glove'])

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    with tf.Session(config=session_config) as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir, np_embeddings)
        # NOTE(review): evaluation receives the same training file paths that
        # were used for training - confirm a validation split is not intended.
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Train on train+val combined and evaluate on the validation split.

    The positional argument is ignored. Checkpoints go to a fresh
    ``results/<timestamp>/model.weights/`` directory rather than to
    ``FLAGS.train_dir``.
    """
    train_examples = initialize_datasets(FLAGS.data_dir, 'train.',
                                         debugMode=False)
    val_examples = initialize_datasets(FLAGS.data_dir, 'val.',
                                       debugMode=False)
    # The validation examples are appended to the training examples in place.
    train_examples.extend(val_examples)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    _, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Model construction is slow, so report how long it took.
    started = datetime.now()
    qa = QASystem(encoder, decoder, embed_path, FLAGS, rev_vocab)
    print('Time to setup the model: ', datetime.now() - started)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        # Each run saves into its own timestamped directory.
        run_dir = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
        save_train_dir = run_dir + "model.weights/"
        if not os.path.exists(save_train_dir):
            os.makedirs(save_train_dir)

        qa.train(sess, train_examples, save_train_dir)
        qa.evaluate_answer(sess, val_examples, rev_vocab, sample=1000,
                           log=True)
def main(_):
    """Read the training id files, build the QA system, train and evaluate.

    The positional argument is ignored. The training files are read from the
    fixed ``data/squad`` directory; ``dataset`` is a dict holding the context
    and question arrays plus the first and second column of the span file.
    """
    def _parse(handle):
        # One list of ints per line of the open file.
        return np.array([[int(tok) for tok in row.split()]
                         for row in handle.readlines()])

    squad_dir = ("data", "squad")
    with open(pjoin(*(squad_dir + ("train.ids.context",))),
              encoding='utf-8') as fcontext, \
            open(pjoin(*(squad_dir + ("train.ids.question",))),
                 encoding='utf-8') as fquestion, \
            open(pjoin(*(squad_dir + ("train.span",))),
                 encoding='utf-8') as fspan:
        context = _parse(fcontext)
        question = _parse(fquestion)
        ans = _parse(fspan)

    dataset = {
        'context': context,
        'question': question,
        'answer_span_start': ans[:, 0],
        'answer_span_end': ans[:, 1]
    }

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    qa = QASystem(load_embeddings(embed_path), FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    with tf.Session() as sess:
        # The train directory is used as-is for both loading and saving.
        initialize_model(sess, qa, FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.train_dir)
        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
def main(_):
    """Build the encoder/matcher/decoder QA system, train and evaluate it.

    The positional argument is ignored. ``dataset`` maps ``"train"`` and
    ``"val"`` to whatever ``load_data`` returns for that mode. The session
    graph is also written to the ``qa-graph`` directory.
    """
    dataset = {"train": load_data(FLAGS.data_dir, mode="train"),
               "val": load_data(FLAGS.data_dir, mode="val")}

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    # TODO: expose perspective_dim / n_perspective_dim as flags.
    matcher = Matcher(perspective_dim=25, input_size=FLAGS.state_size)
    decoder = Decoder(output_size=FLAGS.output_size,
                      state_size=FLAGS.state_size,
                      n_perspective_dim=50 * 2)

    qa = QASystem(encoder, matcher, decoder,
                  vocab=vocab, vocab_dim=FLAGS.embedding_size,
                  rev_vocab=rev_vocab, embed_path=embed_path)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        # NOTE(review): this runs the global initializer *after*
        # initialize_model(); if initialize_model restores checkpoint weights
        # they would be overwritten here - confirm against its definition.
        tf.global_variables_initializer().run()

        graph_writer = tf.summary.FileWriter("qa-graph")
        graph_writer.add_graph(sess.graph)
        # Close the writer so the graph event is flushed to disk; it was
        # previously left open for the lifetime of the process.
        graph_writer.close()

        qa.train(sess, dataset, save_train_dir)
        qa.evaluate_answer(sess, dataset, 500, log=True)
def main(_):
    """Load data and pretrained embeddings, then train and evaluate.

    The positional argument is ignored. The embedding matrix read from the
    ``glove`` entry of the .npz file is handed to the encoder.
    """
    dataset = load_dataset(FLAGS.data_dir)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Close the archive after reading; np.load(...)['glove'] on its own keeps
    # the underlying file handle open.
    with np.load(embed_path) as archive:
        pretrained_embeddings = archive['glove']

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      pretrained_embeddings=pretrained_embeddings,
                      max_question_length=FLAGS.max_question_length,
                      max_context_length=FLAGS.max_context_length)
    decoder = Decoder(output_size=FLAGS.output_size,
                      size=FLAGS.state_size,
                      max_context_length=FLAGS.max_context_length)
    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Load embeddings and data, build the QA system, train and evaluate.

    The positional argument is ignored. ``FLAGS`` is passed to the encoder,
    decoder and QA system as their ``config``.
    """
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab_pair = initialize_vocab(vocab_path)
    vocab, rev_vocab = vocab_pair

    # Read the matrix as float32 while the archive is open.
    with np.load(embed_path) as archive:
        glove_embeddings = np.asfarray(archive["glove"], dtype=np.float32)

    dataset = load_and_preprocess_data()

    qa = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                config=FLAGS),
        Decoder(output_size=FLAGS.output_size, config=FLAGS),
        embeddings=glove_embeddings,
        config=FLAGS,
        vocab=(vocab, rev_vocab))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, checkpoint_dir)
        # No dataset is passed to evaluate_answer in this variant.
        qa.evaluate_answer(sess, FLAGS.evaluate, log=True)
def main(_):
    """Build the QA system and run training and evaluation.

    The positional argument is ignored. ``dataset`` is still ``None`` in this
    variant: nothing is loaded from ``FLAGS.data_dir`` yet.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize
    # the file paths saved in the checkpoint. This allows the model to be
    # reloaded even if the location of the checkpoint files has moved,
    # allowing usage with CodaLab. This must be done on both train.py and
    # qa_answer.py in order to work.
    #
    # lexists() rather than exists(): exists() follows the link and reports
    # False for a dangling symlink, which would leave the stale link in place
    # and make os.symlink() below fail because the name is already taken.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        qa.train(sess, dataset)
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Train on padded SQuAD data and report train and validation results.

    The positional argument is ignored. Settings come from a fresh
    ``Config()`` instance rather than from command-line flags.
    """
    config = Config()

    embed_path = config.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(config.embed_size))
    archive = np.load(embed_path)
    embeddings = archive['glove']
    archive.close()
    # The number of embedding rows is passed on to load_squad.
    vocab_len = embeddings.shape[0]

    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir,
                       max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir,
                     max_samples=config.max_val_samples)
    print('train size: ', len(train), ' val size: ', len(val))

    vocab_path = config.vocab_path
    if not vocab_path:
        vocab_path = pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # train is triplets of (context, question, answer)
    qa = QASystem(
        Encoder(state_size=config.hidden_size,
                embedding_size=config.embed_size),
        Decoder(state_size=config.hidden_size,
                embedding_size=config.embed_size))

    with tf.Session() as sess:
        # Put "" here if you want to build a new model.
        restore_dir = config.load_train_dir or config.train_dir
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = config.train_dir
        padded_train = qa.pad_sequences(train)
        # Unpacking kept from the original: a padded dataset that is not a
        # 3-element sequence still fails here, before training starts.
        _questions, _passages, _labels = padded_train
        qa.train(sess, padded_train, checkpoint_dir)

        padded_val = qa.pad_sequences(val)
        print('train error')
        qa.evaluate_answer(sess, padded_train, log=True)
        print('val error')
        qa.evaluate_answer(sess, padded_val, log=True)
def main(_):
    """Select a numbered configuration, then train and evaluate the model.

    The positional argument is ignored. The configuration number is read from
    ``sys.argv[1]`` and is also embedded in the log and flag-dump file names.
    """
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    # Each example has the form ((question, context), answer).
    dataset = load_data(FLAGS.data_dir)
    train_data = preprocess_dataset(dataset['train'], FLAGS.output_size,
                                    FLAGS.question_size)
    val_data = preprocess_dataset(dataset['val'], FLAGS.output_size,
                                  FLAGS.question_size)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    qa = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.output_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_name = "log" + '_config_' + str(FLAGS.config) + ".txt"
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, log_name)))

    print(vars(FLAGS))
    flags_name = "flags" + '_config_' + str(FLAGS.config) + ".json"
    with open(os.path.join(FLAGS.log_dir, flags_name), 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, checkpoint_dir)
        qa.evaluate_answer(sess, train_data, val_data, FLAGS.evaluate,
                           log=True)
def main(_):
    """Build the QA system with explicit keyword settings, train, evaluate.

    The positional argument is ignored. Every hyper-parameter given to
    ``QASystem`` is copied from ``FLAGS``; the dropout flag is converted to a
    keep probability (``1.0 - FLAGS.dropout``).
    """
    dataset = load_dataset()

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      embedding_size=FLAGS.embedding_size,
                      output_size=FLAGS.output_size)
    decoder = Decoder(state_size=FLAGS.state_size,
                      output_size=FLAGS.output_size)
    qa = QASystem(
        encoder,
        decoder,
        embed_path=embed_path,
        embedding_size=FLAGS.embedding_size,
        output_size=FLAGS.output_size,
        optimizer=FLAGS.optimizer,
        learning_rate=FLAGS.learning_rate,
        epochs=FLAGS.epochs,
        batch_size=FLAGS.batch_size,
        max_gradient_norm=FLAGS.max_gradient_norm,
        dropout_keep_prob=1.0 - FLAGS.dropout,
        train_dir=FLAGS.train_dir,
        state_size=FLAGS.state_size)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, checkpoint_dir)
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Evaluate several trained models and combine their predictions.

    Check Config.py to set up the model paths to be ensembled. Each model in
    ``cfg.model_pathes`` is loaded in a fresh graph and evaluated; its
    predicted start/end indices fill one column of the train/val arrays.
    After the last model, the columns are reduced with ``bin_count`` and the
    combined predictions are evaluated once more through ``sendin``.
    """
    data_dir = cfg.DATA_DIR
    dataset = mask_dataset(data_dir, cfg.set_names, cfg.suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab, rev_vocab = initialize_vocab(pjoin(data_dir, cfg.vocab_file))

    for directory in (cfg.log_dir, cfg.cache_dir, cfg.fig_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    log_file = pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt')
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)

    # One row per evaluated sample, one column per model.
    votes_shape = (cfg.num_eval, num_m)
    train_s = np.zeros(votes_shape, dtype=np.int32)
    train_e = np.zeros(votes_shape, dtype=np.int32)
    val_s = np.zeros(votes_shape, dtype=np.int32)
    val_e = np.zeros(votes_shape, dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    last_index = num_m - 1
    for i, model_path in enumerate(model_pathes):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)
            sess.run(tf.global_variables_initializer())

            initialize_model(sess, qa, get_normalized_train_dir(model_path))

            ts, te, vs, ve = qa.evaluate_answer(
                sess, dataset, raw_answers, rev_vocab,
                log=True, ensemble=True, training=True,
                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i != last_index:
                continue

            # Last model: collapse the per-model columns and score the result
            # using the session and model that are still open.
            train_s = bin_count(train_s)
            train_e = bin_count(train_e)
            val_s = bin_count(val_s)
            val_e = bin_count(val_e)
            qa.evaluate_answer(
                sess, dataset, raw_answers, rev_vocab,
                log=True, training=True,
                sendin=(train_s, train_e, val_s, val_e),
                sample=cfg.num_eval)
def main(_):
    """Pad the train/val data, build the QA system, and train by epoch.

    The positional argument is ignored. ``dataset`` maps ``'train'`` and
    ``'val'`` to the list ``[question data, question lengths, context data,
    context lengths, answer spans]``. Only the training split is checked for
    sequences longer than the padded width.
    """
    question_max_len = 40
    context_max_len = 600

    def _load_split(split, check_lengths):
        # Same call order as the inline version: questions, contexts, spans.
        q_data, q_seq_len = pad_sentences(
            pjoin(FLAGS.data_dir, split + ".ids.question"), question_max_len)
        if check_lengths:
            assert not any(
                q_seq_len > q_data.shape[1]
            ), 'Some questions have length greater than max question length'
        c_data, c_seq_len = pad_sentences(
            pjoin(FLAGS.data_dir, split + ".ids.context"), context_max_len)
        if check_lengths:
            assert not any(
                c_seq_len > c_data.shape[1]
            ), 'Some contexts have length greater than max context length'
        spans = get_answer_span(
            pjoin(FLAGS.data_dir, split + ".span"), context_max_len)
        return [q_data, q_seq_len, c_data, c_seq_len, spans]

    dataset = {}
    dataset['train'] = _load_split("train", True)
    dataset['val'] = _load_split("val", False)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)
    # try without dropout
    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer, FLAGS.max_gradient_norm)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)
        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)

        # NOTE(review): evaluation is kept inside the epoch loop (one
        # validation pass per epoch) - confirm against the original layout.
        for _epoch in range(FLAGS.epochs):
            qa.train(sess, dataset['train'], checkpoint_dir)
            qa.evaluate_answer(sess, dataset['val'])
def main(_):
    """Parse arguments, build the QA system, then train and evaluate it.

    The positional argument is ignored. With ``args.test`` set, the debugger
    is entered right after the configuration is updated and training is
    called with ``debug_num=100``.
    """
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    args = parse_arg()
    update_config(args, c_time)
    logging.info(cfg)
    if args.test:
        pdb.set_trace()

    data_dir = cfg.DATA_DIR
    dataset = mask_dataset(data_dir, cfg.set_names, cfg.suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab, rev_vocab = initialize_vocab(pjoin(data_dir, cfg.vocab_file))
    embed_name = "".join(["glove.trimmed.", str(cfg.embed_size), ".npz"])
    embed_path = pjoin(data_dir, embed_name)

    for directory in (cfg.log_dir, cfg.cache_dir, cfg.fig_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = pjoin(cfg.log_dir, 'log' + c_time + '.txt')
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    qa = QASystem(Encoder(size=2 * cfg.lstm_num_hidden),
                  Decoder(output_size=2 * cfg.lstm_num_hidden),
                  embed_path)

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        # Log the names of the trainable and the regularized variables.
        logging.info('=========== trainable varaibles ============')
        for variable in tf.trainable_variables():
            logging.info(variable.name)
        logging.info('=========== regularized varaibles ============')
        for loss in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(loss.name)

        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        # Both modes share the same call; test mode only adds debug_num.
        train_kwargs = {'raw_answers': raw_answers, 'rev_vocab': rev_vocab}
        if args.test:
            train_kwargs['debug_num'] = 100
        qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                 **train_kwargs)

        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True, training=True, sample=4000)
def main(_):
    """Generate dev-set answer predictions with a trained model.

    The positional argument is ignored. Flag values are first overridden from
    the JSON file at ``FLAGS.config_path``, except for ``eval_on_train``,
    ``dev_path`` and ``train_dir``. Then, depending on
    ``FLAGS.load_from_json``:

    * truthy: the dev data is prepared from ``FLAGS.dev_path`` and the answers
      are written to ``dev-prediction.json``;
    * falsy: the dev data is loaded from ``data/squad/qa_answer`` and the
      answers are written to ``dev-prediction-model.json``.

    Raises:
        ValueError: if ``FLAGS.model`` is neither ``'baseline'`` nor
            ``'matchLSTM'``.

    Note: this block is Python 2 code (``dict.iteritems``, ``unicode``).
    """
    config_fname = FLAGS.config_path
    assert os.path.exists(config_fname), "config file does not exist"
    logging.info("Loaded configs from: " + config_fname)
    with open(config_fname, "rb") as fp:
        json_flag = json.load(fp)

    print(vars(FLAGS))
    # These flags keep the values from the current invocation.
    protected_keys = ("eval_on_train", "dev_path", "train_dir")
    for key, value in json_flag.iteritems():
        if key in protected_keys:
            continue
        setattr(FLAGS, key, value)
    print(vars(FLAGS))

    assert os.path.exists(FLAGS.train_dir), "train dir does not exist"

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = load_glove_embeddings(embed_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    if FLAGS.model == 'baseline':
        qa = QASystem(encoder, FLAGS, embeddings, 1)
    elif FLAGS.model == 'matchLSTM':
        qa = QASystemMatchLSTM(FLAGS, embeddings, 1)
    else:
        # Previously `qa` stayed unbound and failed later with a NameError.
        raise ValueError("unknown model: %r" % (FLAGS.model,))

    print('\n\nrand_unknown is set to be ' + str(FLAGS.rand_unknown))

    if FLAGS.load_from_json:
        dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
        dev_filename = os.path.basename(FLAGS.dev_path)
        dataset = prepare_dev(dev_dirname, dev_filename, vocab)
        # The dev dataset carries no answers.
        (context_tokens_data, context_data, question_tokens_data,
         question_data, question_uuid_data) = dataset

        # Debug-log the first example only.
        for i in range(1):
            logging.debug('context')
            logging.debug(' '.join(context_tokens_data[i]))
            logging.debug('context_data')
            logging.debug(context_data[i])
            logging.debug('question')
            logging.debug(' '.join(question_tokens_data[i]))
            logging.debug('question_data')
            logging.debug(question_data[i])
            logging.debug('uuid_data')
            logging.debug(question_uuid_data[i])

        with tf.Session() as sess:
            train_dir = FLAGS.train_dir
            initialize_model(sess, qa, train_dir)

            print('About to start generate_answers')
            print(FLAGS.eval_on_train)
            answers = generate_answers(sess, qa, dataset)

            # write to json file to root dir
            with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers, ensure_ascii=False)))
    else:
        # load from files converted from json
        FLAGS.data_dir = os.path.join("data", "squad", "qa_answer")
        if (not os.path.isdir("/home/el")) or not os.listdir(FLAGS.data_dir):
            process_dev_json_to_files()
        else:
            # Two placeholders for two arguments; the original format string
            # had only one and raised TypeError when this branch ran.
            print('Data directory %s is not empty: %s'
                  % (FLAGS.data_dir, str(os.listdir(FLAGS.data_dir))))

        dataset, max_q_len, max_c_len = load_dataset(
            FLAGS.data_dir, FLAGS.data_size, FLAGS.max_question_length,
            FLAGS.max_context_length, ['dev'])
        dev_set = dataset['dev']
        print('Start running evaluate_answer on %d of data' % len(dev_set))

        dev_examples = qa.preprocess_question_answer(dev_set)
        dev_raw = dataset['dev_raw']
        dev_uuid = dataset['dev_uuid']
        dev_dataset = [dev_examples, dev_raw, dev_uuid]

        with tf.Session() as sess:
            train_dir = FLAGS.train_dir
            initialize_model(sess, qa, train_dir)
            answers_model = qa.evaluate_answer(session=sess,
                                               dataset=dev_dataset,
                                               sample=len(dev_set),
                                               return_answer_dict=True)

            # write to json file to root dir
            with io.open('dev-prediction-model.json', 'w',
                         encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers_model,
                                           ensure_ascii=False)))
def main(_):
    """Evaluate a restored model on the validation set at several sample rates.

    The ``train`` and ``val`` splits are read from ``FLAGS.data_dir``
    (``<split>.ids.context``, ``<split>.ids.question``, ``<split>.span``).
    Paragraphs and questions are passed through ``padList`` with
    ``FLAGS.paraLen`` / ``FLAGS.qLen``.  Examples whose answer start or end
    index is ``>= FLAGS.paraLen`` are dropped.

    The model is restored from ``FLAGS.load_train_dir`` (falling back to
    ``FLAGS.train_dir``) and, for each sample rate in ``[1, 2, 5, 10]``,
    evaluated ten times on the validation split; the F1 and EM lists are
    printed per rate.
    """

    def read_int_rows(path):
        # One list of ints per line; empty tokens are skipped.  The file is
        # closed as soon as it has been read.
        with open(path, "r") as id_file:
            return [[int(tok) for tok in row.strip().split(" ") if tok != '']
                    for row in id_file.read().strip().split("\n")]

    def load_split(split):
        # Returns (paras, questions, ans, paraLens, questionLens) for one
        # split, with out-of-range answers already filtered out.
        prefix = pjoin(FLAGS.data_dir, split)
        paras, para_lens = zip(*[
            padList(para, FLAGS.paraLen)
            for para in read_int_rows(prefix + ".ids.context")])
        questions, question_lens = zip(*[
            padList(question, FLAGS.qLen)
            for question in read_int_rows(prefix + ".ids.question")])
        spans = read_int_rows(prefix + ".span")

        # Keep only examples whose answer span lies inside the padded
        # paragraph.  Each split is checked against its OWN spans; the
        # original validation check mistakenly read the end index from the
        # training spans.
        keep = [i for i, span in enumerate(spans)
                if span[0] < FLAGS.paraLen and span[1] < FLAGS.paraLen]
        return (tuple(paras[i] for i in keep),
                tuple(questions[i] for i in keep),
                [spans[i] for i in keep],
                tuple(para_lens[i] for i in keep),
                tuple(question_lens[i] for i in keep))

    train_split = load_split("train")

    # valDataset has paragraphs and questions as padded id lists, answers as
    # the start and end indices in the paragraph, and the length of each
    # paragraph and question for the validation dataset.
    valDataset = load_split("val")

    # dataset is the same format as valDataset but for the training dataset.
    # It also has valDataset as the last element.
    # NOTE: nothing below reads `dataset`; only valDataset is evaluated.
    dataset = train_split + (valDataset,)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = CoattentionEncoder(size=FLAGS.state_size,
                                 vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    embeddings = np.load(embed_path)['glove']

    qa = QASystem(encoder, decoder, embeddings, vocab, rev_vocab)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir, saver)

        # The result is unused, but the call is kept in case
        # get_normalized_train_dir has side effects -- TODO confirm.
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        sample_rates = [1, 2, 5, 10]
        for rate in sample_rates:
            print("rate: ", rate)
            f1_l = []
            em_l = []
            for _run in range(10):
                f1, em = qa.evaluate_answer(
                    sess, valDataset, sample=len(valDataset[0]),
                    sample_rate=rate)
                f1_l.append(f1)
                em_l.append(em)
            print(rate)
            print(f1_l)
            print(em_l)
def main(_):
    """Ensemble the models listed in the config and evaluate the combination.

    Every path in ``cfg.model_pathes`` is restored into a fresh graph and
    evaluated with ``ensemble=True``; the per-model start/end predictions for
    the train and validation samples are stored column-wise.  After the last
    model the columns are reduced with ``bin_count`` and passed back to
    ``evaluate_answer`` through ``sendin``.
    """
    data_dir = cfg.DATA_DIR
    dataset = mask_dataset(data_dir, cfg.set_names, cfg.suffixes)
    raw_answers = read_raw_answers(data_dir)
    vocab, rev_vocab = initialize_vocab(pjoin(data_dir, cfg.vocab_file))

    # Make sure every output directory exists before anything is written.
    for out_dir in (cfg.log_dir, cfg.cache_dir, cfg.fig_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    log_path = pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt')
    logging.getLogger().addHandler(logging.FileHandler(log_path))

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)

    # One row per evaluated sample, one column per model.
    votes_shape = (cfg.num_eval, num_m)
    train_s = np.zeros(votes_shape, dtype=np.int32)
    train_e = np.zeros(votes_shape, dtype=np.int32)
    val_s = np.zeros(votes_shape, dtype=np.int32)
    val_e = np.zeros(votes_shape, dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    last_index = num_m - 1
    for i, model_path in enumerate(model_pathes):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)

            sess.run(tf.global_variables_initializer())
            initialize_model(sess, qa, get_normalized_train_dir(model_path))

            ts, te, vs, ve = qa.evaluate_answer(
                sess, dataset, raw_answers, rev_vocab,
                log=True, ensemble=True, training=True,
                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i != last_index:
                continue

            # All models have contributed: collapse the columns and score
            # the combined predictions inside the still-open session.
            # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
            train_s = bin_count(train_s)
            train_e = bin_count(train_e)
            val_s = bin_count(val_s)
            val_e = bin_count(val_e)
            qa.evaluate_answer(
                sess, dataset, raw_answers, rev_vocab,
                log=True, training=True,
                sendin=(train_s, train_e, val_s, val_e),
                sample=cfg.num_eval)
def main(_):
    """Train the QA model and report F1/EM after training.

    Sets up a timestamped log file under ``FLAGS.log_dir``, loads the
    ``train`` and ``val`` splits from ``FLAGS.data_dir``, pads them with
    ``preprocess_data``, builds the encoder/decoder/``QASystem`` on the GloVe
    matrix, restores any checkpoint from ``FLAGS.load_train_dir`` (falling
    back to ``FLAGS.train_dir``), trains, and finally logs F1 and EM.

    ``FLAGS.train_set_size`` limits the training set to its first N examples;
    ``-1`` means the full set.  ``FLAGS.sessname`` and ``FLAGS.num_iters`` are
    set as a side effect.
    """
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    FLAGS.sessname = "{:%Y%m%d_%H%M%S}".format(datetime.now())
    file_handler = logging.FileHandler(
        pjoin(FLAGS.log_dir, "log{}.txt".format(FLAGS.sessname)))
    logging.getLogger().addHandler(file_handler)

    # Do what you need to load datasets from FLAGS.data_dir
    train_p, raw_train_p, train_q, train_ans = \
        load_dataset("train", FLAGS.data_dir)
    val_p, raw_val_p, val_q, val_ans = \
        load_dataset("val", FLAGS.data_dir)

    # Fixed truncation lengths.  The data-derived maxima the original code
    # computed first were overwritten immediately and have been dropped.
    max_len_p = FLAGS.output_size
    max_len_q = 60

    train_padded_p, train_mask_p, train_padded_q, train_mask_q, train_ans = \
        preprocess_data((train_p, train_q, train_ans), "train",
                        max_len_p, max_len_q)
    val_padded_p, val_mask_p, val_padded_q, val_mask_q, val_ans = \
        preprocess_data((val_p, val_q, val_ans), "val",
                        max_len_p, max_len_q)

    # A train_set_size other than -1 keeps only the first t_len examples
    # (small subset to check overfitting); a limit of None keeps everything.
    t_len = FLAGS.train_set_size
    limit = None if t_len == -1 else t_len
    # list() keeps len() and repeated iteration working even where zip is
    # lazy.
    train_dataset = list(zip(train_padded_p[:limit], train_mask_p[:limit],
                             train_padded_q[:limit], train_mask_q[:limit],
                             train_ans[:limit]))
    FLAGS.num_iters = len(train_dataset)

    val_dataset = list(zip(val_padded_p, val_mask_p,
                           val_padded_q, val_mask_q, val_ans))
    raw_dataset = (raw_train_p, raw_val_p)
    dataset = (train_dataset, val_dataset, raw_dataset)

    padded_lengths = [len(row) for row in train_padded_p]
    logger.info("Sanity check on lengths: min %s, max %s"
                % (min(padded_lengths), max(padded_lengths)))

    logger.info("Loading glove embeddings...")
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Close the .npz archive once the matrix has been read out of it.
    with np.load(embed_path) as embeddings:
        glove = embeddings['glove']  # np array
    logger.info("glove dims {}".format(glove.shape))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      flags=FLAGS, max_len_p=max_len_p, max_len_q=max_len_q)
    decoder = Decoder(output_size=FLAGS.output_size, flags=FLAGS)

    qa = QASystem(encoder, decoder, glove, max_len_p, max_len_q, FLAGS)

    # create saver
    qa.saver = tf.train.Saver()

    logger.info("{}".format(vars(FLAGS)))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        # NOTE(review): the final score is computed on the training set, not
        # on val_dataset -- confirm this is intended.
        f1, em = qa.evaluate_answer(sess, train_dataset, log=True)
        logger.info("final evaluation: F1: {}, EM: {}".format(f1, em))
def main(_):
    """Score the restored model on the train set, then answer the dev set.

    Loads the vocabulary and GloVe matrix, converts the dev JSON at
    ``FLAGS.dev_path`` with ``prepare_dev``, and pads the training split with
    the lengths derived from the dev data.  The model is restored from
    ``FLAGS.train_dir`` twice, in separate sessions: first to log F1/EM on
    the training set, then to generate dev answers, which are written to
    ``dev-prediction.json`` in the working directory.
    """
    print("FLAGS:", vars(FLAGS))

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    # Close the .npz archive once the matrix has been read out of it.
    with np.load(embed_path) as embeddings:
        glove = embeddings['glove']  # np array

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, raw_context_data, question_data, question_uuid_data = \
        prepare_dev(dev_dirname, dev_filename, vocab)

    # Parse the space-separated id strings into real lists, so they can be
    # measured with len() and iterated more than once.
    p = [[int(tok) for tok in line.strip().split(' ')]
         for line in context_data]
    q = [[int(tok) for tok in line.strip().split(' ')]
         for line in question_data]
    raw_context_data = [line.strip().split(' ') for line in raw_context_data]

    # Paragraph length is capped at FLAGS.output_size; questions are not
    # capped.
    max_len_p = min(max(len(ids) for ids in p), FLAGS.output_size)
    max_len_q = max(len(ids) for ids in q)

    dataset = (p, raw_context_data, q, question_uuid_data)

    train_p, raw_train_p, train_q, train_ans = \
        load_dataset("train", FLAGS.data_dir)
    train_padded_p, train_mask_p, train_padded_q, train_mask_q, train_ans = \
        preprocess_data((train_p, train_q, train_ans), "train",
                        max_len_p, max_len_q)
    train_dataset = list(zip(train_padded_p, train_mask_p,
                             train_padded_q, train_mask_q, train_ans))

    # Reload flags
    print("loaded flags", vars(FLAGS))

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      flags=FLAGS, max_len_p=max_len_p, max_len_q=max_len_q)
    decoder = Decoder(output_size=FLAGS.output_size, flags=FLAGS)

    qa = QASystem(encoder, decoder, glove, max_len_p, max_len_q, FLAGS)

    # create saver
    qa.saver = tf.train.Saver()

    # First session: score the training set.
    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        qa.raw_train = raw_train_p
        f1, em = qa.evaluate_answer(sess, train_dataset)
        logging.info("train total f1 {}, em {}".format(f1, em))

    # Second session: generate the dev answers.
    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def main(_):
    """Train the QA system described by ``cfg`` and evaluate it afterwards.

    Loads the masked dataset, answers and vocabulary from ``cfg.DATA_DIR``,
    attaches a timestamped log file, builds the model on a fresh default
    graph, restores any checkpoint from ``cfg.train_dir``, trains starting
    from ``cfg.start_lr`` and finally evaluates on 4000 samples.
    """
    data_dir = cfg.DATA_DIR
    dataset = mask_dataset(data_dir, cfg.set_names, cfg.suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab, rev_vocab = initialize_vocab(pjoin(data_dir, cfg.vocab_file))

    embed_path = pjoin(data_dir,
                       "glove.trimmed." + str(cfg.embed_size) + ".npz")

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())

    # Make sure every output directory exists before anything is written.
    for out_dir in (cfg.log_dir, cfg.cache_dir, cfg.fig_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    log_path = pjoin(cfg.log_dir, 'log' + c_time + '.txt')
    logging.getLogger().addHandler(logging.FileHandler(log_path))

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    hidden_size = 2 * cfg.lstm_num_hidden
    encoder = Encoder(size=hidden_size)
    decoder = Decoder(output_size=hidden_size)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable varaibles ============')
        for variable in tf.trainable_variables():
            logging.info(variable.name)

        logging.info('=========== regularized varaibles ============')
        for reg_loss in tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(reg_loss.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                 raw_answers=raw_answers,
                 # debug_num=1000,
                 rev_vocab=rev_vocab)

        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True, training=True, sample=4000)
def main(_):
    """Train the QA system described by ``cfg`` and log how long it took.

    Loads the masked dataset, answers and vocabulary from ``cfg.DATA_DIR``,
    attaches a timestamped log file, builds the model, restores any
    checkpoint from ``cfg.train_dir``, runs training and logs the elapsed
    wall-clock time in hours.
    """
    data_dir = cfg.DATA_DIR
    hidden_size = 2 * cfg.lstm_num_hidden

    embed_path = pjoin(data_dir,
                       "glove.trimmed." + str(cfg.embed_size) + ".npz")
    vocab_path = pjoin(data_dir, cfg.vocab_file)

    dataset = mask_dataset(data_dir, set_names=cfg.set_names,
                           suffixes=cfg.suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab, rev_vocab = initialize_vocab(vocab_path)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())

    # Make sure every output directory exists before anything is written.
    for out_dir in (cfg.log_dir, cfg.cache_dir, cfg.fig_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    log_path = pjoin(cfg.log_dir, 'log' + c_time + '.txt')
    logging.getLogger().addHandler(logging.FileHandler(log_path))

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    encoder = Encoder(size=hidden_size)
    decoder = Decoder(output_size=hidden_size)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable varaibles ============')
        for variable in tf.trainable_variables():
            logging.info(variable.name)

        logging.info('=========== regularized varaibles ============')
        for reg_loss in tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(reg_loss.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        # Only the training call itself is timed.
        tic = time.time()
        qa.train(sess, dataset, answers, save_train_dir,
                 raw_answers, rev_vocab,
                 # debug_num=1000
                 )
        # qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab, log=True, training=True, sample=4000)
        toc = time.time()

        elapsed_hours = (toc - tic) / 3600.
        logging.info(
            "Total training process took {} hours".format(elapsed_hours))