def run_func():
    """Answer the questions in ``data/squad/fuse.json`` and dump them as JSON.

    Builds the QA system from ``Config``, lets it initialize itself from
    ``config.train_dir`` and writes the answers returned by
    ``generate_answers`` to ``temp/fuse-answer.json``.
    """
    config = Config()

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    dev_dirname, dev_filename = os.path.split(
        os.path.abspath("data/squad/fuse.json"))
    contexts, questions, question_uuids = prepare_dev(
        dev_dirname, dev_filename, vocab)

    # One dummy [0, 0] span per question fills the answers slot of the dataset.
    dummy_spans = [[0, 0] for _ in xrange(len(questions))]
    dataset = [questions, contexts, dummy_spans]

    embeddings = get_trimmed_glove_vectors(config.embed_path)

    hidden_size = config.hidden_state_size
    qa = QASystem(Encoder(hidden_size), Decoder(config.hidden_state_size),
                  embeddings, config)

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        predictions, _ = generate_answers(
            sess, qa, dataset, question_uuids, rev_vocab)

        # write the predictions as a single JSON document
        with io.open('temp/fuse-answer.json', 'w', encoding='utf-8') as out:
            payload = json.dumps(predictions, ensure_ascii=False)
            out.write(unicode(payload))
def run_func2(dataset, config):
    """Generate answers for ``dataset`` and write them to ``dev-prediction.txt``.

    Args:
        dataset: iterable of 3-item entries; each is unpacked as
            ``(question, context, answer)``.
        config: object providing ``vocab_path``, ``embed_path``,
            ``hidden_state_size`` and ``train_dir``.

    One answer is written per line, in the order of the input entries.
    """
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    # Transpose the per-example triples into three parallel tuples.
    questions, contexts, spans = zip(*[[q_, c_, a_] for (q_, c_, a_) in dataset])
    dataset = [questions, contexts, spans]

    embeddings = get_trimmed_glove_vectors(config.embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    # The position of each example doubles as its question id.
    question_uuid_data = list(xrange(len(spans)))

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, answers_canonical = generate_answers(
            sess, qa, dataset, question_uuid_data, rev_vocab)

        # write one decoded answer per line
        with io.open('dev-prediction.txt', 'w', encoding='utf-8') as out:
            for idx in question_uuid_data:
                decoded = unicode(answers[idx], "utf-8")
                out.write("%s\n" % (decoded))
def run_func():
    """Predict answers for ``download/squad/test.json`` and write ``submission.csv``.

    The CSV starts with an ``Id,Answer`` header, followed by one
    ``<uuid>,<normalized answer>`` row per entry of the mapping returned by
    ``generate_answers``.
    """
    config = Config()
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    dev_path = "download/squad/test.json"
    dev_dirname = os.path.dirname(os.path.abspath(dev_path))
    dev_filename = os.path.basename(dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    # One dummy [0, 0] span per question fills the answers slot of the dataset.
    answers = [[0, 0] for _ in xrange(len(question_data))]
    dataset = [question_data, context_data, answers]

    embeddings = get_trimmed_glove_vectors(config.embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, _ = generate_answers(sess, qa, dataset, question_uuid_data,
                                      rev_vocab)

    # Collect the rows and join once instead of growing a string with `+=`.
    rows = ["Id,Answer\n"]
    rows.extend(
        uuid + "," + normalize_answer(ans).replace(" s ", "s ") + "\n"
        for uuid, ans in answers.items())

    # Binary mode is kept from the original (Python 2 byte strings).
    with open('submission.csv', 'wb') as out_file:
        out_file.write("".join(rows))
def main(_):
    """Train the QA system on the 'train' split and evaluate it on 'val'.

    Also attaches a file handler writing to ``<log_dir>/log.txt`` and dumps
    the flags to ``<log_dir>/flags.json`` before training starts.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    datasetTrain = initialize_datasets(FLAGS.data_dir, 'train')
    datasetVal = initialize_datasets(FLAGS.data_dir, 'val')

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, embed_path, rev_vocab, FLAGS)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    # Record the flags used for this run.
    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, datasetTrain, save_train_dir)

        qa.evaluate_answer(sess, datasetVal, sample=1000, log=True)
def main(_):
    """Train the QA system from the id-encoded SQuAD files under ``./data/squad``.

    Training data is not loaded here: the ``batch_generator`` callable and
    its argument tuple ``info`` are handed to ``qa.train`` together with the
    validation data returned by ``load_and_pad_val_data``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    # load all in once, maybe better to try batch by batch
    question_path = "./data/squad/train.ids.question"
    context_path = "./data/squad/train.ids.context"
    answer_path = "./data/squad/train.span"
    val_q = "./data/squad/val.ids.question"
    val_c = "./data/squad/val.ids.context"
    val_a = "./data/squad/val.span"

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")

    # embeddings is a matrix of shape [vocab_size, embedding_size]
    # The archive is opened in a `with` block so its file handle is released
    # as soon as the array has been read.
    with np.load(embed_path) as embed_archive:
        embeddings = embed_archive['glove'].astype(np.float32)

    val_data = load_and_pad_val_data(val_q, val_c, val_a)

    # vocab is the mapping from word -> token id
    # rev_vocab is the reverse mapping, from id -> word
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # someone posted that the max length of question is 766
    # Argument tuple passed to qa.train alongside batch_generator.
    info = (question_path, context_path, answer_path, FLAGS.batch_size,
            FLAGS.max_length, FLAGS.output_size)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, batch_generator, info, save_train_dir, val_data,
                 rev_vocab)
def main(FLAGS):
    """Build the QA system, train it on 'train' and evaluate it on 'val'.

    Args:
        FLAGS: flag container supplying paths (``data_dir``, ``log_dir``,
            ``train_dir``, ...) and model sizes.

    Progress banners are printed to stdout; ``./data/weights/`` and the log
    directory are created if missing.
    """
    rule = 80 * "="
    print(rule)
    print("INITIALIZING")
    print(rule)

    # Do what you need to load datasets from FLAGS.data_dir
    if not os.path.exists('./data/weights/'):
        os.makedirs('./data/weights/')

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    print("Loading Embedding Matrix")
    # Close the .npz archive as soon as the array has been read.
    with np.load(embed_path) as embed_archive:
        embeddings = embed_archive['glove']

    # NOTE(review): the encoder size is taken from FLAGS.output_size here,
    # not from a state-size flag; confirm this is intended.
    encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        print("Building Network ... ")
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        print("Load Training Data")
        dataset = initialize_datasets(FLAGS.data_dir, dataset='train',
                                      debugMode=True)

        print(rule)
        print("Training")
        print(rule)
        qa.train(sess, dataset, save_train_dir)
        print("Finished Training")

        print("Load Validation Data")
        dataset = initialize_datasets(FLAGS.data_dir, dataset='val',
                                      debugMode=True)

        print(rule)
        print("Evaluation")
        print(rule)
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Generate answers for ``FLAGS.dev_path`` and write ``dev-prediction.json``.

    The configuration index is read from ``sys.argv[1]`` and applied through
    ``load_config`` before anything else runs.
    """
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    # NOTE(review): embed_path is computed but not used further down.
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.vocab_dim))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    # lexists (not exists): a dangling symlink left by an earlier run must be
    # removed too, otherwise os.symlink below fails because the name is taken.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = (context_data, question_data, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, train_dir)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def main(_):
    """Load the train/val data, build the QA system and train it.

    ``FLAGS.ifgpu`` is set to True before ``QASystem`` is constructed when
    ``device_lib.list_local_devices()`` reports at least one GPU.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {}
    num_train = load_data_dq(dataset, 'train', FLAGS.data_dir)
    num_val = load_data_dq(dataset, 'val', FLAGS.data_dir)
    load_data_sa(dataset, 'train', FLAGS.data_dir, num_train)
    load_data_sa(dataset, 'val', FLAGS.data_dir, num_val)
    trim(dataset['train'])
    trim(dataset['val'])

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Session moved upfront to set the ifgpu flag before QASystem.
    # Everything that uses `sess` stays inside this block: the context
    # manager closes the session on exit, so using it afterwards (or closing
    # it again by hand) would be an error.
    with tf.Session() as sess:
        # original reference tag: 38559755
        local_device_protos = device_lib.list_local_devices()
        if any(dev.device_type == 'GPU' for dev in local_device_protos):
            FLAGS.ifgpu = True

        qa = QASystem(encoder, decoder, embed_path, rev_vocab)

        if not os.path.exists(FLAGS.log_dir):
            os.makedirs(FLAGS.log_dir)
        file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
def main(_):
    """Run training on the small 'test' split (``small_data_test=True``).

    Questions are padded to 40 tokens and contexts to 600 tokens; both
    limits are also passed to ``QASystem``.
    """
    dataset = {}
    question_max_len = 40
    context_max_len = 600

    # Preprocess and collect small test data
    data_dir = FLAGS.data_dir

    test_q_data, test_q_seq_len = pad_sentences(
        pjoin(data_dir, "test.ids.question"), question_max_len)
    question_too_long = any(test_q_seq_len > test_q_data.shape[1])
    assert not question_too_long, \
        'Some questions have length greater than max question length'

    test_c_data, test_c_seq_len = pad_sentences(
        pjoin(FLAGS.data_dir, "test.ids.context"), context_max_len)
    context_too_long = any(test_c_seq_len > test_c_data.shape[1])
    assert not context_too_long, \
        'Some contexts have length greater than max context length'

    test_s_e_id = get_answer_span(
        pjoin(FLAGS.data_dir, "test.span"), context_max_len)

    dataset['test'] = [
        test_q_data,
        test_q_seq_len,
        test_c_data,
        test_c_seq_len,
        test_s_e_id,
    ]

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    # try without dropout
    qa = QASystem(
        encoder, decoder, question_max_len, context_max_len, embed_path,
        FLAGS.learning_rate, FLAGS.batch_size, FLAGS.dropout,
        FLAGS.optimizer, FLAGS.max_gradient_norm)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset['test'], save_train_dir, small_data_test=True)
def main(_):
    """Generate answers for ``FLAGS.dev_path`` and write ``dev-prediction.json``.

    ``FLAGS.embed_path`` is filled in with the default trimmed-GloVe path
    when it is empty, so later consumers of FLAGS see the resolved value.
    """
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    if not FLAGS.embed_path:
        FLAGS.embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    contexts, questions, question_uuids = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = {
        "val_context": contexts,
        "val_questions": questions,
        "val_question_uuids": question_uuids,
    }

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)
    qa = QASystem(encoder, decoder, FLAGS)

    with tf.Session() as sess:
        # The train dir is used as given, without normalization.
        train_dir = FLAGS.train_dir
        print("train_dir: ", train_dir)
        initialize_model(sess, qa, train_dir)

        print("Generating Answers")
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        print("Writing to json file")
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as out:
            serialized = json.dumps(answers, ensure_ascii=False)
            out.write(unicode(serialized))
def _read_int_rows(path):
    """Return one list of ints per line of the whitespace-separated file at ``path``."""
    with open(path) as f:
        # A list comprehension (not map) so each row is a real list on both
        # Python 2 and Python 3.
        return [[int(token) for token in line.split()] for line in f]


def main(_):
    """Load the id-encoded train/val splits, train the QA system and evaluate it.

    ``dataset`` maps ``'train'`` and ``'val'`` to a
    ``(contexts, questions, spans)`` tuple whose members are lists of int
    lists, read from ``<split>.ids.context``, ``<split>.ids.question`` and
    ``<split>.span`` under ``FLAGS.data_dir``.
    """
    # TODO
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = dict()
    for dataset_type in ['train', 'val']:
        data_context = _read_int_rows(
            os.path.join(FLAGS.data_dir, "%s.ids.context" % dataset_type))
        data_question = _read_int_rows(
            os.path.join(FLAGS.data_dir, "%s.ids.question" % dataset_type))
        data_span = _read_int_rows(
            os.path.join(FLAGS.data_dir, "%s.span" % dataset_type))
        dataset[dataset_type] = (data_context, data_question, data_span)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # NOTE(review): `config` is not defined in this function; it is expected
    # to exist at module level.
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      config=config)
    decoder = Decoder(output_size=FLAGS.output_size, config=config)
    qa = QASystem(encoder, decoder, config=config)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
def main(_):
    """Interactive console: read a context and a question, print the answer span.

    Loops forever; each round tokenizes both inputs with
    ``nltk.word_tokenize``, asks the model for start/end indices and prints
    the corresponding slice of the tokenized context.
    """
    data_dir = cfg.DATA_DIR
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, FLAGS.embed)

    star_rule = '*********************************************************************'
    dash_rule = '-------------------'
    equals_rule = '=========================================='

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        load_train_dir = get_normalized_train_dir(FLAGS.ckpt)
        initialize_model(sess, qa, load_train_dir)

        print(star_rule)
        print("Welcome! You can use this to explore the behavior of the model.")
        print(star_rule)

        while True:
            print(dash_rule)
            print('Input the context: ')
            print(dash_rule)
            sentence = raw_input()

            print(dash_rule)
            print('Input the question: ')
            print(dash_rule)
            query = raw_input()

            # Raw tokens are kept so the answer can be printed as text.
            raw_context = nltk.word_tokenize(sentence)
            context_ids = sentence_to_token_ids(
                sentence, vocab, tokenizer=nltk.word_tokenize)
            question_ids = sentence_to_token_ids(
                query, vocab, tokenizer=nltk.word_tokenize)

            context_in = mask_input(context_ids, cfg.context_max_len)
            question_in = mask_input(question_ids, cfg.question_max_len)

            # A batch of one is submitted; index 0 selects its result.
            start, end = qa.answer(sess, [context_in], [question_in])
            answer_tokens = raw_context[start[0]:end[0] + 1]
            answer = ' '.join(answer_tokens)

            print(equals_rule)
            print('ANSWER: {}'.format(answer))
            print(equals_rule)
def main(_):
    """Load the tokenized train/dev files, build the QA system and train it.

    ``FLAGS.embed_path`` is filled in with ``data/sgns.merge.word.npz`` when
    it is empty, so the model sees the resolved value through FLAGS.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    data_dir = FLAGS.data_dir

    train_files = (
        pjoin(data_dir, "data_train/train_tokenH.txt"),  # questions
        pjoin(data_dir, "data_train/train_tokenP.txt"),  # paragraphs
        pjoin(data_dir, "data_train/train_index.txt"),   # answers
    )
    val_files = (
        pjoin(data_dir, "data_dev/dev_tokenH.txt"),
        pjoin(data_dir, "data_dev/dev_tokenP.txt"),
        pjoin(data_dir, "data_dev/dev_index.txt"),
    )

    dataset = load_dataset(train_files[0], train_files[1], train_files[2],
                           FLAGS.batch_size)
    val_dataset = load_dataset(val_files[0], val_files[1], val_files[2],
                               FLAGS.batch_size)

    # loads embedding
    if not FLAGS.embed_path:
        FLAGS.embed_path = pjoin("data", "sgns.merge.word.npz")
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.txt")
    # one is list and one is dict
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(size=FLAGS.state_size, output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS)

    # log file
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # start training
    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, val_dataset, save_train_dir, rev_vocab)
def main(_):
    """Train the QA system from the id-encoded training files, then evaluate it.

    The dataset is passed around as a tuple of three file paths (context,
    question, span); the embedding matrix is loaded here and handed to
    ``qa.train``.
    """
    # TODO maybe pass as loaded dataset abstraction instead of
    # file_paths?
    default_hparams = create_hparams(FLAGS)

    context_file_path = pjoin(FLAGS.data_dir, 'train.ids.context')
    question_file_path = pjoin(FLAGS.data_dir, 'train.ids.question')
    span_file_path = pjoin(FLAGS.data_dir, 'train.span')
    dataset = (context_file_path, question_file_path, span_file_path)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    default_hparams.add_hparam('vocab_size', len(vocab))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)
    qa = QASystem(encoder, decoder, default_hparams)

    # Setup embeddings
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    # Close the .npz archive as soon as the array has been read.
    with np.load(embed_path) as embed_archive:
        np_embeddings = np.float32(embed_archive['glove'])

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir, np_embeddings)

        # NOTE(review): evaluation receives the training file paths; confirm
        # this is intended.
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def run_func():
    """Interactive question answering loop over whitespace-tokenized input.

    Prompts for a question and a context until either input is ``exit``,
    maps the words to vocabulary ids and prints the context words between
    the predicted start and end indices.
    """
    config = Config()

    # NOTE(review): train/dev are loaded but unused while the training calls
    # below stay commented out.
    train = squad_dataset(config.question_train, config.context_train,
                          config.answer_train)
    dev = squad_dataset(config.question_dev, config.context_dev,
                        config.answer_dev)

    embed_path = config.embed_path
    vocab_path = config.vocab_path
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_size)
    decoder = Decoder(config.hidden_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    # Id substituted for words missing from the vocabulary
    # (presumably the UNK token; confirm against the vocabulary file).
    unknown_id = 2

    with tf.Session() as sess:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        qa.initialize_model(sess, config.train_dir)

        # train process
        # qa.train(sess, [train, dev], config.train_dir)
        # em = qa.evaluate_model(sess, dev)

        # run process
        while True:
            question = input('please input question: ')
            if question == 'exit':
                break
            raw_context = input('please input context: ')
            if raw_context == 'exit':
                break

            context_words = raw_context.split()
            # dict.get does one hash lookup per word instead of building and
            # scanning vocab.keys().
            question_ids = [vocab.get(word, unknown_id)
                            for word in question.split()]
            context_ids = [vocab.get(word, unknown_id)
                           for word in context_words]

            # Batch of one example; [1, 2] is a dummy answer span.
            test = [[question_ids], [context_ids], [[1, 2]]]
            a_s, a_e = qa.answer(sess, test)

            if a_e == a_s:
                print("answer: ", context_words[a_s[0]])
            else:
                print("answer: ", ' '.join(context_words[a_s[0]:a_e[0] + 1]))
def main(_):
    """Train on the combined train+val data, then evaluate on the val split.

    Checkpoints are saved under a fresh timestamped directory,
    ``results/<YYYYmmdd_HHMMSS>/model.weights/``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    datasetTrain = initialize_datasets(FLAGS.data_dir, 'train.',
                                       debugMode=False)
    datasetVal = initialize_datasets(FLAGS.data_dir, 'val.', debugMode=False)
    # The validation examples are appended to the training set.
    datasetTrain.extend(datasetVal)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # This is taking a long time, so it is timed.
    setup_started = datetime.now()
    qa = QASystem(encoder, decoder, embed_path, FLAGS, rev_vocab)
    print('Time to setup the model: ', datetime.now() - setup_started)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        # Get directory to save model
        results_path = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
        save_train_dir = results_path + "model.weights/"
        if not os.path.exists(save_train_dir):
            os.makedirs(save_train_dir)

        qa.train(sess, datasetTrain, save_train_dir)

        qa.evaluate_answer(sess, datasetVal, rev_vocab, sample=1000, log=True)
def main(_):
    """Build the QA system from keyword settings taken from FLAGS and train it."""
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_dataset()

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      embedding_size=FLAGS.embedding_size,
                      output_size=FLAGS.output_size)
    decoder = Decoder(state_size=FLAGS.state_size,
                      output_size=FLAGS.output_size)

    # Settings are passed straight through as keyword arguments.
    qa = QASystem(
        encoder,
        decoder,
        embed_path=embed_path,
        embedding_size=FLAGS.embedding_size,
        output_size=FLAGS.output_size,
        optimizer=FLAGS.optimizer,
        learning_rate=FLAGS.learning_rate,
        epochs=FLAGS.epochs,
        batch_size=FLAGS.batch_size,
        max_gradient_norm=FLAGS.max_gradient_norm,
        # FLAGS.dropout is a drop rate; the model takes a keep probability.
        dropout_keep_prob=1.0 - FLAGS.dropout,
        train_dir=FLAGS.train_dir,
        state_size=FLAGS.state_size,
    )

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
def main(_):
    """Train and evaluate the encoder / matcher / decoder QA system.

    The graph is also written to the ``qa-graph`` summary directory before
    training starts.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {
        "train": load_data(FLAGS.data_dir, mode="train"),
        "val": load_data(FLAGS.data_dir, mode="val"),
    }

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    matcher = Matcher(perspective_dim=25,
                      input_size=FLAGS.state_size)  # add flag
    decoder = Decoder(output_size=FLAGS.output_size,
                      state_size=FLAGS.state_size,
                      n_perspective_dim=50 * 2)  # add flag

    qa = QASystem(encoder, matcher, decoder,
                  vocab=vocab, vocab_dim=FLAGS.embedding_size,
                  rev_vocab=rev_vocab, embed_path=embed_path)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        # Run the initializer BEFORE initialize_model. Running it afterwards
        # resets every variable to its initial value and would discard
        # whatever initialize_model loaded from load_train_dir.
        # NOTE(review): assumes initialize_model restores checkpoint values
        # when a checkpoint exists; confirm against its definition.
        tf.global_variables_initializer().run()

        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        graph_writer = tf.summary.FileWriter("qa-graph")
        graph_writer.add_graph(sess.graph)

        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, 500, log=True)
def main(_):
    """Load train/validation data, build the QA system and train it.

    Random seeds are fixed (TensorFlow 42, NumPy 43). When the
    ``CS224N_CRIS`` environment variable is set, the session is created with
    ``intra_op_parallelism_threads=1``.
    """
    logging.info("Loading training data")
    dataset_train = load_train_data(FLAGS.data_dir, isValidation=False)
    logging.info("Loading validation data")
    dataset_val = load_train_data(FLAGS.data_dir, isValidation=True)

    logging.info("Building Model Graph")
    tf.set_random_seed(42)
    np.random.seed(43)
    select_test(0)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = SimpleLinearDecoder()  # AnswerPointerDecoder()
    qa = QASystem(encoder, decoder, len(dataset_train[0]))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    logging.info(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    cris_flag = os.environ.get('CS224N_CRIS')
    if cris_flag is not None:
        logging.info('hi cris')
        sess = tf.Session(
            config=tf.ConfigProto(intra_op_parallelism_threads=1))
    else:
        sess = tf.Session()

    # try/finally releases the session even when model loading or training
    # raises.
    try:
        with sess.as_default():
            load_train_dir = get_normalized_train_dir(
                FLAGS.load_train_dir or FLAGS.train_dir)
            initialize_model(sess, qa, load_train_dir)

            save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
            qa.train(sess, dataset_train, dataset_val, save_train_dir)
    finally:
        sess.close()
def main(_):
    """Train and evaluate the QA system with embeddings given to the encoder.

    The pretrained embedding matrix is read from the ``glove`` entry of the
    ``.npz`` file at ``embed_path`` and passed to ``Encoder``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_dataset(FLAGS.data_dir)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Close the .npz archive as soon as the array has been read.
    with np.load(embed_path) as embed_archive:
        pretrained_embeddings = embed_archive['glove']

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      pretrained_embeddings=pretrained_embeddings,
                      max_question_length=FLAGS.max_question_length,
                      max_context_length=FLAGS.max_context_length)
    decoder = Decoder(output_size=FLAGS.output_size,
                      size=FLAGS.state_size,
                      max_context_length=FLAGS.max_context_length)
    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Train the GRU-encoder / LSTM-decoder QA system on the train and val splits.

    ``dataset`` is a two-element list, ``[train, val]``, each loaded from its
    question-id, context-id and span files.
    """
    dataset = [
        load_dataset([
            "embedding/train.ids.question",
            "embedding/train.ids.context",
            "data/train.span",
        ]),
        load_dataset([
            "embedding/val.ids.question",
            "embedding/val.ids.context",
            "data/val.span",
        ]),
    ]

    embed_path = pjoin("embedding",
                       "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    tf.reset_default_graph()

    encoder = Encoder(FLAGS.num_hidden_unit, tf.contrib.rnn.GRUCell)
    decoder = Decoder(FLAGS.num_hidden_unit, tf.contrib.rnn.BasicLSTMCell)

    # Close the .npz archive as soon as the array has been read.
    with np.load(embed_path) as embed_archive:
        embedding = embed_archive["glove"]

    qa = QASystem(encoder, decoder, embedding, FLAGS.keep_prob)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.epochs, FLAGS.batch_size,
                 FLAGS.train_dir)
def main(_):
    """Load GloVe vectors and the dataset, then train and evaluate the model.

    The positional argument is ignored. Writes ``log.txt`` and
    ``flags.json`` into ``FLAGS.log_dir``.
    """
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Read the 'glove' array as float32 and release the archive right away.
    with np.load(embed_path) as archive:
        glove_embeddings = np.asfarray(archive["glove"], dtype=np.float32)

    dataset = load_and_preprocess_data()

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      config=FLAGS)
    decoder = Decoder(output_size=FLAGS.output_size, config=FLAGS)
    model = QASystem(encoder,
                     decoder,
                     embeddings=glove_embeddings,
                     config=FLAGS,
                     vocab=(vocab, rev_vocab))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as session:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(session, model, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        model.train(session, dataset, checkpoint_dir)

        model.evaluate_answer(session, FLAGS.evaluate, log=True)
def main(_):
    """Build the QA system, link the train dir to a fixed path, train, evaluate.

    The positional argument is ignored. Side effects: (re)creates the
    symlink ``/tmp/cs224n-squad-train`` pointing at ``FLAGS.train_dir`` and
    writes ``log.txt`` / ``flags.json`` into ``FLAGS.log_dir``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    # NOTE(review): no dataset is loaded here, so qa.train and
    # qa.evaluate_answer receive None - confirm this is still intended.
    dataset = None

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize
    # the file paths saved in the checkpoint. This allows the model to be
    # reloaded even if the location of the checkpoint files has moved,
    # allowing usage with CodaLab. This must be done on both train.py and
    # qa_answer.py in order to work.
    #
    # lexists (not exists) is required: exists() follows the link and reports
    # False for a dangling symlink, which would leave the stale link in place
    # and make os.symlink below fail because the name is already taken.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        qa.train(sess, dataset)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def run_func():
    """Train the QA system on the train/dev splits described by Config."""
    cfg = Config()

    train_set = squad_dataset(cfg.question_train,
                              cfg.context_train,
                              cfg.answer_train)
    dev_set = squad_dataset(cfg.question_dev,
                            cfg.context_dev,
                            cfg.answer_dev)

    glove_path = cfg.embed_path
    vocabulary_path = cfg.vocab_path
    # The vocabulary itself is not used below; the call is kept as in the
    # original flow.
    _vocab, _rev_vocab = initialize_vocab(vocabulary_path)

    glove_vectors = get_trimmed_glove_vectors(glove_path)

    model = QASystem(Encoder(cfg.hidden_state_size),
                     Decoder(cfg.hidden_state_size),
                     glove_vectors,
                     cfg)

    with tf.Session() as session:
        # Load a pretrained model if one exists, otherwise create a new one.
        model.initialize_model(session, cfg.train_dir)
        model.train(session, [train_set, dev_set], cfg.train_dir)
def main(_):
    """Load the dataset, build the QA system from FLAGS and train it.

    The positional argument is ignored. ``FLAGS.embed_path`` is overwritten
    with the resolved embedding path so later readers of FLAGS see it.
    """
    dataset = get_dataset(FLAGS.data_dir,
                          FLAGS.max_question_size,
                          FLAGS.max_paragraph_size)

    # Resolve the embedding path once and store it back on FLAGS.
    FLAGS.embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)
    model = QASystem(encoder, decoder, FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_path = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_path))

    print(vars(FLAGS))
    flags_path = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_path, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as session:
        # The directories are used as given, without going through
        # get_normalized_train_dir; the original author noted this should be
        # changed back for the final submission.
        load_train_dir = FLAGS.load_train_dir or FLAGS.train_dir
        print("load_train_dir: ", load_train_dir)
        initialize_model(session, model, load_train_dir)

        save_train_dir = FLAGS.train_dir
        print("save_train_dir: ", save_train_dir)
        model.train(session, dataset, save_train_dir, rev_vocab)
def main(_):
    """Train and evaluate the QA system for the config index in sys.argv[1].

    The positional argument is ignored. The integer config index is stored
    in ``FLAGS.config`` and is embedded in the names of the log and flags
    files written to ``FLAGS.log_dir``.
    """
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    # Each split is preprocessed with the same output/question sizes.
    raw_data = load_data(FLAGS.data_dir)  # ((question, context), answer)
    train_data = preprocess_dataset(raw_data['train'],
                                    FLAGS.output_size,
                                    FLAGS.question_size)
    val_data = preprocess_dataset(raw_data['val'],
                                  FLAGS.output_size,
                                  FLAGS.question_size)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    model = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.output_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)

    # Shared suffix that ties the log and flags files to this config.
    config_tag = '_config_' + str(FLAGS.config)

    log_name = "log" + config_tag + ".txt"
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, log_name))
    logging.getLogger().addHandler(log_handler)

    print(vars(FLAGS))
    flags_name = "flags" + config_tag + ".json"
    with open(os.path.join(FLAGS.log_dir, flags_name), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as session:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(session, model, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        model.train(session, train_data, val_data, checkpoint_dir)

        model.evaluate_answer(session, train_data, val_data, FLAGS.evaluate,
                              log=True)
def main(_):
    """Train the QA system, handing it the six train/val file paths.

    The positional argument is ignored. ``dataset`` is a list of paths
    (question ids, context ids, span for train, then the same for val), all
    located under ``FLAGS.data_dir``.
    """
    data_files = (
        "train.ids.question",
        "train.ids.context",
        "train.span",
        "val.ids.question",
        "val.ids.context",
        "val.span",
    )
    dataset = [pjoin(FLAGS.data_dir, file_name) for file_name in data_files]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    model = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.paragraph_output_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as session:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(session, model, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        model.train(session, dataset, checkpoint_dir)
def main(_):
    """Train the QA system on padded SQuAD samples and report both errors.

    The positional argument is ignored. All settings come from ``Config()``.
    After training, the model is evaluated on the padded training set and
    then on the padded validation set.
    """
    config = Config()

    embed_path = config.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(config.embed_size))
    # The context manager closes the archive even if the 'glove' lookup
    # raises.
    with np.load(embed_path) as npz:
        embeddings = npz['glove']
    vocab_len = embeddings.shape[0]

    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir,
                       max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir,
                     max_samples=config.max_val_samples)
    print('train size: ', len(train), ' val size: ', len(val))

    vocab_path = config.vocab_path or pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Per the original author's note, train is triplets of
    # (context, question, answer).
    encoder = Encoder(state_size=config.hidden_size,
                      embedding_size=config.embed_size)
    decoder = Decoder(state_size=config.hidden_size,
                      embedding_size=config.embed_size)
    qa = QASystem(encoder, decoder)

    with tf.Session() as sess:
        # put "" here if you want to build a new model
        load_train_dir = config.load_train_dir or config.train_dir
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = config.train_dir

        ds_train = qa.pad_sequences(train)
        qa.train(sess, ds_train, save_train_dir)

        ds_val = qa.pad_sequences(val)
        print('train error')
        qa.evaluate_answer(sess, ds_train, log=True)
        print('val error')
        qa.evaluate_answer(sess, ds_val, log=True)
def main(_):
    """Pad the train/val id files, build the QA system and train it.

    The positional argument is ignored. Questions are padded to 40 and
    contexts to 600 (the two local max-length constants). Writes ``log.txt``
    and ``flags.json`` into ``FLAGS.log_dir``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {}
    question_max_len = 40
    context_max_len = 600

    # Preprocess and collect train data
    train_q_path = pjoin(FLAGS.data_dir, "train.ids.question")
    train_q_data, train_q_seq_len = pad_sentences(train_q_path, question_max_len)
    # Sanity check: no recorded question length may exceed the padded width.
    # presumably train_q_seq_len is a NumPy array so ``>`` is elementwise -
    # TODO confirm against pad_sentences.
    assert not any(
        train_q_seq_len > train_q_data.shape[1]
    ), 'Some questions have length greater than max question length'
    train_c_path = pjoin(FLAGS.data_dir, "train.ids.context")
    train_c_data, train_c_seq_len = pad_sentences(train_c_path, context_max_len)
    # Same sanity check for contexts.
    assert not any(
        train_c_seq_len > train_c_data.shape[1]
    ), 'Some contexts have length greater than max context length'
    train_s_path = pjoin(FLAGS.data_dir, "train.span")
    train_s_e_id = get_answer_span(train_s_path, context_max_len)
    dataset['train'] = [
        train_q_data, train_q_seq_len, train_c_data, train_c_seq_len, train_s_e_id
    ]

    # Preprocess and collect validation data
    # (same steps as for train, but without the length assertions)
    val_q_path = pjoin(FLAGS.data_dir, "val.ids.question")
    val_q_data, val_q_seq_len = pad_sentences(val_q_path, question_max_len)
    val_c_path = pjoin(FLAGS.data_dir, "val.ids.context")
    val_c_data, val_c_seq_len = pad_sentences(val_c_path, context_max_len)
    val_s_path = pjoin(FLAGS.data_dir, "val.span")
    val_s_e_id = get_answer_span(val_s_path, context_max_len)
    dataset['val'] = [
        val_q_data, val_q_seq_len, val_c_data, val_c_seq_len, val_s_e_id
    ]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)
    # The embedding file path (not a loaded array) is handed to QASystem.
    qa = QASystem(encoder, decoder, question_max_len, context_max_len, embed_path, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.dropout, FLAGS.optimizer, FLAGS.max_gradient_norm)  # try without dropout

    # Mirror log records to a file inside the log directory.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run next to the log.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        # Restore from load_train_dir when given, otherwise from train_dir.
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        # NOTE(review): the original indentation was lost; evaluation is
        # assumed to run once per epoch, inside this loop - confirm.
        for i in range(FLAGS.epochs):
            qa.train(sess, dataset['train'], save_train_dir)
            # print('Finish training epoch {}'.format(i))

            qa.evaluate_answer(sess, dataset['val'])  # vocab, FLAGS.evaluate
def main(_):
    """Restore a trained QA model and write its dev-set predictions to JSON.

    The positional argument is ignored. Flag values are first overridden
    from the JSON file at ``FLAGS.config_path`` (except ``eval_on_train``,
    ``dev_path`` and ``train_dir``). Then, depending on
    ``FLAGS.load_from_json``:

    * true: the dev JSON at ``FLAGS.dev_path`` is prepared directly and the
      answers are written to ``dev-prediction.json``;
    * false: the data is loaded from ``data/squad/qa_answer`` and the
      answers are written to ``dev-prediction-model.json``.

    Raises:
        AssertionError: if the config file or ``FLAGS.train_dir`` is missing.
        ValueError: if ``FLAGS.model`` is neither ``'baseline'`` nor
            ``'matchLSTM'``.
    """
    config_fname = FLAGS.config_path
    assert os.path.exists(config_fname), "config file does not exist"
    logging.info("Loaded configs from: " + config_fname)
    with open(config_fname, "rb") as fp:
        json_flag = json.load(fp)

    print(vars(FLAGS))
    # These flags keep their command-line values; everything else in the
    # config file overrides the current flag value.
    protected_keys = ("eval_on_train", "dev_path", "train_dir")
    for key, value in json_flag.iteritems():
        if key in protected_keys:
            continue
        setattr(FLAGS, key, value)
    print(vars(FLAGS))
    assert os.path.exists(FLAGS.train_dir), "train dir does not exist"

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = load_glove_embeddings(embed_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    if FLAGS.model == 'baseline':
        qa = QASystem(encoder, FLAGS, embeddings, 1)
    elif FLAGS.model == 'matchLSTM':
        qa = QASystemMatchLSTM(FLAGS, embeddings, 1)
    else:
        # Fail here with a clear message instead of hitting a NameError on
        # ``qa`` further down.
        raise ValueError("unsupported model: %r" % (FLAGS.model,))

    print('\n\nrand_unknown is set to be ' + str(FLAGS.rand_unknown))

    if FLAGS.load_from_json:
        dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
        dev_filename = os.path.basename(FLAGS.dev_path)
        dataset = prepare_dev(dev_dirname, dev_filename, vocab)
        (context_tokens_data, context_data, question_tokens_data,
         question_data, question_uuid_data) = dataset

        # Log the first example of every field for debugging.
        logging.debug('context')
        logging.debug(' '.join(context_tokens_data[0]))
        logging.debug('context_data')
        logging.debug(context_data[0])
        logging.debug('question')
        logging.debug(' '.join(question_tokens_data[0]))
        logging.debug('question_data')
        logging.debug(question_data[0])
        logging.debug('uuid_data')
        logging.debug(question_uuid_data[0])

        with tf.Session() as sess:
            train_dir = FLAGS.train_dir
            initialize_model(sess, qa, train_dir)
            print('About to start generate_answers')
            print(FLAGS.eval_on_train)
            answers = generate_answers(sess, qa, dataset)

            # write to json file to root dir
            with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers, ensure_ascii=False)))
    else:
        # load from files converted from json
        FLAGS.data_dir = os.path.join("data", "squad", "qa_answer")
        # NOTE(review): the hard-coded "/home/el" check looks
        # machine-specific - confirm it is still wanted.
        if (not os.path.isdir("/home/el")) or not os.listdir(FLAGS.data_dir):
            process_dev_json_to_files()
        else:
            # Two placeholders for two arguments; the previous single-%s
            # format string raised TypeError whenever this branch ran.
            print('Data directory %s is not empty: %s'
                  % (FLAGS.data_dir, str(os.listdir(FLAGS.data_dir))))

        dataset, max_q_len, max_c_len = load_dataset(
            FLAGS.data_dir, FLAGS.data_size, FLAGS.max_question_length,
            FLAGS.max_context_length, ['dev'])
        dev_set = dataset['dev']
        print('Start running evaluate_answer on %d of data' % len(dev_set))

        dev_examples = qa.preprocess_question_answer(dev_set)
        dev_raw = dataset['dev_raw']
        dev_uuid = dataset['dev_uuid']
        dev_dataset = [dev_examples, dev_raw, dev_uuid]

        with tf.Session() as sess:
            train_dir = FLAGS.train_dir
            initialize_model(sess, qa, train_dir)
            answers_model = qa.evaluate_answer(session=sess,
                                               dataset=dev_dataset,
                                               sample=len(dev_set),
                                               return_answer_dict=True)

            # write to json file to root dir
            with io.open('dev-prediction-model.json', 'w',
                         encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers_model,
                                           ensure_ascii=False)))