def run_func():
    config = Config()

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    dev_path = "data/squad/fuse.json"
    dev_dirname = os.path.dirname(os.path.abspath(dev_path))
    dev_filename = os.path.basename(dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    ques_len = len(question_data)
    answers = [[0, 0] for _ in xrange(ques_len)]
    dataset = [question_data, context_data, answers]

    embed_path = config.embed_path
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, _ = generate_answers(sess, qa, dataset, question_uuid_data,
                                      rev_vocab)

        # write the answers to a json file under temp/
        with io.open('temp/fuse-answer.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    train_data, val_data = load_preprocess_data(FLAGS.data_dir,
                                                FLAGS.max_context_len,
                                                FLAGS.max_question_len)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embeddings = tf.constant(load_embeddings(embed_path), tf.float32)

    encoder = Encoder(FLAGS.state_size, FLAGS.summary_flag,
                      FLAGS.max_context_len, FLAGS.max_question_len)
    decoder = Decoder(FLAGS.state_size, FLAGS.summary_flag)

    qa = QASystem(encoder, decoder, FLAGS, embeddings, rev_vocab)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, save_train_dir)
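# Note: initialize_model() is called throughout these scripts but not defined
# here. Below is a minimal sketch of the usual restore-or-initialize pattern,
# assuming QASystem exposes a tf.train.Saver as `model.saver`; the actual
# helper in each project may differ.
def initialize_model(session, model, train_dir):
    # Restore the latest checkpoint from train_dir if one exists,
    # otherwise start from freshly initialized parameters.
    ckpt = tf.train.get_checkpoint_state(train_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        logging.info("Reading model parameters from %s",
                     ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        logging.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model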
def run_func2(dataset, config):
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    q, c, a = zip(*[[_q, _c, _a] for (_q, _c, _a) in dataset])
    dataset = [q, c, a]

    embed_path = config.embed_path
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    question_uuid_data = [i for i in xrange(len(a))]

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, answers_canonical = generate_answers(sess, qa, dataset,
                                                      question_uuid_data,
                                                      rev_vocab)

        # write predictions to a text file, one answer per line
        with io.open('dev-prediction.txt', 'w', encoding='utf-8') as f:
            for i in xrange(len(a)):
                curr_ans = unicode(answers[i], "utf-8")
                f.write("%s\n" % curr_ans)
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data, context_text = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = (context_data, question_data, question_uuid_data)

    idx_word = data_utils.invert_map(vocab)

    qa = QASystem(FLAGS, embed_path, idx_word, False, 0, True)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        start, end = qa.test(sess, dataset)
        answers = generate_answers(sess, qa, dataset, rev_vocab, context_text,
                                   idx_word)

        # write to json file to root dir
        with io.open('dev-prediction_2.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def run_func():
    config = Config()
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    dev_path = "download/squad/test.json"
    dev_dirname = os.path.dirname(os.path.abspath(dev_path))
    dev_filename = os.path.basename(dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    ques_len = len(question_data)
    answers = [[0, 0] for _ in xrange(ques_len)]
    dataset = [question_data, context_data, answers]

    embed_path = config.embed_path
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    data = "Id,Answer\n"
    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, _ = generate_answers(sess, qa, dataset, question_uuid_data,
                                      rev_vocab)
        for a in answers:
            ans = answers[a]
            data += a + "," + normalize_answer(ans).replace(" s ", "s ") + "\n"

    with open('submission.csv', 'wb') as f:
        f.write(data)
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    # load all in once, maybe better to try batch by batch
    question_path = "./data/squad/train.ids.question"
    context_path = "./data/squad/train.ids.context"
    answer_path = "./data/squad/train.span"
    val_q = "./data/squad/val.ids.question"
    val_c = "./data/squad/val.ids.context"
    val_a = "./data/squad/val.span"

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")

    # embeddings is a matrix of shape [vocab_size, embedding_size]
    embeddings = np.load(embed_path)['glove'].astype(np.float32)

    val_data = load_and_pad_val_data(val_q, val_c, val_a)

    # vocab is the mapping from word -> token id
    # rev_vocab is the reverse mapping, from id -> word
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # someone posted that the max length of question is 766
    info = (question_path, context_path, answer_path, FLAGS.batch_size,
            FLAGS.max_length, FLAGS.output_size)
    '''
    batch_gen = batch_generator(question_path, context_path, answer_path,
                                FLAGS.batch_size, FLAGS.max_length,
                                FLAGS.output_size)
    i = 0
    while True:
        batch_gen.next()
        i += 1
        logging.info(i)
    '''

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, batch_generator, info, save_train_dir, val_data,
                 rev_vocab)
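# Note: a minimal sketch of initialize_vocab(), matching the comments above:
# rev_vocab is the id -> word list read from the vocab file, and vocab is the
# word -> id dict built from it. Treat this as an assumption about the helper,
# not its exact implementation.
def initialize_vocab(vocab_path):
    if not os.path.exists(vocab_path):
        raise ValueError("Vocabulary file %s not found." % vocab_path)
    with open(vocab_path, "rb") as f:
        rev_vocab = [line.strip() for line in f.readlines()]
    vocab = dict((word, idx) for idx, word in enumerate(rev_vocab))
    return vocab, rev_vocab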
def predict(model_name):
    qa = QASystem(model_name)

    with tf.Session() as sess:
        qa.initialize_model(sess)
        while True:
            question = input("Ask a question: ")
            for answer, confidence, doc in answer_question(qa, sess, question,
                                                           best_n=10):
                print('{:.2f}:\t{} ({})'.format(confidence, answer, doc))
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    #dataset = load_data(FLAGS.data_dir)  # None
    dataset = {}
    num_train = load_data_dq(dataset, 'train', FLAGS.data_dir)
    num_val = load_data_dq(dataset, 'val', FLAGS.data_dir)
    load_data_sa(dataset, 'train', FLAGS.data_dir, num_train)
    load_data_sa(dataset, 'val', FLAGS.data_dir, num_val)
    trim(dataset['train'])
    trim(dataset['val'])

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Session moved upfront to set the ifgpu flag before QASystem
    with tf.Session() as sess:
        local_device_protos = device_lib.list_local_devices()  # 38559755
        for x in local_device_protos:
            if x.device_type == 'GPU':
                FLAGS.ifgpu = True
                break

        qa = QASystem(encoder, decoder, embed_path, rev_vocab)

        if not os.path.exists(FLAGS.log_dir):
            os.makedirs(FLAGS.log_dir)
        file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        #print(vars(FLAGS))
        with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        # tfdbg
        #sess = tfdbg.LocalCLIDebugWrapperSession(sess)
        #sess.add_tensor_filter("has_inf_or_nan", tfdbg.has_inf_or_nan)

        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
        #qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)

        sess.close()  # tfdbg
def main(_):
    dataset = {}
    question_max_len = 40
    context_max_len = 600

    # Preprocess and collect small test data
    test_q_path = pjoin(FLAGS.data_dir, "test.ids.question")
    test_q_data, test_q_seq_len = pad_sentences(test_q_path, question_max_len)
    assert not any(test_q_seq_len > test_q_data.shape[1]), \
        'Some questions have length greater than max question length'

    test_c_path = pjoin(FLAGS.data_dir, "test.ids.context")
    test_c_data, test_c_seq_len = pad_sentences(test_c_path, context_max_len)
    assert not any(test_c_seq_len > test_c_data.shape[1]), \
        'Some contexts have length greater than max context length'

    test_s_path = pjoin(FLAGS.data_dir, "test.span")
    test_s_e_id = get_answer_span(test_s_path, context_max_len)

    dataset['test'] = [
        test_q_data, test_q_seq_len, test_c_data, test_c_seq_len, test_s_e_id
    ]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer,
                  FLAGS.max_gradient_norm)  # try without dropout

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset['test'], save_train_dir, small_data_test=True)
def main(_):
    data_dir = cfg.DATA_DIR
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, FLAGS.embed)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(FLAGS.ckpt)
        initialize_model(sess, qa, load_train_dir)

        print('*********************************************************************')
        print("Welcome! You can use this to explore the behavior of the model.")
        print('*********************************************************************')

        while True:
            print('-------------------')
            print('Input the context: ')
            print('-------------------')
            sentence = raw_input()

            print('-------------------')
            print('Input the question: ')
            print('-------------------')
            query = raw_input()

            raw_context = nltk.word_tokenize(sentence)
            context = sentence_to_token_ids(sentence, vocab,
                                            tokenizer=nltk.word_tokenize)
            question = sentence_to_token_ids(query, vocab,
                                             tokenizer=nltk.word_tokenize)
            context_in = mask_input(context, cfg.context_max_len)
            question_in = mask_input(question, cfg.question_max_len)

            start, end = qa.answer(sess, [context_in], [question_in])
            answer = ' '.join(raw_context[start[0]:end[0] + 1])

            print('==========================================')
            print('ANSWER: {}'.format(answer))
            print('==========================================')
def main(_):
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    logger = logging.getLogger()

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    train_data = zip(*load_data(FLAGS.data_dir, "train"))
    val_data = zip(*load_data(FLAGS.data_dir, "val"))
    dev_data = zip(*load_data(FLAGS.data_dir, "dev"))
    #model_train_data = train_data + val_data + dev_data
    model_train_data = train_data
    model_eval_data = val_data

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Graph().as_default():
        with tf.Session() as sess:
            logger.info("Loading embeddings")
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' +
                                 str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']
            logger.info("Embeddings loaded with shape: %s %s" %
                        (pretrained_embeddings.shape))

            qa = QASystem(FLAGS, pretrained_embeddings,
                          vocab_dim=len(vocab.keys()))
            initialize_model(sess, qa, train_dir)
            qa.train(sess, model_train_data, model_eval_data)
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    # use .readlines() to load file ourselves
    # use python generator
    question_path = pjoin(FLAGS.data_dir, "data_train/train_tokenH.txt")
    paragraph_path = pjoin(FLAGS.data_dir, "data_train/train_tokenP.txt")
    answer_path = pjoin(FLAGS.data_dir, "data_train/train_index.txt")
    val_question_path = pjoin(FLAGS.data_dir, "data_dev/dev_tokenH.txt")
    val_paragraph_path = pjoin(FLAGS.data_dir, "data_dev/dev_tokenP.txt")
    val_answer_path = pjoin(FLAGS.data_dir, "data_dev/dev_index.txt")

    # for testing
    # dataset = [(1,1,1), (1,1,1)]
    dataset = load_dataset(question_path, paragraph_path, answer_path,
                           FLAGS.batch_size)
    val_dataset = load_dataset(val_question_path, val_paragraph_path,
                               val_answer_path, FLAGS.batch_size)
    #generate_histograms(dataset)
    #generate_histograms(val_dataset)

    # loads embedding
    FLAGS.embed_path = FLAGS.embed_path or pjoin("data", "sgns.merge.word.npz")
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.txt")
    vocab, rev_vocab = initialize_vocab(vocab_path)  # one is list and one is dict

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(size=FLAGS.state_size, output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS)

    # log file
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # start training
    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, val_dataset, save_train_dir, rev_vocab)
def run_func():
    config = Config()
    train = squad_dataset(config.question_train, config.context_train,
                          config.answer_train)
    dev = squad_dataset(config.question_dev, config.context_dev,
                        config.answer_dev)
    # print(config.question_train)

    embed_path = config.embed_path
    vocab_path = config.vocab_path
    # print(config.embed_path, config.vocab_path)
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_size)
    decoder = Decoder(config.hidden_size)

    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        qa.initialize_model(sess, config.train_dir)

        # train process
        # qa.train(sess, [train, dev], config.train_dir)
        # em = qa.evaluate_model(sess, dev)

        # run process
        while True:
            question = input('please input question: ')
            if question == 'exit':
                break
            raw_context = input('please input context: ')
            if raw_context == 'exit':
                break

            question = [
                vocab[x] if x in vocab.keys() else 2 for x in question.split()
            ]
            context = [
                vocab[x] if x in vocab.keys() else 2
                for x in raw_context.split()
            ]
            test = [[question], [context], [[1, 2]]]

            a_s, a_e = qa.answer(sess, test)
            if a_e == a_s:
                print("answer: ", raw_context.split()[a_s[0]])
            else:
                print("answer: ",
                      ' '.join(raw_context.split()[a_s[0]:a_e[0] + 1]))
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    context_ids, ctx_mask = pad(context_data, FLAGS.output_size)
    question_ids, q_mask = pad(question_data, FLAGS.question_size)
    for i in range(0, len(context_ids)):
        context_ids[i] = context_ids[i][:FLAGS.output_size]
    for j in range(0, len(question_ids)):
        question_ids[j] = question_ids[j][:FLAGS.question_size]

    context_ids = np.array(context_ids)
    question_ids = np.array(question_ids)
    ctx_mask = np.array(ctx_mask)
    q_mask = np.array(q_mask)
    # Need this because minibatches is expecting it this way
    answer_span = np.array([(0, 0)] * len(context_ids))

    dataset = [
        context_ids, question_ids, answer_span, ctx_mask, q_mask,
        question_uuid_data
    ]

    embeddings = initialize_embeddings(embed_path)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    #encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    #decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(pretrained_embeddings=embeddings, flags=FLAGS)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)

    embeddings = load_glove_embeddings(embed_path)

    raw_embed_path = pjoin(
        "data", "squad", "glove.untrimmed.{}.npz".format(FLAGS.embedding_size))
    raw_glove_data = np.load(raw_embed_path)
    raw_glove = raw_glove_data['glove']
    raw_glove_vocab = raw_glove_data['glove_vocab_dict'][()]

    # expand vocab
    vocab, rev_vocab, embeddings = expand_vocab(dev_dirname, dev_filename,
                                                vocab, embeddings, raw_glove,
                                                raw_glove_vocab)

    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    context_len_data = [len(context.split()) for context in context_data]
    mydata = preprocessing(context_data, question_data, FLAGS.context_maxlen,
                           FLAGS.question_maxlen)
    dataset = (mydata, context_data, context_len_data, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    #encoder = Encoder(vocab_dim=FLAGS.embedding_size, state_size=FLAGS.encoder_state_size)
    #decoder = Decoder(output_size=FLAGS.output_size, hidden_size=FLAGS.decoder_hidden_size, state_size=FLAGS.decoder_state_size)
    qa = QASystem(embeddings, FLAGS)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def main(_):
    #======Fill the model name=============
    train_dir = "train/test"
    #======================================
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    print(vars(FLAGS))

    # ========= Load Dataset =========
    train_data, val_data = load_and_preprocess_data(FLAGS.data_dir,
                                                    FLAGS.max_context_len,
                                                    FLAGS.max_question_len,
                                                    size=FLAGS.train_size)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embedding = tf.constant(load_embeddings(embed_path), dtype=tf.float32)

    encoder = Encoder(FLAGS.state_size, FLAGS.max_context_len,
                      FLAGS.max_question_len, FLAGS.embedding_size,
                      FLAGS.summary_flag, FLAGS.filter_flag)
    decoder = Decoder(FLAGS.state_size, FLAGS.max_context_len,
                      FLAGS.max_question_len, FLAGS.output_size,
                      FLAGS.summary_flag)

    qa = QASystem(encoder, decoder, FLAGS, embedding, rev_vocab)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(train_dir)
        qa = initialize_model(sess, qa, train_dir)
        output_list, output_dict = generate_answers(sess, qa, val_data,
                                                    rev_vocab)
        store_result(output_list, output_dict, train_dir)
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_dataset()

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      embedding_size=FLAGS.embedding_size,
                      output_size=FLAGS.output_size)
    decoder = Decoder(state_size=FLAGS.state_size,
                      output_size=FLAGS.output_size)

    qa_args = {
        "embed_path": embed_path,
        "embedding_size": FLAGS.embedding_size,
        "output_size": FLAGS.output_size,
        "optimizer": FLAGS.optimizer,
        "learning_rate": FLAGS.learning_rate,
        "epochs": FLAGS.epochs,
        "batch_size": FLAGS.batch_size,
        "max_gradient_norm": FLAGS.max_gradient_norm,
        "dropout_keep_prob": 1.0 - FLAGS.dropout,
        "train_dir": FLAGS.train_dir,
        "state_size": FLAGS.state_size
    }
    qa = QASystem(encoder, decoder, **qa_args)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
def evaluate(model_name, n=None):
    data = []
    with open(config.TREC_PATH, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in list(reader)[199:]:
            data.append((row[2].strip(), row[3].strip()))
    if not n:
        n = len(data)

    qa = QASystem(model_name)

    top_count = 0
    top_5_count = 0
    top_10_count = 0

    with tf.Session() as sess:
        qa.initialize_model(sess)
        with open(os.path.join(config.MODELS_DIR, model_name, 'trec.csv'),
                  'w') as f:
            writer = csv.writer(f)
            i = 0
            for question, answer_pattern in tqdm(data[:n]):
                answers = [
                    answer for answer, confidence, doc in answer_question(
                        qa, sess, question, 10)
                ]
                writer.writerow(answers)
                correct = [
                    bool(re.search(answer_pattern, answer))
                    for answer in answers
                ]
                if True in correct[:1]:
                    top_count += 1
                if True in correct[:5]:
                    top_5_count += 1
                if True in correct[:10]:
                    top_10_count += 1
                i += 1
                print('{}: {}, {}, {}'.format(i, float(top_count) / i,
                                              float(top_5_count) / i,
                                              float(top_10_count) / i))

    print('Top match: {}'.format(float(top_count) / n))
    print('Top 5 match: {}'.format(float(top_5_count) / n))
    print('Top 10 match: {}'.format(float(top_10_count) / n))
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = load_data(context_data, question_data, question_uuid_data)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    embed_path = FLAGS.embed_path or "data/squad/glove.trimmed.{}.npz".format(
        FLAGS.embedding_size)
    embeddingz = np.load(embed_path)
    embeddings = embeddingz['glove']
    embeddingz.close()
    assert embeddings.shape[1] == FLAGS.embedding_size

    qa = QASystem(train_dir, embeddings)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
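# Note: a minimal sketch of get_normalized_train_dir(), based on the inline
# symlink logic above: it canonicalizes train_dir through
# /tmp/cs224n-squad-train so checkpoint paths stay valid even if the directory
# moves (e.g. on CodaLab). The exact helper may differ per project.
def get_normalized_train_dir(train_dir):
    global_train_dir = '/tmp/cs224n-squad-train'
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    os.symlink(os.path.abspath(train_dir), global_train_dir)
    return global_train_dir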
def main(_):
    logging.info("Loading training data")
    dataset_train = load_train_data(FLAGS.data_dir, isValidation=False)

    logging.info("Loading validation data")
    dataset_val = load_train_data(FLAGS.data_dir, isValidation=True)

    logging.info("Building Model Graph")
    tf.set_random_seed(42)
    np.random.seed(43)

    select_test(0)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = SimpleLinearDecoder()  #AnswerPointerDecoder()
    qa = QASystem(encoder, decoder, len(dataset_train[0]))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    logging.info(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    cris_flag = os.environ.get('CS224N_CRIS')
    if cris_flag is not None:
        logging.info('hi cris')
        sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1))
    else:
        sess = tf.Session()

    with sess.as_default():
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset_train, dataset_val, save_train_dir)

    sess.close()
def main(args):
    if args:
        restore = args

    embed_path = FLAGS.embed_path or "data/squad/glove.trimmed.{}.npz".format(
        FLAGS.embedding_size)
    embeddingz = np.load(embed_path)
    embeddings = embeddingz['glove']
    embeddingz.close()
    assert embeddings.shape[1] == FLAGS.embedding_size
    vocab_len = embeddings.shape[0]

    train = load_squad(FLAGS.data_dir, "train",
                       max_vocab=vocab_len if FLAGS.check_embeddings else 0,
                       max_samples=FLAGS.max_train_samples)
    val = load_squad(FLAGS.data_dir, "val",
                     max_vocab=vocab_len if FLAGS.check_embeddings else 0,
                     max_samples=FLAGS.max_val_samples)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if FLAGS.verify_only:
        print_samples(train, FLAGS.verify_only, rev_vocab)
        return

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    qa = QASystem(train_dir, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        qa.train(sess, train)
        qa.evaluate_answer(sess, qa.preprocess_sequence_data(val), log=True)
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    FLAGS.embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = {
        "val_context": context_data,
        "val_questions": question_data,
        "val_question_uuids": question_uuid_data
    }

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)
    qa = QASystem(encoder, decoder, FLAGS)

    with tf.Session() as sess:
        #train_dir = get_normalized_train_dir(FLAGS.train_dir)
        train_dir = FLAGS.train_dir
        print("train_dir: ", train_dir)
        initialize_model(sess, qa, train_dir)

        print("Generating Answers")
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        print("Writing to json file")
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    datasetTrain = initialize_datasets(FLAGS.data_dir, 'train')
    datasetVal = initialize_datasets(FLAGS.data_dir, 'val')
    #datasetTrain = datasetTrain[0:100]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, embed_path, rev_vocab, FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, datasetTrain, save_train_dir)
        qa.evaluate_answer(sess, datasetVal, sample=1000, log=True)
def main(_):
    dataset = [
        load_dataset([
            "embedding/train.ids.question", "embedding/train.ids.context",
            "data/train.span"
        ]),
        load_dataset([
            "embedding/val.ids.question", "embedding/val.ids.context",
            "data/val.span"
        ])
    ]

    embed_path = pjoin("embedding",
                       "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    tf.reset_default_graph()
    encoder = Encoder(FLAGS.num_hidden_unit, tf.contrib.rnn.GRUCell)
    decoder = Decoder(FLAGS.num_hidden_unit, tf.contrib.rnn.BasicLSTMCell)
    embedding = np.load(embed_path)["glove"]
    qa = QASystem(encoder, decoder, embedding, FLAGS.keep_prob)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.epochs, FLAGS.batch_size,
                 FLAGS.train_dir)
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = get_dataset(FLAGS.data_dir, FLAGS.max_question_size,
                          FLAGS.max_paragraph_size)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    FLAGS.embed_path = embed_path
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)

    qa = QASystem(encoder, decoder, FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        #load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)  #Change these back for final submission
        load_train_dir = FLAGS.load_train_dir or FLAGS.train_dir
        print("load_train_dir: ", load_train_dir)
        initialize_model(sess, qa, load_train_dir)

        #save_train_dir = get_normalized_train_dir(FLAGS.train_dir)  #Change back for final submission
        save_train_dir = FLAGS.train_dir
        print("save_train_dir: ", save_train_dir)
        qa.train(sess, dataset, save_train_dir, rev_vocab)
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = []
    dataset.append(pjoin(FLAGS.data_dir, "train.ids.question"))
    dataset.append(pjoin(FLAGS.data_dir, "train.ids.context"))
    dataset.append(pjoin(FLAGS.data_dir, "train.span"))
    dataset.append(pjoin(FLAGS.data_dir, "val.ids.question"))
    dataset.append(pjoin(FLAGS.data_dir, "val.ids.context"))
    dataset.append(pjoin(FLAGS.data_dir, "val.span"))

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.paragraph_output_size)

    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
def main(_):
    config = Config()
    # TODO: load the dataset, look at how it is structured, and change model.py accordingly
    dataset = None

    embed_path = config.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(config.embed_size))
    embeddingz = np.load(embed_path)
    embeddings = embeddingz['glove']
    embeddingz.close()

    vocab_len = embeddings.shape[0]

    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir,
                       max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir,
                     max_samples=config.max_val_samples)
    print('train size: ', len(train), ' val size: ', len(val))

    vocab_path = config.vocab_path or pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    # print_samples(train, 1, rev_vocab)  # train is triplets of (context, question, answer)

    encoder = Encoder(state_size=config.hidden_size,
                      embedding_size=config.embed_size)
    decoder = Decoder(state_size=config.hidden_size,
                      embedding_size=config.embed_size)
    qa = QASystem(encoder, decoder)

    with tf.Session() as sess:
        # put "" here if you want to build a new model
        load_train_dir = (config.load_train_dir or config.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = config.train_dir
        ds_train = qa.pad_sequences(train)
        ret_q, ret_p, ret_labels = ds_train
        qa.train(sess, ds_train, save_train_dir)

        ds_val = qa.pad_sequences(val)
        print('train error')
        qa.evaluate_answer(sess, ds_train, log=True)
        print('val error')
        qa.evaluate_answer(sess, ds_val, log=True)
def main(FLAGS):
    print(80 * "=")
    print("INITIALIZING")
    print(80 * "=")

    # Do what you need to load datasets from FLAGS.data_dir
    #parser, embeddings, train_examples, dev_set, test_set = load_and_preprocess_data(debug)
    if not os.path.exists('./data/weights/'):
        os.makedirs('./data/weights/')

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    print("Loading Embedding Matrix")
    embeddings = np.load(embed_path)['glove']

    encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS, embeddings)
    #qa = QASystem(encoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        print("Building Network ... ")
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        print("Load Training Data")
        dataset = initialize_datasets(FLAGS.data_dir, dataset='train',
                                      debugMode=True)
        # encoder.encode_question(
        #     dataset['Questions'], question['Questions_masks'])

        print(80 * "=")
        print("Training")
        print(80 * "=")
        qa.train(sess, dataset, save_train_dir)
        print("Finished Training")

        print("Load Validation Data")
        dataset = initialize_datasets(FLAGS.data_dir, dataset='val',
                                      debugMode=True)

        print(80 * "=")
        print("Evaluation")
        print(80 * "=")
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    train_path = 'data/squad'
    val_path = 'data/squad'
    config = Config(embed_path, train_path, val_path)

    encoder = EncoderCoattention(config)
    decoder = DecoderDynamic(config)
    qa = QASystem(encoder, decoder, config)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, save_train_dir)
def run_func(model_name):
    train = SquadData.load(config.SQUAD_TRAIN_PREFIX, size=config.TRAIN_SIZE)
    dev = SquadData.load(config.SQUAD_DEV_PREFIX, size=config.EVAL_SIZE)

    qa = QASystem(model_name)

    with tf.Session() as sess:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        qa.initialize_model(sess)
        qa.train(sess, [train, dev])
def main(_):
    '''Check Config.py to set up the model paths to be ensembled.'''
    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)

    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)

            init = tf.global_variables_initializer()
            sess.run(init)

            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            ts, te, vs, ve = qa.evaluate_answer(sess, dataset, raw_answers,
                                                rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval)
def main(_):
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    args = parse_arg()
    update_config(args, c_time)
    # pprint.pprint(cfg)
    logging.info(cfg)
    if args.test:
        pdb.set_trace()

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embed_path = pjoin(data_dir,
                       "glove.trimmed." + str(cfg.embed_size) + ".npz")
    # logging.info('embed size: {} for path {}'.format(cfg.embed_size, embed_path))
    # embedding = np.load(embed_path)['glove']

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        if args.test:
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     debug_num=100,
                     rev_vocab=rev_vocab)
        else:
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     rev_vocab=rev_vocab)

        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)