def main(_):
    """Build the QA model from ``FLAGS`` and train it.

    The positional argument is ignored; all configuration is read from the
    module-level ``FLAGS`` object.
    """
    # Load datasets from FLAGS.data_dir.
    train_data, val_data = load_preprocess_data(
        FLAGS.data_dir, FLAGS.max_context_len, FLAGS.max_question_len)

    # Use the default file locations unless a flag overrides them.
    embed_path = FLAGS.embed_path
    if not embed_path:
        glove_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
        embed_path = pjoin("data", "squad", glove_name)
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")

    vocab, rev_vocab = initialize_vocab(vocab_path)
    embedding_matrix = load_embeddings(embed_path)
    embeddings = tf.constant(embedding_matrix, tf.float32)

    encoder = Encoder(FLAGS.state_size, FLAGS.summary_flag,
                      FLAGS.max_context_len, FLAGS.max_question_len)
    decoder = Decoder(FLAGS.state_size, FLAGS.summary_flag)
    qa = QASystem(encoder, decoder, FLAGS, embeddings, rev_vocab)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        # The same flag supplies both the load and the save directory here.
        load_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, save_train_dir)
def main(_):
    """Train the QA model, then evaluate it on a sample of validation data.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    """
    # Load datasets from FLAGS.data_dir.
    train_set = initialize_datasets(FLAGS.data_dir, 'train')
    val_set = initialize_datasets(FLAGS.data_dir, 'val')
    # For quick debugging, slice the training set here (e.g. train_set[0:100]).

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, embed_path, rev_vocab, FLAGS)

    # Attach a file handler so log output is also written to log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        # Prefer an explicit load directory; fall back to the train dir.
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_set, save_train_dir)

        qa.evaluate_answer(sess, val_set, sample=1000, log=True)
def main(args):
    """Load embeddings and SQuAD splits, then train and evaluate the model.

    Args:
        args: Positional arguments forwarded by the launcher. They are not
            used by this function.

    When ``FLAGS.verify_only`` is set, sample training examples are printed
    and the function returns without building or training a model.
    """
    embed_path = FLAGS.embed_path or "data/squad/glove.trimmed.{}.npz".format(
        FLAGS.embedding_size)
    # The context manager closes the archive even if the key lookup fails.
    with np.load(embed_path) as embeddingz:
        embeddings = embeddingz['glove']
    assert embeddings.shape[1] == FLAGS.embedding_size

    vocab_len = embeddings.shape[0]
    # A max_vocab of 0 is passed when embedding checking is turned off.
    max_vocab = vocab_len if FLAGS.check_embeddings else 0
    train = load_squad(FLAGS.data_dir, "train",
                       max_vocab=max_vocab,
                       max_samples=FLAGS.max_train_samples)
    val = load_squad(FLAGS.data_dir, "val",
                     max_vocab=max_vocab,
                     max_samples=FLAGS.max_val_samples)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if FLAGS.verify_only:
        print_samples(train, FLAGS.verify_only, rev_vocab)
        return

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize
    # the file paths saved in the checkpoint. This allows the model to be
    # reloaded even if the location of the checkpoint files has moved,
    # allowing usage with CodaLab. This must be done on both train.py and
    # qa_answer.py in order to work.
    #
    # lexists() rather than exists(): exists() follows the link and reports
    # False for a dangling symlink (target moved or deleted), which would
    # leave the stale link in place and make os.symlink() fail with EEXIST.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    qa = QASystem(train_dir, embeddings)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        qa.train(sess, train)
        qa.evaluate_answer(sess, qa.preprocess_sequence_data(val), log=True)
def main(_):
    """Train the QA model from batches produced by ``batch_generator``.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    Training and validation files are read from fixed paths under
    ``./data/squad``.
    """
    # Training files are consumed lazily by the batch generator; loading
    # everything at once was considered, batch-by-batch may be better.
    train_question_file = "./data/squad/train.ids.question"
    train_context_file = "./data/squad/train.ids.context"
    train_span_file = "./data/squad/train.span"
    val_q = "./data/squad/val.ids.question"
    val_c = "./data/squad/val.ids.context"
    val_a = "./data/squad/val.span"

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")

    # embeddings is a matrix of shape [vocab_size, embedding_size]
    glove = np.load(embed_path)['glove']
    embeddings = glove.astype(np.float32)

    val_data = load_and_pad_val_data(val_q, val_c, val_a)

    # vocab is the mapping from word -> token id
    # rev_vocab is the reverse mapping, from id -> word
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # someone posted that the max length of question is 766
    info = (train_question_file, train_context_file, train_span_file,
            FLAGS.batch_size, FLAGS.max_length, FLAGS.output_size)

    # Debug snippet for exercising the generator on its own:
    #   batch_gen = batch_generator(question_path, context_path, answer_path,
    #                               FLAGS.batch_size, FLAGS.max_length,
    #                               FLAGS.output_size)
    #   i = 0
    #   while True:
    #       batch_gen.next()
    #       i += 1
    #       logging.info(i)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        # The generator function itself is passed, together with its arguments.
        qa.train(sess, batch_generator, info, save_train_dir, val_data,
                 rev_vocab)
def main(FLAGS):
    """Initialise the model, train it, then evaluate on validation data.

    Args:
        FLAGS: Flag container supplying paths and hyper-parameters. Note that
            this parameter shadows any module-level ``FLAGS``.
    """
    banner = 80 * "="

    print(banner)
    print("INITIALIZING")
    print(banner)

    # Make sure the weights directory exists before anything is saved.
    weights_dir = './data/weights/'
    if not os.path.exists(weights_dir):
        os.makedirs(weights_dir)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    print("Loading Embedding Matrix")
    embeddings = np.load(embed_path)['glove']

    encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        print("Building Network ... ")
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        print("Load Training Data")
        train_set = initialize_datasets(FLAGS.data_dir, dataset='train',
                                        debugMode=True)

        print(banner)
        print("Training")
        print(banner)
        qa.train(sess, train_set, save_train_dir)
        print("Finished Training")

        print("Load Validation Data")
        val_set = initialize_datasets(FLAGS.data_dir, dataset='val',
                                      debugMode=True)

        print(banner)
        print("Evaluation")
        print(banner)
        qa.evaluate_answer(sess, val_set, vocab, FLAGS.evaluate, log=True)
def run_func(model_name):
    """Load the train/dev splits and train a ``QASystem`` named *model_name*.

    Args:
        model_name: Passed straight to the ``QASystem`` constructor.
    """
    train_split = SquadData.load(config.SQUAD_TRAIN_PREFIX,
                                 size=config.TRAIN_SIZE)
    dev_split = SquadData.load(config.SQUAD_DEV_PREFIX,
                               size=config.EVAL_SIZE)

    qa = QASystem(model_name)

    with tf.Session() as sess:
        # Load a pretrained model if one exists, otherwise create a new one.
        qa.initialize_model(sess)
        splits = [train_split, dev_split]
        qa.train(sess, splits)
def main(_):
    """Load the datasets, detect GPU availability and train the QA model.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    ``FLAGS.ifgpu`` is set to True when a local GPU device is found, before
    ``QASystem`` is constructed.
    """
    # Load datasets from FLAGS.data_dir.
    dataset = {}
    num_train = load_data_dq(dataset, 'train', FLAGS.data_dir)
    num_val = load_data_dq(dataset, 'val', FLAGS.data_dir)
    load_data_sa(dataset, 'train', FLAGS.data_dir, num_train)
    load_data_sa(dataset, 'val', FLAGS.data_dir, num_val)
    trim(dataset['train'])
    trim(dataset['val'])

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Session moved upfront to set the ifgpu flag before QASystem.
    # The session must stay open until training finishes: leaving a
    # `with tf.Session()` block closes it, and a closed session cannot run
    # ops. Everything that needs `sess` therefore lives inside this block,
    # which also closes the session if an error occurs.
    with tf.Session() as sess:
        # (original author's reference note: 38559755)
        for device in device_lib.list_local_devices():
            if device.device_type == 'GPU':
                FLAGS.ifgpu = True
                break

        qa = QASystem(encoder, decoder, embed_path, rev_vocab)

        # Mirror log records into <log_dir>/log.txt.
        if not os.path.exists(FLAGS.log_dir):
            os.makedirs(FLAGS.log_dir)
        file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # Record the flag values used for this run.
        with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        # To debug with tfdbg, wrap the session here:
        #   sess = tfdbg.LocalCLIDebugWrapperSession(sess)
        #   sess.add_tensor_filter("has_inf_or_nan", tfdbg.has_inf_or_nan)

        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
        # qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def _read_id_rows(path):
    """Return the file at *path* as a list of int lists, one list per line."""
    with open(path) as f:
        # Real lists (not `map` objects) so rows can be indexed, measured and
        # iterated more than once on Python 3 as well as Python 2.
        return [[int(tok) for tok in line.split()]
                for line in f.read().splitlines()]


def main(_):
    """Load the id-encoded train/val splits, then train and evaluate.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    ``dataset`` maps ``'train'`` and ``'val'`` to a
    ``(contexts, questions, spans)`` tuple of integer rows read from
    ``<data_dir>/<split>.ids.context``, ``.ids.question`` and ``.span``.
    """
    # TODO
    # Load datasets from FLAGS.data_dir.
    dataset = dict()
    for dataset_type in ['train', 'val']:
        dataset[dataset_type] = tuple(
            _read_id_rows(os.path.join(
                FLAGS.data_dir, "%s.%s" % (dataset_type, suffix)))
            for suffix in ("ids.context", "ids.question", "span"))

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # `config` is not defined in this function; it is expected to be
    # available from the enclosing module.
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      config=config)
    decoder = Decoder(output_size=FLAGS.output_size, config=config)
    qa = QASystem(encoder, decoder, config=config)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
def main(_):
    """Run training on the small ``test`` split.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    Questions and contexts are padded to fixed maximum lengths (40 and 600)
    and training is started with ``small_data_test=True``.
    """
    dataset = {}
    question_max_len = 40
    context_max_len = 600

    # Preprocess and collect small test data
    question_file = pjoin(FLAGS.data_dir, "test.ids.question")
    question_ids, question_lens = pad_sentences(question_file,
                                                question_max_len)
    assert not any(
        question_lens > question_ids.shape[1]
    ), 'Some questions have length greater than max question length'

    context_file = pjoin(FLAGS.data_dir, "test.ids.context")
    context_ids, context_lens = pad_sentences(context_file, context_max_len)
    assert not any(
        context_lens > context_ids.shape[1]
    ), 'Some contexts have length greater than max context length'

    span_file = pjoin(FLAGS.data_dir, "test.span")
    answer_spans = get_answer_span(span_file, context_max_len)

    dataset['test'] = [
        question_ids, question_lens, context_ids, context_lens, answer_spans
    ]

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    # try without dropout
    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer, FLAGS.max_gradient_norm)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset['test'], save_train_dir, small_data_test=True)
def main(_):
    """Set up logging, load the data splits and embeddings, and train.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    The model trains on the ``train`` split and is evaluated on ``val``.
    """
    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    logger = logging.getLogger()

    # Load datasets from FLAGS.data_dir.
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # list(...) so each split is a reusable sequence: on Python 3 a bare
    # zip() is a one-shot iterator that is empty after the first pass.
    train_data = list(zip(*load_data(FLAGS.data_dir, "train")))
    val_data = list(zip(*load_data(FLAGS.data_dir, "val")))
    # NOTE(review): dev_data is loaded but not used below; it is kept so the
    # dev files are still read as before. Confirm whether it can be dropped.
    dev_data = list(zip(*load_data(FLAGS.data_dir, "dev")))

    # model_train_data = train_data + val_data + dev_data
    model_train_data = train_data
    model_eval_data = val_data

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize
    # the file paths saved in the checkpoint. This allows the model to be
    # reloaded even if the location of the checkpoint files has moved,
    # allowing usage with CodaLab. This must be done on both train.py and
    # qa_answer.py in order to work.
    #
    # lexists() rather than exists(): exists() follows the link and reports
    # False for a dangling symlink (target moved or deleted), which would
    # leave the stale link in place and make os.symlink() fail with EEXIST.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    # Record the flag values used for this run.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Graph().as_default():
        with tf.Session() as sess:
            logger.info("Loading embeddings")
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' +
                                 str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']
            # The format string expects exactly two shape entries.
            logger.info("Embeddings loaded with shape: %s %s" %
                        (pretrained_embeddings.shape))

            qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab))

            initialize_model(sess, qa, train_dir)
            qa.train(sess, model_train_data, model_eval_data)
def main(_):
    """Load the tokenised train/dev files and train the QA model.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    ``FLAGS.embed_path`` is overwritten with a default when it is unset.
    """
    # Input files live under FLAGS.data_dir, split into train and dev folders.
    data_root = FLAGS.data_dir
    question_path = pjoin(data_root, "data_train/train_tokenH.txt")
    paragraph_path = pjoin(data_root, "data_train/train_tokenP.txt")
    answer_path = pjoin(data_root, "data_train/train_index.txt")
    val_question_path = pjoin(data_root, "data_dev/dev_tokenH.txt")
    val_paragraph_path = pjoin(data_root, "data_dev/dev_tokenP.txt")
    val_answer_path = pjoin(data_root, "data_dev/dev_index.txt")

    # for testing
    # dataset = [(1,1,1), (1,1,1)]
    dataset = load_dataset(question_path, paragraph_path, answer_path,
                           FLAGS.batch_size)
    val_dataset = load_dataset(val_question_path, val_paragraph_path,
                               val_answer_path, FLAGS.batch_size)
    # generate_histograms(dataset)
    # generate_histograms(val_dataset)

    # Embedding path: keep an explicit flag value, otherwise use the default.
    if not FLAGS.embed_path:
        FLAGS.embed_path = pjoin("data", "sgns.merge.word.npz")
    else:
        FLAGS.embed_path = FLAGS.embed_path

    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.txt")
    # one is list and one is dict
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(size=FLAGS.state_size, output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS)

    # log file
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    # start training
    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, val_dataset, save_train_dir, rev_vocab)
def main(_):
    """Train the QA model, or only validate it when run with ``val_only``.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    Validation-only mode is selected when the first command-line argument is
    the literal string ``val_only``.
    """
    test_device_placement()

    # change this FLAGS value to an empty string to train on the full data set.
    prepend = FLAGS.sample_data_prepend

    # Some logic that lets us only run validation. It should be cleaner.
    val_only = len(sys.argv) > 1 and sys.argv[1] == 'val_only'

    print('Reading data')
    print('==================')
    val_set = fetch_data_set(prepend, 'val')
    if not val_only:
        tr_set = fetch_data_set(prepend, 'train')
        tr_set_size = tr_set[0].shape[0]
    else:
        # No training data is read in validation-only mode.
        tr_set_size = 0
    print('Finished reading data')
    print('==================')

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")

    # vocab is map from words to indices, rev_vocab is our list of words in
    # reverse frequency order
    vocab, rev_vocab = initialize_vocab(vocab_path)
    idx_word = data_utils.invert_map(vocab)
    # Only the inverted map is needed from here on.
    del vocab
    del rev_vocab

    qa = QASystem(FLAGS, embed_path, idx_word, False, tr_set_size, False)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        if not val_only:
            qa.train(sess, tr_set, val_set, FLAGS.train_dir)
        else:
            print('Running validation only')
            qa.validate(sess, val_set, 'validation')
def main(_):
    """Train and evaluate the QA model using file paths as the dataset.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    The dataset handed to the model is a
    ``(context_path, question_path, span_path)`` tuple of training files.
    """
    # TODO maybe pass as loaded dataset abstraction instead of file_paths?
    default_hparams = create_hparams(FLAGS)

    dataset = tuple(
        FLAGS.data_dir + suffix
        for suffix in ('/train.ids.context', '/train.ids.question',
                       '/train.span'))

    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    default_hparams.add_hparam('vocab_size', len(vocab))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)
    qa = QASystem(encoder, decoder, default_hparams)

    # Setup embeddings
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    glove = np.load(embed_path)['glove']
    np_embeddings = np.float32(glove)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    # Soft placement on, device-placement logging off, GPU memory on demand.
    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir, np_embeddings)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Train on train+val data and evaluate on a sample of validation data.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    Model weights are saved under a timestamped ``results/`` directory.
    """
    # Load datasets from FLAGS.data_dir.
    train_examples = initialize_datasets(FLAGS.data_dir, 'train.',
                                         debugMode=False)
    val_examples = initialize_datasets(FLAGS.data_dir, 'val.',
                                       debugMode=False)
    # NOTE(review): the validation examples are appended to the training set,
    # so the evaluation at the end uses data that was passed to training.
    # Confirm this is intended.
    train_examples.extend(val_examples)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # This is taking a long time, so time the model construction.
    started = datetime.now()
    qa = QASystem(encoder, decoder, embed_path, FLAGS, rev_vocab)
    print('Time to setup the model: ', datetime.now() - started)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        # Save into a fresh timestamped directory instead of FLAGS.train_dir.
        results_path = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
        save_train_dir = results_path + "model.weights/"
        if not os.path.exists(save_train_dir):
            os.makedirs(save_train_dir)
        qa.train(sess, train_examples, save_train_dir)

        qa.evaluate_answer(sess, val_examples, rev_vocab, sample=1000,
                           log=True)
def main(_):
    """Build the QA system from explicit keyword settings and train it.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    """
    # Load datasets from FLAGS.data_dir.
    dataset = load_dataset()

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      embedding_size=FLAGS.embedding_size,
                      output_size=FLAGS.output_size)
    decoder = Decoder(state_size=FLAGS.state_size,
                      output_size=FLAGS.output_size)

    # Model settings are passed as keywords; the keep probability is the
    # complement of the dropout flag.
    qa = QASystem(
        encoder,
        decoder,
        embed_path=embed_path,
        embedding_size=FLAGS.embedding_size,
        output_size=FLAGS.output_size,
        optimizer=FLAGS.optimizer,
        learning_rate=FLAGS.learning_rate,
        epochs=FLAGS.epochs,
        batch_size=FLAGS.batch_size,
        max_gradient_norm=FLAGS.max_gradient_norm,
        dropout_keep_prob=1.0 - FLAGS.dropout,
        train_dir=FLAGS.train_dir,
        state_size=FLAGS.state_size,
    )

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
def main(_):
    """Read the id-encoded training files, then train and evaluate.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    The training files are read from the fixed ``data/squad`` directory.
    """

    def parse_rows(handle):
        # One array row per line; every whitespace-separated token is an int.
        return np.array([[int(tok) for tok in row.split()]
                         for row in handle.readlines()])

    squad_dir = ("data", "squad")
    with open(pjoin(*(squad_dir + ("train.ids.context",))),
              encoding='utf-8') as fcontext:
        with open(pjoin(*(squad_dir + ("train.ids.question",))),
                  encoding='utf-8') as fquestion:
            with open(pjoin(*(squad_dir + ("train.span",))),
                      encoding='utf-8') as fspan:
                context = parse_rows(fcontext)
                question = parse_rows(fquestion)
                ans = parse_rows(fspan)

    # Column 0 of the span file is the answer start, column 1 the answer end.
    dataset = {
        'context': context,
        'question': question,
        'answer_span_start': ans[:, 0],
        'answer_span_end': ans[:, 1]
    }

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embeddings = load_embeddings(embed_path)
    qa = QASystem(embeddings, FLAGS)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)
    # Dumping the flags to flags.json is disabled in this variant:
    # print(vars(FLAGS))
    # with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w',
    #           encoding='utf-8') as fout:
    #     json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        # The train dir is used as-is (get_normalized_train_dir is bypassed).
        load_train_dir = FLAGS.train_dir
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = FLAGS.train_dir
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
def main(_):
    """Load train/validation data, build the model graph and train it.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    When the ``CS224N_CRIS`` environment variable is set, the session is
    created with ``intra_op_parallelism_threads=1``.
    """
    logging.info("Loading training data")
    dataset_train = load_train_data(FLAGS.data_dir, isValidation=False)
    logging.info("Loading validation data")
    dataset_val = load_train_data(FLAGS.data_dir, isValidation=True)

    logging.info("Building Model Graph")
    # Fixed seeds for the TensorFlow and NumPy random generators.
    tf.set_random_seed(42)
    np.random.seed(43)
    select_test(0)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = SimpleLinearDecoder()  # AnswerPointerDecoder()
    qa = QASystem(encoder, decoder, len(dataset_train[0]))

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    logging.info(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    cris_flag = os.environ.get('CS224N_CRIS')
    if cris_flag is not None:
        logging.info('hi cris')
        sess = tf.Session(
            config=tf.ConfigProto(intra_op_parallelism_threads=1))
    else:
        sess = tf.Session()

    # try/finally so the session is released even when initialisation or
    # training raises; previously close() was skipped on any error.
    try:
        with sess.as_default():
            load_train_dir = get_normalized_train_dir(
                FLAGS.load_train_dir or FLAGS.train_dir)
            initialize_model(sess, qa, load_train_dir)

            save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
            qa.train(sess, dataset_train, dataset_val, save_train_dir)
    finally:
        sess.close()
def main(_):
    """Build the encoder/matcher/decoder QA system, train and evaluate it.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    The session graph is also written to the ``qa-graph`` summary directory.
    """
    # Load datasets from FLAGS.data_dir.
    dataset = {"train": load_data(FLAGS.data_dir, mode="train"),
               "val": load_data(FLAGS.data_dir, mode="val")}

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    # TODO: add a flag for perspective_dim.
    matcher = Matcher(perspective_dim=25, input_size=FLAGS.state_size)
    # TODO: add a flag for n_perspective_dim.
    decoder = Decoder(output_size=FLAGS.output_size,
                      state_size=FLAGS.state_size,
                      n_perspective_dim=50 * 2)

    qa = QASystem(encoder, matcher, decoder,
                  vocab=vocab, vocab_dim=FLAGS.embedding_size,
                  rev_vocab=rev_vocab, embed_path=embed_path)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        # NOTE(review): this re-runs the global initializer after
        # initialize_model(); if initialize_model() restores a checkpoint,
        # the restored values would be overwritten here. Confirm.
        tf.global_variables_initializer().run()

        # Write the graph once, then close the writer so the event file is
        # flushed to disk and its handle released before training starts.
        graph_writer = tf.summary.FileWriter("qa-graph")
        graph_writer.add_graph(sess.graph)
        graph_writer.close()

        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, 500, log=True)
def main(_):
    """Train and evaluate a QA system whose encoder owns the embeddings.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    """
    # Load datasets from FLAGS.data_dir.
    # This path is computed but not referenced again in this function.
    training_question_data_path = pjoin(FLAGS.data_dir, 'train.question')
    dataset = load_dataset(FLAGS.data_dir)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    pretrained_embeddings = np.load(embed_path)['glove']

    encoder = Encoder(
        size=FLAGS.state_size,
        vocab_dim=FLAGS.embedding_size,
        pretrained_embeddings=pretrained_embeddings,
        max_question_length=FLAGS.max_question_length,
        max_context_length=FLAGS.max_context_length,
    )
    decoder = Decoder(
        output_size=FLAGS.output_size,
        size=FLAGS.state_size,
        max_context_length=FLAGS.max_context_length,
    )
    qa = QASystem(encoder, decoder)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Load the train/val splits from fixed paths and train the QA model.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    ``dataset`` is a two-element list: the training split, then validation.
    """
    train_split = load_dataset([
        "embedding/train.ids.question", "embedding/train.ids.context",
        "data/train.span"
    ])
    val_split = load_dataset([
        "embedding/val.ids.question", "embedding/val.ids.context",
        "data/val.span"
    ])
    dataset = [train_split, val_split]

    glove_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
    embed_path = pjoin("embedding", glove_name)
    vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Start from a clean default graph before building the model.
    tf.reset_default_graph()

    encoder = Encoder(FLAGS.num_hidden_unit, tf.contrib.rnn.GRUCell)
    decoder = Decoder(FLAGS.num_hidden_unit, tf.contrib.rnn.BasicLSTMCell)

    embedding = np.load(embed_path)["glove"]
    qa = QASystem(encoder, decoder, embedding, FLAGS.keep_prob)

    # Create the output directories on first use.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)

    log_file = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(log_file)

    print(vars(FLAGS))
    flags_dump = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_dump, 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.epochs, FLAGS.batch_size,
                 FLAGS.train_dir)
def main(_):
    """Build the QA system, then train and evaluate it.

    The positional argument is ignored; configuration comes from ``FLAGS``.
    ``dataset`` is still a ``None`` placeholder here: dataset loading from
    ``FLAGS.data_dir`` has not been implemented in this function.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize
    # the file paths saved in the checkpoint. This allows the model to be
    # reloaded even if the location of the checkpoint files has moved,
    # allowing usage with CodaLab. This must be done on both train.py and
    # qa_answer.py in order to work.
    #
    # lexists() rather than exists(): exists() follows the link and reports
    # False for a dangling symlink (target moved or deleted), which would
    # leave the stale link in place and make os.symlink() fail with EEXIST.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        qa.train(sess, dataset)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    """Load embeddings and data, build the QA system, then train and evaluate.

    Args:
        _: Unused.
    """
    # Fall back to the default locations when the corresponding flag is unset.
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Read the GloVe matrix as float32; the archive is closed on leaving the block.
    with np.load(embed_path) as npz:
        glove_embeddings = np.asfarray(npz["glove"], dtype=np.float32)

    dataset = load_and_preprocess_data()

    encoder = Encoder(
        size=FLAGS.state_size,
        vocab_dim=FLAGS.embedding_size,
        config=FLAGS,
    )
    decoder = Decoder(output_size=FLAGS.output_size, config=FLAGS)
    qa = QASystem(
        encoder,
        decoder,
        embeddings=glove_embeddings,
        config=FLAGS,
        vocab=(vocab, rev_vocab),
    )

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    # Record the flags of this run next to the log file.
    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        # Checkpoints may be restored from a different directory than the
        # one new checkpoints are written to.
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, checkpoint_dir)

        qa.evaluate_answer(sess, FLAGS.evaluate, log=True)
def run_func():
    """Build the QA system from a ``Config`` object and train it.

    Takes no arguments; every path and hyper-parameter is read from a
    freshly constructed ``Config``.
    """
    cfg = Config()

    train_set = squad_dataset(cfg.question_train, cfg.context_train, cfg.answer_train)
    dev_set = squad_dataset(cfg.question_dev, cfg.context_dev, cfg.answer_dev)

    embed_path = cfg.embed_path
    vocab, rev_vocab = initialize_vocab(cfg.vocab_path)
    glove = get_trimmed_glove_vectors(embed_path)

    hidden = cfg.hidden_state_size
    qa = QASystem(Encoder(hidden), Decoder(hidden), glove, cfg)

    with tf.Session() as sess:
        # Load a pretrained model if it exists, or create a new one if no
        # pretrained model is available.
        qa.initialize_model(sess, cfg.train_dir)
        qa.train(sess, [train_set, dev_set], cfg.train_dir)
def main(_):
    """Load the dataset, build the QA system and train it.

    Args:
        _: Unused.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = get_dataset(
        FLAGS.data_dir, FLAGS.max_question_size, FLAGS.max_paragraph_size)

    # The resolved embedding path is written back onto FLAGS; QASystem only
    # receives FLAGS below (presumably it reads the path from there — confirm).
    FLAGS.embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    vocab_file = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    encoder = Encoder(
        size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size, FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)
    qa = QASystem(encoder, decoder, FLAGS)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    # Record the flags of this run next to the log file.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        # NOTE: the directories are intentionally used as given rather than
        # passed through get_normalized_train_dir(); the original author
        # marked this to be changed back for the final submission.
        load_train_dir = FLAGS.load_train_dir or FLAGS.train_dir
        print("load_train_dir: ", load_train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = FLAGS.train_dir
        print("save_train_dir: ", save_train_dir)
        qa.train(sess, dataset, save_train_dir, rev_vocab)
def main(_):
    """Build the co-attention QA system from a ``Config`` and train it.

    Args:
        _: Unused.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    # Training and validation data are read from the same directory.
    train_path = 'data/squad'
    val_path = 'data/squad'
    config = Config(embed_path, train_path, val_path)

    qa = QASystem(EncoderCoattention(config), DecoderDynamic(config), config)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    # Record the flags of this run next to the log file.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    # The initializer op is created after the model has been constructed and
    # is run before initialize_model gets the session.
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)

        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, checkpoint_dir)
def main(_):
    """Collect the dataset file paths, build the QA system and train it.

    Args:
        _: Unused.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    # The dataset handed to qa.train is a flat list of six file paths:
    # question ids, context ids and spans for train, then the same for val.
    file_names = (
        "train.ids.question",
        "train.ids.context",
        "train.span",
        "val.ids.question",
        "val.ids.context",
        "val.span",
    )
    dataset = [pjoin(FLAGS.data_dir, file_name) for file_name in file_names]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    qa = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.paragraph_output_size),
    )

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    # Record the flags of this run next to the log file.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, checkpoint_dir)
def main(_):
    """Load SQuAD data, train the QA system, then report train and val scores.

    Args:
        _: Unused.
    """
    config = Config()

    embed_path = config.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(config.embed_size))
    # The context manager closes the npz archive even if the 'glove' lookup
    # raises, which the previous manual close() did not guarantee.
    with np.load(embed_path) as npz:
        embeddings = npz['glove']
    vocab_len = embeddings.shape[0]

    # TODO: inspect how the loaded dataset looks and change model.py accordingly.
    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir,
                       max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir,
                     max_samples=config.max_val_samples)
    print('train size: ', len(train), ' val size: ', len(val))

    vocab_path = config.vocab_path or pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Per the original author's note, train is triplets of
    # (context, question, answer).
    encoder = Encoder(state_size=config.hidden_size, embedding_size=config.embed_size)
    decoder = Decoder(state_size=config.hidden_size, embedding_size=config.embed_size)

    qa = QASystem(encoder, decoder)

    with tf.Session() as sess:
        # Put "" here if you want to build a new model.
        load_train_dir = (config.load_train_dir or config.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = config.train_dir

        ds_train = qa.pad_sequences(train)
        # The names are unused; the unpacking raises if pad_sequences did not
        # return exactly three parts.
        ret_q, ret_p, ret_labels = ds_train

        qa.train(sess, ds_train, save_train_dir)

        ds_val = qa.pad_sequences(val)

        print('train error')
        qa.evaluate_answer(sess, ds_train, log=True)

        print('val error')
        qa.evaluate_answer(sess, ds_val, log=True)
def main(_):
    """Train and evaluate the QA system for one numbered configuration.

    The configuration index is taken from the first command-line argument
    and is embedded in the names of the log and flags files.

    Args:
        _: Unused.
    """
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    # Do what you need to load datasets from FLAGS.data_dir
    # Original author's note on the sample layout: ((question, context), answer)
    raw_splits = load_data(FLAGS.data_dir)
    train_data = preprocess_dataset(
        raw_splits['train'], FLAGS.output_size, FLAGS.question_size)
    val_data = preprocess_dataset(
        raw_splits['val'], FLAGS.output_size, FLAGS.question_size)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    qa = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.output_size),
    )

    # Mirror log records into a per-configuration log file.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_name = "log" + '_config_' + str(FLAGS.config) + ".txt"
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, log_name)))

    # Record the flags of this run in a per-configuration JSON file.
    print(vars(FLAGS))
    flags_name = "flags" + '_config_' + str(FLAGS.config) + ".json"
    with open(os.path.join(FLAGS.log_dir, flags_name), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, checkpoint_dir)

        qa.evaluate_answer(sess, train_data, val_data, FLAGS.evaluate, log=True)
def main(_):
    """Configure the run from command-line arguments, then train and evaluate.

    When ``args.test`` is set, execution first drops into ``pdb`` and the
    training call additionally receives ``debug_num=100``.

    Args:
        _: Unused.
    """
    # The timestamp is passed to update_config and names the log file.
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    args = parse_arg()
    update_config(args, c_time)
    logging.info(cfg)
    if args.test:
        pdb.set_trace()

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab, rev_vocab = initialize_vocab(pjoin(data_dir, cfg.vocab_file))

    # Only the path is built here; it is handed to QASystem below.
    embed_path = pjoin(data_dir, "glove.trimmed." + str(cfg.embed_size) + ".npz")

    # Create the output directories that do not exist yet.
    for out_dir in (cfg.log_dir, cfg.cache_dir, cfg.fig_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    log_file = pjoin(cfg.log_dir, 'log' + c_time + '.txt')
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print_parameters()

    # gpu setting
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())

        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable varaibles ============')
        for variable in tf.trainable_variables():
            logging.info(variable.name)

        logging.info('=========== regularized varaibles ============')
        for reg_term in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(reg_term.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        # Both modes share the same call; test mode only adds debug_num.
        train_kwargs = {'raw_answers': raw_answers, 'rev_vocab': rev_vocab}
        if args.test:
            train_kwargs['debug_num'] = 100
        qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                 **train_kwargs)

        # NOTE(review): evaluation is run in both modes here; the flattened
        # original does not show whether it was nested under the non-test
        # branch — confirm.
        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True, training=True, sample=4000)
def main(_):
    """Pad the train/val data, build the QA system, then train epoch by epoch.

    Args:
        _: Unused.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    question_max_len = 40
    context_max_len = 600

    def load_split(question_file, context_file, span_file, check_lengths):
        """Return [q_data, q_seq_len, c_data, c_seq_len, spans] for one split."""
        q_data, q_seq_len = pad_sentences(
            pjoin(FLAGS.data_dir, question_file), question_max_len)
        if check_lengths:
            assert not any(
                q_seq_len > q_data.shape[1]
            ), 'Some questions have length greater than max question length'

        c_data, c_seq_len = pad_sentences(
            pjoin(FLAGS.data_dir, context_file), context_max_len)
        if check_lengths:
            assert not any(
                c_seq_len > c_data.shape[1]
            ), 'Some contexts have length greater than max context length'

        spans = get_answer_span(pjoin(FLAGS.data_dir, span_file), context_max_len)
        return [q_data, q_seq_len, c_data, c_seq_len, spans]

    # Only the training split is length-checked.
    dataset = {
        'train': load_split("train.ids.question", "train.ids.context",
                            "train.span", check_lengths=True),
        'val': load_split("val.ids.question", "val.ids.context",
                          "val.span", check_lengths=False),
    }

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    encoder = Encoder(state_size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    # Original author's note on this call: try without dropout.
    qa = QASystem(
        encoder,
        decoder,
        question_max_len,
        context_max_len,
        embed_path,
        FLAGS.learning_rate,
        FLAGS.batch_size,
        FLAGS.dropout,
        FLAGS.optimizer,
        FLAGS.max_gradient_norm,
    )

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    # Record the flags of this run next to the log file.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)

        for _epoch in range(FLAGS.epochs):
            qa.train(sess, dataset['train'], checkpoint_dir)
            # NOTE(review): evaluation is placed inside the epoch loop; the
            # flattened original does not show its indentation — confirm.
            qa.evaluate_answer(sess, dataset['val'])
def main(_):
    """Load, clip and pad the train/val data, then build and train the QA system.

    Args:
        _: Unused.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    def in_data_dir(file_name):
        """Return the path of *file_name* inside FLAGS.data_dir."""
        return pjoin(FLAGS.data_dir, file_name)

    context_ids = initialize_data(in_data_dir("train.ids.context"))
    question_ids = initialize_data(in_data_dir("train.ids.question"))
    answer_spans = initialize_data(in_data_dir("train.span"))
    context = initialize_data(in_data_dir("train.context"), keep_as_string=True)

    val_context_ids = initialize_data(in_data_dir("val.ids.context"))
    val_question_ids = initialize_data(in_data_dir("val.ids.question"))
    val_answer_spans = initialize_data(in_data_dir("val.span"))
    val_context = initialize_data(in_data_dir("val.context"), keep_as_string=True)

    # TODO: check this clipping, especially the answer
    # Reducing context length to the specified max in FLAGS.output_size
    paragraph_lengths = []

    def clip_split(ctx_ids, ctx_text, spans, q_ids):
        """Truncate one split in place and record its original context lengths."""
        for idx in range(len(ctx_ids)):
            paragraph_lengths.append(len(ctx_ids[idx]))
            ctx_ids[idx] = ctx_ids[idx][:FLAGS.output_size]
            ctx_text[idx] = ctx_text[idx][:FLAGS.output_size]
            # Span indices are clamped into the clipped context range.
            spans[idx] = np.clip(spans[idx], 0, FLAGS.output_size - 1)
            q_ids[idx] = q_ids[idx][:FLAGS.question_size]

    clip_split(context_ids, context, answer_spans, question_ids)
    clip_split(val_context_ids, val_context, val_answer_spans, val_question_ids)

    embeddings = initialize_embeddings(embed_path)

    longest_train_ctx = max(map(len, context_ids))
    longest_val_ctx = max(map(len, val_context_ids))
    max_ctx_len = max(longest_train_ctx, longest_val_ctx)
    longest_train_q = max(map(len, question_ids))
    longest_val_q = max(map(len, val_question_ids))
    max_q_len = max(longest_train_q, longest_val_q)

    assert max_ctx_len == FLAGS.output_size, "MISMATCH BETWEEN MAX_CTX_LEN AND FLAGS.OUTPUT_SIZE: " + str(
        max_ctx_len) + ", " + str(FLAGS.output_size)

    context_ids, ctx_mask = pad(context_ids, FLAGS.output_size)
    question_ids, q_mask = pad(question_ids, FLAGS.question_size)
    val_context_ids, val_ctx_mask = pad(val_context_ids, FLAGS.output_size)
    val_question_ids, val_q_mask = pad(val_question_ids, FLAGS.question_size)

    # Convert everything that is checked or handed to the model into arrays.
    context_ids, question_ids, answer_spans = (
        np.array(context_ids), np.array(question_ids), np.array(answer_spans))
    ctx_mask, q_mask = np.array(ctx_mask), np.array(q_mask)
    val_context_ids, val_question_ids, val_answer_spans = (
        np.array(val_context_ids), np.array(val_question_ids),
        np.array(val_answer_spans))
    val_ctx_mask, val_q_mask = np.array(val_ctx_mask), np.array(val_q_mask)

    # Run check_pad on every padded array/mask pair and report each one.
    padding_checks = (
        (context_ids, ctx_mask, "CONTEXT IDS PADDED AND CHECKED"),
        (question_ids, q_mask, "QUESTION IDS PADDED AND CHECKED"),
        (val_context_ids, val_ctx_mask, "VAL CONTEXT IDS PADDED AND CHECKED"),
        (val_question_ids, val_q_mask, "VAL QUESTION IDS PADDED AND CHECKED"),
    )
    for padded, mask, message in padding_checks:
        check_pad(padded, mask)
        print(message)

    dataset = [
        context_ids, question_ids, answer_spans, ctx_mask, q_mask, context
    ]
    val_dataset = [
        val_context_ids, val_question_ids, val_answer_spans,
        val_ctx_mask, val_q_mask, val_context
    ]

    # Sanity checks tying vocabulary, embeddings and data sizes together.
    assert len(vocab) == embeddings.shape[
        0], "Mismatch between embedding shape and vocab length"
    assert embeddings.shape[
        1] == FLAGS.embedding_size, "Mismatch between embedding shape and FLAGS"
    assert len(context_ids) == len(question_ids) == len(
        answer_spans
    ), "Mismatch between context, questions, and answer lengths"

    print("Using model type : {}".format(FLAGS.model_type))
    qa = QASystem(pretrained_embeddings=embeddings, flags=FLAGS)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    # Record the flags of this run next to the log file.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flags_out:
        json.dump(FLAGS.__flags, flags_out)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)

        # The saver object is not used again in this function.
        saver = tf.train.Saver()

        qa.train(session=sess,
                 dataset=dataset,
                 val_dataset=val_dataset,
                 train_dir=checkpoint_dir)