Example #1
def main(_):

    # Do what you need to load datasets from FLAGS.data_dir
    datasetTrain = initialize_datasets(FLAGS.data_dir, 'train')
    datasetVal = initialize_datasets(FLAGS.data_dir, 'val')
    #datasetTrain = datasetTrain[0:100]


    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, embed_path, rev_vocab, FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, datasetTrain, save_train_dir)

        qa.evaluate_answer(sess, datasetVal, sample=1000, log=True)
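These examples all call an initialize_vocab helper from the CS224n SQuAD starter code that is not reproduced in this listing. A minimal sketch of what it presumably does, assuming vocab.dat stores one token per line, ordered by id:

import os

def initialize_vocab(vocab_path):
    # Hypothetical reconstruction: build the token -> id mapping (vocab) and the
    # reverse id -> token list (rev_vocab) from a one-token-per-line file.
    if not os.path.exists(vocab_path):
        raise ValueError("Vocabulary file %s not found." % vocab_path)
    with open(vocab_path) as f:
        rev_vocab = [line.strip() for line in f]
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}
    return vocab, rev_vocab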
Example #2
def main(args):
    if args:
        restore = args

    embed_path = FLAGS.embed_path or "data/squad/glove.trimmed.{}.npz".format(
        FLAGS.embedding_size)
    embeddingz = np.load(embed_path)
    embeddings = embeddingz['glove']
    embeddingz.close()
    assert embeddings.shape[1] == FLAGS.embedding_size

    vocab_len = embeddings.shape[0]

    train = load_squad(FLAGS.data_dir,
                       "train",
                       max_vocab=vocab_len if FLAGS.check_embeddings else 0,
                       max_samples=FLAGS.max_train_samples)
    val = load_squad(FLAGS.data_dir,
                     "val",
                     max_vocab=vocab_len if FLAGS.check_embeddings else 0,
                     max_samples=FLAGS.max_val_samples)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if FLAGS.verify_only:
        print_samples(train, FLAGS.verify_only, rev_vocab)

        return

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    qa = QASystem(train_dir, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)

        qa.train(sess, train)

        qa.evaluate_answer(sess, qa.preprocess_sequence_data(val), log=True)
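This example (and Example #11 below) inlines the /tmp/cs224n-squad-train symlink trick explained in the comment above; most of the other examples instead call get_normalized_train_dir, which presumably wraps the same logic. A minimal sketch under that assumption:

import os

def get_normalized_train_dir(train_dir):
    # Assumed behavior: point a fixed path at train_dir so checkpoint paths stay
    # valid even if the real directory moves (e.g. when running on CodaLab).
    global_train_dir = '/tmp/cs224n-squad-train'
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    os.symlink(os.path.abspath(train_dir), global_train_dir)
    return global_train_dir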
Example #3
def main(FLAGS):
    print(80 * "=")
    print("INITIALIZING")
    print(80 * "=")
    # Do what you need to load datasets from FLAGS.data_dir

    #parser, embeddings, train_examples, dev_set, test_set = load_and_preprocess_data(debug)
    if not os.path.exists('./data/weights/'):
        os.makedirs('./data/weights/')
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    print("Loading Embedding Matrix")
    embeddings = np.load(embed_path)['glove']

    encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size)

    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS, embeddings)
    #qa = QASystem(encoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        print("Building Network ... ")
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        print("Load Training Data")
        dataset = initialize_datasets(FLAGS.data_dir,
                                      dataset='train',
                                      debugMode=True)
        # encoder.encode_question(
        #     dataset['Questions'], question['Questions_masks'])
        print(80 * "=")
        print("Training")
        print(80 * "=")
        qa.train(sess, dataset, save_train_dir)
        print("Finished Training")
        print("Load Validation Data")
        dataset = initialize_datasets(FLAGS.data_dir,
                                      dataset='val',
                                      debugMode=True)
        print(80 * "=")
        print("Evaluation")
        print(80 * "=")
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Example #4
def main(_):

    # TODO
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = dict()
    for dataset_type in ['train', 'val']:
        with open(os.path.join(FLAGS.data_dir,
                               "%s.ids.context" % dataset_type)) as f:
            data_context = [
                map(int, line.split()) for line in f.read().splitlines()
            ]
        with open(
                os.path.join(FLAGS.data_dir,
                             "%s.ids.question" % dataset_type)) as f:
            data_question = [
                map(int, line.split()) for line in f.read().splitlines()
            ]
        with open(os.path.join(FLAGS.data_dir, "%s.span" % dataset_type)) as f:
            data_span = [
                map(int, line.split()) for line in f.read().splitlines()
            ]
        dataset[dataset_type] = (data_context, data_question, data_span)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      config=config)
    decoder = Decoder(output_size=FLAGS.output_size, config=config)

    qa = QASystem(encoder, decoder, config=config)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
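initialize_model is another shared starter-code helper that is not shown here. A hedged sketch of its likely behavior, assuming the QASystem exposes a saver attribute: restore the newest checkpoint from train_dir if one exists, otherwise initialize fresh variables.

import logging
import tensorflow as tf

def initialize_model(session, model, train_dir):
    # Assumed behavior: restore the latest checkpoint in train_dir, or fall
    # back to freshly initialized variables when no checkpoint is found.
    ckpt = tf.train.get_checkpoint_state(train_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        logging.info("Reading model parameters from %s", ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        logging.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model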
Example #5
File: train.py Project: truncs/QA
def main(_):

    # TODO: maybe pass a loaded dataset abstraction instead of
    # file paths?

    default_hparams = create_hparams(FLAGS)

    context_file_path = FLAGS.data_dir + '/train.ids.context'
    question_file_path = FLAGS.data_dir + '/train.ids.question'
    span_file_path = FLAGS.data_dir + '/train.span'
    dataset = (context_file_path, question_file_path, span_file_path)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    default_hparams.add_hparam('vocab_size', len(vocab))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, default_hparams)

    # Setup embeddings
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    np_embeddings = np.float32(np.load(embed_path)['glove'])

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir, np_embeddings)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Example #6
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    datasetTrain = initialize_datasets(FLAGS.data_dir,
                                       'train.',
                                       debugMode=False)
    datasetVal = initialize_datasets(FLAGS.data_dir, 'val.', debugMode=False)
    datasetTrain.extend(datasetVal)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    #This is taking a long time
    tic = datetime.now()
    qa = QASystem(encoder, decoder, embed_path, FLAGS, rev_vocab)
    print('Time to setup the model: ', datetime.now() - tic)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    #saver = tf.train.Saver()

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        # Get directory to save model
        #save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        results_path = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
        save_train_dir = results_path + "model.weights/"
        if not os.path.exists(save_train_dir):
            os.makedirs(save_train_dir)

        qa.train(sess, datasetTrain, save_train_dir)  #, saver)

        qa.evaluate_answer(sess, datasetVal, rev_vocab, sample=1000, log=True)
Example #7
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    with open(pjoin("data", "squad", "train.ids.context"), encoding='utf-8') as fcontext, \
            open(pjoin("data", "squad", "train.ids.question"), encoding='utf-8') as fquestion, \
            open(pjoin("data", "squad", "train.span"), encoding='utf-8') as fspan:
        context = np.array([[int(idx) for idx in l.split()]
                            for l in fcontext.readlines()])
        question = np.array([[int(idx) for idx in l.split()]
                             for l in fquestion.readlines()])
        ans = np.array([[int(idx) for idx in l.split()]
                        for l in fspan.readlines()])
        dataset = {
            'context': context,
            'question': question,
            'answer_span_start': ans[:, 0],
            'answer_span_end': ans[:, 1]
        }

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embeddings = load_embeddings(embed_path)
    qa = QASystem(embeddings, FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # print(vars(FLAGS))
    # with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w', encoding='utf-8') as fout:
    #     json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        # load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        load_train_dir = FLAGS.train_dir
        initialize_model(sess, qa, load_train_dir)

        # save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        save_train_dir = FLAGS.train_dir
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
Example #8
def main(_):

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {"train": load_data(FLAGS.data_dir, mode="train"), \
               "val": load_data(FLAGS.data_dir, mode="val")}

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    matcher = Matcher(perspective_dim=25,
                      input_size=FLAGS.state_size)  # add flag
    decoder = Decoder(output_size=FLAGS.output_size,
                      state_size=FLAGS.state_size,
                      n_perspective_dim=50 * 2)  # add flag

    qa = QASystem(encoder, matcher, decoder, \
                  vocab=vocab, vocab_dim=FLAGS.embedding_size, rev_vocab=rev_vocab, embed_path=embed_path)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        tf.global_variables_initializer().run()
        graph_writer = tf.summary.FileWriter("qa-graph")
        graph_writer.add_graph(sess.graph)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, 500, log=True)
Example #9
def main(_):

    # Do what you need to load datasets from FLAGS.data_dir
    training_question_data_path = pjoin(FLAGS.data_dir, 'train.question')
    dataset = load_dataset(FLAGS.data_dir)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    pretrained_embeddings = np.load(embed_path)['glove']
    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      pretrained_embeddings=pretrained_embeddings,
                      max_question_length=FLAGS.max_question_length,
                      max_context_length=FLAGS.max_context_length)
    decoder = Decoder(output_size=FLAGS.output_size,
                      size=FLAGS.state_size,
                      max_context_length=FLAGS.max_context_length)

    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Example #10
def main(_):

    # Do what you need to load datasets from FLAGS.data_dir
    

    dataset = None

    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    with np.load(embed_path) as data:
        glove_embeddings = np.asfarray(data["glove"], dtype=np.float32)
        
        dataset = load_and_preprocess_data()

        # print(train_data)

        encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size, config=FLAGS)
        decoder = Decoder(output_size=FLAGS.output_size, config=FLAGS)

        qa = QASystem(encoder, decoder, embeddings=glove_embeddings, config=FLAGS, vocab=(vocab, rev_vocab))

        if not os.path.exists(FLAGS.log_dir):
            os.makedirs(FLAGS.log_dir)
        file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        print(vars(FLAGS))
        with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        with tf.Session() as sess:
            load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
            initialize_model(sess, qa, load_train_dir)

            save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
            qa.train(sess, dataset, save_train_dir)

            qa.evaluate_answer(sess, FLAGS.evaluate, log=True)
Example #11
def main(_):

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)

        qa.train(sess, dataset)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
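This example leaves the dataset placeholder as None. One way to fill it in, mirroring the id-file format that Examples #4 and #7 read (train.ids.context, train.ids.question, train.span), is sketched below; load_ids_dataset is a hypothetical helper, not part of the original code.

import os

def load_ids_dataset(data_dir, split):
    # Each line holds whitespace-separated integer ids for one example.
    def read_ids(path):
        with open(path) as f:
            return [[int(tok) for tok in line.split()] for line in f]

    contexts = read_ids(os.path.join(data_dir, "%s.ids.context" % split))
    questions = read_ids(os.path.join(data_dir, "%s.ids.question" % split))
    spans = read_ids(os.path.join(data_dir, "%s.span" % split))
    return contexts, questions, spans

# e.g. dataset = {split: load_ids_dataset(FLAGS.data_dir, split) for split in ("train", "val")}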
Example #12
def main(_):
    config = Config()
    dataset = None  # TODO: load dataset - look at the dataset, see how it looks, and change model.py accordingly

    embed_path = config.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(config.embed_size))
    embeddingz = np.load(embed_path)
    embeddings = embeddingz['glove']
    embeddingz.close()

    vocab_len = embeddings.shape[0]

    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir, max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir, max_samples=config.max_val_samples)

    print('train size: ', len(train), ' val size: ', len(val))

    vocab_path = config.vocab_path or pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    # print_samples(train,1, rev_vocab) #train is triplets of (context,question,answer)
    encoder = Encoder(state_size=config.hidden_size, embedding_size=config.embed_size)
    decoder = Decoder(state_size=config.hidden_size, embedding_size=config.embed_size)

    qa = QASystem(encoder, decoder)
    with tf.Session() as sess:
        load_train_dir = (config.load_train_dir or config.train_dir)  # put "" here if you want to build a new model
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = config.train_dir
        ds_train = qa.pad_sequences(train)
        ret_q, ret_p, ret_labels = ds_train
        qa.train(sess, ds_train, save_train_dir)
        ds_val = qa.pad_sequences(val)

        print('train error')
        qa.evaluate_answer(sess, ds_train, log=True)

        print('val error')
        qa.evaluate_answer(sess, ds_val, log=True)
Example #13
def main(_):
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_data(FLAGS.data_dir) # ((question, context), answer)
    train_data = preprocess_dataset(dataset['train'],
        FLAGS.output_size, FLAGS.question_size)
    val_data = preprocess_dataset(dataset['val'],
        FLAGS.output_size, FLAGS.question_size)

    # print(dataset)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, 
        "log" + '_config_' + str(FLAGS.config) + ".txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags" + '_config_' +\
        str(FLAGS.config) + ".json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, save_train_dir)
        qa.evaluate_answer(sess, train_data, val_data, FLAGS.evaluate, log=True)
Example #14
def main(_):
		# Do what you need to load datasets from FLAGS.data_dir
		dataset = load_dataset() 

		embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
		vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
		vocab, rev_vocab = initialize_vocab(vocab_path)

		encoder = Encoder(state_size=FLAGS.state_size, embedding_size=FLAGS.embedding_size, output_size=FLAGS.output_size)
		decoder = Decoder(state_size=FLAGS.state_size, output_size=FLAGS.output_size)

		qa_args = {"embed_path": embed_path, "embedding_size": FLAGS.embedding_size,
							 "output_size": FLAGS.output_size, "optimizer": FLAGS.optimizer,
							 "learning_rate": FLAGS.learning_rate, "epochs": FLAGS.epochs,
							 "batch_size": FLAGS.batch_size, "max_gradient_norm": FLAGS.max_gradient_norm,
							 "dropout_keep_prob": 1.0 - FLAGS.dropout,
							 "train_dir": FLAGS.train_dir, "state_size": FLAGS.state_size}
		qa = QASystem(encoder, decoder, **qa_args)

		if not os.path.exists(FLAGS.log_dir):
				os.makedirs(FLAGS.log_dir)
		file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
		logging.getLogger().addHandler(file_handler)

		print(vars(FLAGS))
		with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
				json.dump(FLAGS.__flags, fout)

		with tf.Session() as sess:
				load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
				initialize_model(sess, qa, load_train_dir)

				save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
				qa.train(sess, dataset, save_train_dir)

				qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Example #15
def main(_):
    '''Check Config.py to set up the model paths to be ensembled.'''

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)
    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)
            init = tf.global_variables_initializer()
            sess.run(init)
            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            ts, te, vs, ve = qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval
                                   )
Example #16
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {}
    question_max_len = 40
    context_max_len = 600
    # Preprocess and collect train data
    train_q_path = pjoin(FLAGS.data_dir, "train.ids.question")
    train_q_data, train_q_seq_len = pad_sentences(train_q_path,
                                                  question_max_len)
    assert not any(
        train_q_seq_len > train_q_data.shape[1]
    ), 'Some questions have length greater than max question length'
    train_c_path = pjoin(FLAGS.data_dir, "train.ids.context")
    train_c_data, train_c_seq_len = pad_sentences(train_c_path,
                                                  context_max_len)
    assert not any(
        train_c_seq_len > train_c_data.shape[1]
    ), 'Some contexts have length greater than max context length'
    train_s_path = pjoin(FLAGS.data_dir, "train.span")
    train_s_e_id = get_answer_span(train_s_path, context_max_len)
    dataset['train'] = [
        train_q_data, train_q_seq_len, train_c_data, train_c_seq_len,
        train_s_e_id
    ]
    # Preprocess and collect validation data
    val_q_path = pjoin(FLAGS.data_dir, "val.ids.question")
    val_q_data, val_q_seq_len = pad_sentences(val_q_path, question_max_len)
    val_c_path = pjoin(FLAGS.data_dir, "val.ids.context")
    val_c_data, val_c_seq_len = pad_sentences(val_c_path, context_max_len)
    val_s_path = pjoin(FLAGS.data_dir, "val.span")
    val_s_e_id = get_answer_span(val_s_path, context_max_len)
    dataset['val'] = [
        val_q_data, val_q_seq_len, val_c_data, val_c_seq_len, val_s_e_id
    ]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer,
                  FLAGS.max_gradient_norm)  # try without dropout

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        for i in range(FLAGS.epochs):
            qa.train(sess, dataset['train'], save_train_dir)  #
            print('Finish training epoch {}'.format(i))
            qa.evaluate_answer(sess, dataset['val'])  # vocab, FLAGS.evaluate
Example #17
def main(_):
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    args = parse_arg()
    update_config(args, c_time)
    # pprint.pprint(cfg)
    logging.info(cfg)
    if args.test:
        pdb.set_trace()

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embed_path = pjoin(data_dir, "glove.trimmed." + str(cfg.embed_size) + ".npz")
    # logging.info('embed size: {} for path {}'.format(cfg.embed_size, embed_path))
    # embedding = np.load(embed_path)['glove']

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)
    file_handler = logging.FileHandler(pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(cfg.train_dir)
        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)
        if args.test:
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     debug_num=100,
                     rev_vocab=rev_vocab)
        else:
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     rev_vocab=rev_vocab)
        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)
Example #18
def main(_):

    config_fname = FLAGS.config_path
    assert os.path.exists(config_fname), "config file does not exist"
    logging.info("Loaded configs from: " + config_fname)
    with open(config_fname, "rb") as fp:
        json_flag = json.load(fp)
    # print(json_flag)
    print(vars(FLAGS))
    for key, value in json_flag.iteritems():
        if key == "eval_on_train":
            continue
        if key == "dev_path":
            continue
        if key == "train_dir":
            continue
        FLAGS.__setattr__(key, value)

    print(vars(FLAGS))
    assert os.path.exists(FLAGS.train_dir), "train dir does not exist"
    # assert False

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = load_glove_embeddings(embed_path)
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)

    # mixer = Mixer()
    # decoder = Decoder(FLAGS)
    if FLAGS.model == 'baseline':
        qa = QASystem(encoder, FLAGS, embeddings, 1)
    elif FLAGS.model == 'matchLSTM':
        qa = QASystemMatchLSTM(FLAGS, embeddings, 1)

    print('\n\nrand_unknown is set to be ' + str(FLAGS.rand_unknown))

    if FLAGS.load_from_json:
        dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
        dev_filename = os.path.basename(FLAGS.dev_path)
        dataset = prepare_dev(dev_dirname, dev_filename, vocab)
        # remove answer
        #        context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data, s_labels, e_labels, true_answers = dataset
        context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data = dataset

        for i in range(1):
            logging.debug('context')
            logging.debug(' '.join(context_tokens_data[i]))
            logging.debug('context_data')
            logging.debug(context_data[i])
            logging.debug('question')
            logging.debug(' '.join(question_tokens_data[i]))
            logging.debug('question_data')
            logging.debug(question_data[i])
            logging.debug('uuid_data')
            logging.debug(question_uuid_data[i])

        with tf.Session() as sess:
            # train_dir = get_normalized_train_dir(FLAGS.train_dir)
            train_dir = FLAGS.train_dir
            initialize_model(sess, qa, train_dir)
            print('About to start generate_answers')
            print(FLAGS.eval_on_train)

            answers = generate_answers(sess, qa, dataset)

            # write to json file to root dir
            with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers, ensure_ascii=False)))

    else:
        # load from files converted from json
        FLAGS.data_dir = os.path.join("data", "squad", "qa_answer")
        if (not os.path.isdir("/home/el")) or not os.listdir(FLAGS.data_dir):
            process_dev_json_to_files()
        else:
            print('Data directory %s is not empty: %s' %
                  (FLAGS.data_dir, str(os.listdir(FLAGS.data_dir))))
        dataset, max_q_len, max_c_len = load_dataset(FLAGS.data_dir,
                                                     FLAGS.data_size,
                                                     FLAGS.max_question_length,
                                                     FLAGS.max_context_length,
                                                     ['dev'])

        dev_set = dataset['dev']
        print('Start running evaluate_answer on %d of data' % len(dev_set))
        dev_examples = qa.preprocess_question_answer(dev_set)
        dev_raw = dataset['dev_raw']
        dev_uuid = dataset['dev_uuid']
        dev_dataset = [dev_examples, dev_raw, dev_uuid]

        with tf.Session() as sess:
            train_dir = FLAGS.train_dir
            initialize_model(sess, qa, train_dir)
            answers_model = qa.evaluate_answer(session=sess,
                                               dataset=dev_dataset,
                                               sample=len(dev_set),
                                               return_answer_dict=True)
            # write to json file to root dir
            with io.open('dev-prediction-model.json', 'w',
                         encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers_model, ensure_ascii=False)))
Example #19
def main(_):

    # Reads the paragraph file, parses each paragraph into a list of word ids, and stores each
    # paragraph's length in paraLens. The next blocks of code do the same for the questions and read
    # the answer span indices within the paragraph, then repeat all of this for the validation set.
    paraFileName = pjoin(FLAGS.data_dir, "train") + ".ids.context"
    paraFile = open(paraFileName, "r")
    paras = [[int(a) for a in b.strip().split(" ") if a != '']
             for b in paraFile.read().strip().split("\n")]
    paras, paraLens = zip(*[padList(para, FLAGS.paraLen) for para in paras])

    questionFileName = pjoin(FLAGS.data_dir, "train") + ".ids.question"
    questionFile = open(questionFileName, "r")
    questions = [[int(a) for a in b.strip().split(" ") if a != '']
                 for b in questionFile.read().strip().split("\n")]
    questions, questionLens = zip(
        *[padList(question, FLAGS.qLen) for question in questions])

    ansFileName = pjoin(FLAGS.data_dir, "train") + ".span"
    ansFile = open(ansFileName, "r")
    ans = [[int(a) for a in b.strip().split(" ") if a != '']
           for b in ansFile.read().strip().split("\n")]

    paraFileNameVal = pjoin(FLAGS.data_dir, "val") + ".ids.context"
    paraFileVal = open(paraFileNameVal, "r")
    parasVal = [[int(a) for a in b.strip().split(" ") if a != '']
                for b in paraFileVal.read().strip().split("\n")]
    parasVal, paraLensVal = zip(
        *[padList(para, FLAGS.paraLen) for para in parasVal])

    questionFileNameVal = pjoin(FLAGS.data_dir, "val") + ".ids.question"
    questionFileVal = open(questionFileNameVal, "r")
    questionsVal = [[int(a) for a in b.strip().split(" ") if a != '']
                    for b in questionFileVal.read().strip().split("\n")]
    questionsVal, questionLensVal = zip(
        *[padList(question, FLAGS.qLen) for question in questionsVal])

    ansFileNameVal = pjoin(FLAGS.data_dir, "val") + ".span"
    ansFileVal = open(ansFileNameVal, "r")
    ansVal = [[int(a) for a in b.strip().split(" ") if a != '']
              for b in ansFileVal.read().strip().split("\n")]

    train_remove = []
    val_remove = []
    for i in range(len(ans)):
        if ans[i][0] >= FLAGS.paraLen or ans[i][1] >= FLAGS.paraLen:
            train_remove.append(i)
    for i in range(len(ansVal)):
        if ansVal[i][0] >= FLAGS.paraLen or ansVal[i][1] >= FLAGS.paraLen:
            val_remove.append(i)
    train_remove = list(reversed(train_remove))
    val_remove = list(reversed(val_remove))
    paras = list(paras)
    questions = list(questions)
    paraLens = list(paraLens)
    questionLens = list(questionLens)
    parasVal = list(parasVal)
    questionsVal = list(questionsVal)
    paraLensVal = list(paraLensVal)
    questionLensVal = list(questionLensVal)
    for i in range(len(train_remove)):
        ind = train_remove[i]
        paras.pop(ind)
        questions.pop(ind)
        paraLens.pop(ind)
        questionLens.pop(ind)
        ans.pop(ind)
    for i in range(len(val_remove)):
        ind = val_remove[i]
        parasVal.pop(ind)
        questionsVal.pop(ind)
        paraLensVal.pop(ind)
        questionLensVal.pop(ind)
        ansVal.pop(ind)
    paras = tuple(paras)
    questions = tuple(questions)
    paraLens = tuple(paraLens)
    questionLens = tuple(questionLens)
    parasVal = tuple(parasVal)
    questionsVal = tuple(questionsVal)
    paraLensVal = tuple(paraLensVal)
    questionLensVal = tuple(questionLensVal)
    # valDataset holds the validation paragraphs and questions as word ids, the answers as start
    # and end indices into the paragraph, and the length of each paragraph and question.
    valDataset = (parasVal, questionsVal, ansVal, paraLensVal, questionLensVal)
    # dataset has the same format as valDataset, but for the training data; valDataset is appended
    # as its last element.
    dataset = (paras, questions, ans, paraLens, questionLens, valDataset)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    encoder = CoattentionEncoder(size=FLAGS.state_size,
                                 vocab_dim=FLAGS.embedding_size)
    #encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    #decoder = ChunkDecoder(output_size=FLAGS.output_size)
    embeddings = np.load(embed_path)['glove']
    #print(embeddings.keys())

    qa = QASystem(encoder, decoder, embeddings, vocab, rev_vocab)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir, saver)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        sample_rates = [1, 2, 5, 10]
        for rate in sample_rates:
            print("rate: ", rate)
            f1_l = []
            em_l = []
            for i in range(10):
                f1, em = qa.evaluate_answer(sess,
                                            valDataset,
                                            sample=len(valDataset[0]),
                                            sample_rate=rate)
                f1_l.append(f1)
                em_l.append(em)
        #print("f1: ", f1, "; em: ", em)
            print(rate)
            print(f1_l)
            print(em_l)
Example #20
def main(_):
    '''Check Config.py to set up the model paths to be ensembled.'''

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)
    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)
            init = tf.global_variables_initializer()
            sess.run(init)
            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            ts, te, vs, ve = qa.evaluate_answer(sess,
                                                dataset,
                                                raw_answers,
                                                rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess,
                                   dataset,
                                   raw_answers,
                                   rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval)
Example #21
def main(_):
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    FLAGS.sessname = "{:%Y%m%d_%H%M%S}".format(datetime.now())
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir,
                        "log{}.txt".format(FLAGS.sessname)))
    logging.getLogger().addHandler(file_handler)

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None
    train_p, raw_train_p, train_q, train_ans = \
            load_dataset("train", FLAGS.data_dir)
    val_p, raw_val_p, val_q, val_ans = \
            load_dataset("val", FLAGS.data_dir)

    max_len_p = max(max(map(len, train_p)), max(map(len, val_p)))
    max_len_p = FLAGS.output_size # truncate
    max_len_q = max(max(map(len, train_q)), max(map(len, val_q)))
    max_len_q = 60 # truncate in case things go awry...
    max_len_ans = max(map(len, train_ans)) # 2

    train_padded_p, train_mask_p, train_padded_q, train_mask_q, train_ans = \
            preprocess_data((train_p, train_q, train_ans), "train",
                max_len_p, max_len_q)
    val_padded_p, val_mask_p, val_padded_q, val_mask_q, val_ans = \
            preprocess_data((val_p, val_q, val_ans), "val",
                max_len_p, max_len_q)

    t_len = FLAGS.train_set_size
    if t_len != -1: # minibatch to check overfitting
        train_dataset = zip(train_padded_p[:t_len], train_mask_p[:t_len],
    	                train_padded_q[:t_len], train_mask_q[:t_len], train_ans[:t_len])
    else: # regular version
        train_dataset = zip(train_padded_p, train_mask_p,
    	                train_padded_q, train_mask_q, train_ans)
    FLAGS.num_iters = len(train_dataset)
    val_dataset = zip(val_padded_p, val_mask_p,
                    val_padded_q, val_mask_q, val_ans)
    raw_dataset = (raw_train_p, raw_val_p)
    dataset = (train_dataset, val_dataset, raw_dataset)
    logger.info("Sanity check on lengths: min %s, max %s" % \
            (lambda x: (min(x), max(x)))(map(len, train_padded_p)))

    logger.info("Loading glove embeddings...")
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embeddings = np.load(embed_path)#, glove=glove)
    glove = embeddings['glove'] # np array
    logger.info("glove dims {}".format(glove.shape))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
            flags=FLAGS,
            max_len_p=max_len_p, max_len_q=max_len_q)
    decoder = Decoder(output_size=FLAGS.output_size, flags=FLAGS)

    qa = QASystem(encoder, decoder, glove, max_len_p, max_len_q,
            FLAGS)
    # create saver
    qa.saver = tf.train.Saver()


    logger.info("{}".format(vars(FLAGS)))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        #qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
        f1, em = qa.evaluate_answer(sess, train_dataset, log=True)
        logger.info("final evaluation: F1: {}, EM: {}".format(f1, em))
Example #22
def main(_):
    global FLAGS
    print("FLAGS:", vars(FLAGS))
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)  #, glove=glove)
    glove = embeddings['glove']  # np array

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # print(vars(FLAGS))
    # with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
    #     json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, raw_context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    # preprocess data by truncating
    p = map(lambda line: map(int, (line.strip()).split(' ')), context_data)
    q = map(lambda line: map(int, (line.strip()).split(' ')), question_data)
    raw_context_data = map(lambda line: (line.strip()).split(' '),
                           raw_context_data)
    max_len_p = min(max(map(len, p)), FLAGS.output_size)
    max_len_q = max(map(len, q))

    dataset = (p, raw_context_data, q, question_uuid_data)
    #dataset = (context_data, raw_context_data, question_data, question_uuid_data)
    train_p, raw_train_p, train_q, train_ans = \
            load_dataset("train", FLAGS.data_dir)
    train_padded_p, train_mask_p, train_padded_q, train_mask_q, train_ans = \
            preprocess_data((train_p, train_q, train_ans), "train",
                max_len_p, max_len_q)
    train_dataset = zip(train_padded_p, train_mask_p, train_padded_q,
                        train_mask_q, train_ans)

    # Reload flags
    print("loaded flags", vars(FLAGS))

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      flags=FLAGS,
                      max_len_p=max_len_p,
                      max_len_q=max_len_q)
    decoder = Decoder(output_size=FLAGS.output_size, flags=FLAGS)

    qa = QASystem(encoder, decoder, glove, max_len_p, max_len_q, FLAGS)
    # create saver
    qa.saver = tf.train.Saver()

    # train dir
    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        qa.raw_train = raw_train_p
        f1, em = qa.evaluate_answer(sess, train_dataset)
        logging.info("train total f1 {}, em {}".format(f1, em))

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
Example #23
def main(_):

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embed_path = pjoin(data_dir,
                       "glove.trimmed." + str(cfg.embed_size) + ".npz")
    # logging.info('embed size: {} for path {}'.format(cfg.embed_size, embed_path))
    # embedding = np.load(embed_path)['glove']

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(cfg.train_dir)
        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)
        qa.train(
            cfg.start_lr,
            sess,
            dataset,
            answers,
            save_train_dir,
            raw_answers=raw_answers,
            # debug_num=1000,
            rev_vocab=rev_vocab)
        qa.evaluate_answer(sess,
                           dataset,
                           raw_answers,
                           rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)
Example #24
def main(_):
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    num_hidden = cfg.lstm_num_hidden
    data_dir = cfg.DATA_DIR
    embed_path = pjoin(data_dir,
                       "glove.trimmed." + str(cfg.embed_size) + ".npz")
    vocab_path = pjoin(data_dir, cfg.vocab_file)

    dataset = mask_dataset(data_dir, set_names=set_names, suffixes=suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab, rev_vocab = initialize_vocab(vocab_path)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    encoder = Encoder(size=2 * num_hidden)
    decoder = Decoder(output_size=2 * num_hidden)

    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:

        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        tic = time.time()
        qa.train(
            sess,
            dataset,
            answers,
            save_train_dir,
            raw_answers,
            rev_vocab,
            # debug_num=1000
        )
        #
        qa.evaluate_answer(sess,
                           dataset,
                           raw_answers,
                           rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)
        toc = time.time()
        logging.info("Total training process took {} hours".format(
            (toc - tic) / 3600.))