Example #1
def main(_):
    global FLAGS
    print("FLAGS:", vars(FLAGS))
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)
    glove = embeddings['glove']  # trimmed GloVe embedding matrix as a numpy array

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # print(vars(FLAGS))
    # with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
    #     json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, raw_context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    # Parse the whitespace-separated token-id strings (Python 2: map() returns
    # lists here); the context length is capped at FLAGS.output_size below.
    p = map(lambda line: map(int, (line.strip()).split(' ')), context_data)
    q = map(lambda line: map(int, (line.strip()).split(' ')), question_data)
    raw_context_data = map(lambda line: (line.strip()).split(' '),
                           raw_context_data)
    max_len_p = min(max(map(len, p)), FLAGS.output_size)
    max_len_q = max(map(len, q))

    dataset = (p, raw_context_data, q, question_uuid_data)
    #dataset = (context_data, raw_context_data, question_data, question_uuid_data)
    train_p, raw_train_p, train_q, train_ans = \
            load_dataset("train", FLAGS.data_dir)
    train_padded_p, train_mask_p, train_padded_q, train_mask_q, train_ans = \
            preprocess_data((train_p, train_q, train_ans), "train",
                max_len_p, max_len_q)
    train_dataset = zip(train_padded_p, train_mask_p, train_padded_q,
                        train_mask_q, train_ans)

    # Print the flags actually in use
    print("loaded flags", vars(FLAGS))

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      flags=FLAGS,
                      max_len_p=max_len_p,
                      max_len_q=max_len_q)
    decoder = Decoder(output_size=FLAGS.output_size, flags=FLAGS)

    qa = QASystem(encoder, decoder, glove, max_len_p, max_len_q, FLAGS)
    # create saver
    qa.saver = tf.train.Saver()

    # train dir
    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        qa.raw_train = raw_train_p
        f1, em = qa.evaluate_answer(sess, train_dataset)
        logging.info("train total f1 {}, em {}".format(f1, em))

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
Example #2
def main(_):
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    FLAGS.sessname = "{:%Y%m%d_%H%M%S}".format(datetime.now())
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir,
                        "log{}.txt".format(FLAGS.sessname)))
    logging.getLogger().addHandler(file_handler)

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None
    train_p, raw_train_p, train_q, train_ans = \
            load_dataset("train", FLAGS.data_dir)
    val_p, raw_val_p, val_q, val_ans = \
            load_dataset("val", FLAGS.data_dir)

    # The computed maxima are immediately overridden by fixed truncation limits.
    max_len_p = max(max(map(len, train_p)), max(map(len, val_p)))
    max_len_p = FLAGS.output_size  # truncate contexts to the model's output size
    max_len_q = max(max(map(len, train_q)), max(map(len, val_q)))
    max_len_q = 60  # hard cap on question length, in case things go awry
    max_len_ans = max(map(len, train_ans))  # answer spans are (start, end), i.e. length 2

    train_padded_p, train_mask_p, train_padded_q, train_mask_q, train_ans = \
            preprocess_data((train_p, train_q, train_ans), "train",
                max_len_p, max_len_q)
    val_padded_p, val_mask_p, val_padded_q, val_mask_q, val_ans = \
            preprocess_data((val_p, val_q, val_ans), "val",
                max_len_p, max_len_q)

    t_len = FLAGS.train_set_size
    if t_len != -1:  # minibatch to check overfitting
        train_dataset = zip(train_padded_p[:t_len], train_mask_p[:t_len],
                            train_padded_q[:t_len], train_mask_q[:t_len],
                            train_ans[:t_len])
    else:  # regular version
        train_dataset = zip(train_padded_p, train_mask_p,
                            train_padded_q, train_mask_q, train_ans)
    FLAGS.num_iters = len(train_dataset)
    val_dataset = zip(val_padded_p, val_mask_p,
                    val_padded_q, val_mask_q, val_ans)
    raw_dataset = (raw_train_p, raw_val_p)
    dataset = (train_dataset, val_dataset, raw_dataset)
    logger.info("Sanity check on lengths: min %s, max %s" % \
            (lambda x: (min(x), max(x)))(map(len, train_padded_p)))

    logger.info("Loading glove embeddings...")
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embeddings = np.load(embed_path)
    glove = embeddings['glove']  # trimmed GloVe embedding matrix as a numpy array
    logger.info("glove dims {}".format(glove.shape))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
            flags=FLAGS,
            max_len_p=max_len_p, max_len_q=max_len_q)
    decoder = Decoder(output_size=FLAGS.output_size, flags=FLAGS)

    qa = QASystem(encoder, decoder, glove, max_len_p, max_len_q,
            FLAGS)
    # create saver
    qa.saver = tf.train.Saver()


    logger.info("{}".format(vars(FLAGS)))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        #qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
        f1, em = qa.evaluate_answer(sess, train_dataset, log=True)
        logger.info("final evaluation: F1: {}, EM: {}".format(f1, em))