def run_func():
    """Answer the questions in ``data/squad/fuse.json`` and save them as JSON.

    Everything is driven by a fresh ``Config()``: the vocabulary, the trimmed
    GloVe vectors and the checkpoint directory all come from it.  The result
    of ``generate_answers`` is written to ``temp/fuse-answer.json``.
    """
    cfg = Config()

    # ========= Load Dataset =========
    vocab_map, vocab_rev = initialize_vocab(cfg.vocab_path)

    source_json = "data/squad/fuse.json"
    contexts, questions, uuids = prepare_dev(
        os.path.dirname(os.path.abspath(source_json)),
        os.path.basename(source_json),
        vocab_map)

    # One [0, 0] pair per question -- presumably dummy answer spans so the
    # data has the same three-part layout as training data (TODO confirm).
    dummy_spans = [[0, 0] for _ in xrange(len(questions))]
    model_input = [questions, contexts, dummy_spans]

    # ========= Build Model =========
    glove = get_trimmed_glove_vectors(cfg.embed_path)
    model = QASystem(Encoder(cfg.hidden_state_size),
                     Decoder(cfg.hidden_state_size),
                     glove,
                     cfg)

    with tf.Session() as session:
        model.initialize_model(session, cfg.train_dir)
        predictions, _ = generate_answers(session, model, model_input, uuids,
                                          vocab_rev)
        # The output path is relative to the current working directory.
        with io.open('temp/fuse-answer.json', 'w', encoding='utf-8') as out:
            out.write(unicode(json.dumps(predictions, ensure_ascii=False)))
Exemple #2
0
def main(_):
    """Train the QA model on the preprocessed data under ``FLAGS.data_dir``.

    Loads the train/validation data, vocabulary and embeddings, builds the
    encoder, decoder and ``QASystem``, records the flags under
    ``FLAGS.log_dir`` and finally runs ``qa.train`` in a TensorFlow session.
    """
    train_set, val_set = load_preprocess_data(FLAGS.data_dir,
                                              FLAGS.max_context_len,
                                              FLAGS.max_question_len)

    # Use the default file locations when the corresponding flag is empty.
    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")

    _, rev_vocab = initialize_vocab(vocab_file)
    embedding_matrix = tf.constant(load_embeddings(glove_path), tf.float32)

    model = QASystem(
        Encoder(FLAGS.state_size, FLAGS.summary_flag, FLAGS.max_context_len,
                FLAGS.max_question_len),
        Decoder(FLAGS.state_size, FLAGS.summary_flag),
        FLAGS,
        embedding_matrix,
        rev_vocab)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    # Keep a copy of the flag values next to the log.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flag_file:
        json.dump(FLAGS.__flags, flag_file)

    with tf.Session() as session:
        # The same directory is used for restoring and for saving.
        restore_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(session, model, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        model.train(session, train_set, val_set, checkpoint_dir)
def run_func2(dataset, config):
    """Predict an answer for each example and write one per line.

    Args:
        dataset: iterable of three-element items; each is unpacked into
            ``(_q, _c, _a)`` and the columns are regrouped before being
            handed to ``generate_answers``.
        config: object providing ``vocab_path``, ``embed_path``,
            ``hidden_state_size`` and ``train_dir``.

    The predictions go to ``dev-prediction.txt`` in the current directory,
    in example order.
    """
    _, rev_vocab = initialize_vocab(config.vocab_path)

    # Transpose the per-example triples into three parallel columns.
    questions, contexts, spans = zip(
        *[(ques, ctx, ans) for ques, ctx, ans in dataset])
    columns = [questions, contexts, spans]

    glove = get_trimmed_glove_vectors(config.embed_path)
    model = QASystem(Encoder(config.hidden_state_size),
                     Decoder(config.hidden_state_size),
                     glove,
                     config)

    # No real question ids here, so the position in the dataset is the id.
    example_count = len(spans)
    position_ids = list(xrange(example_count))

    with tf.Session() as session:
        model.initialize_model(session, config.train_dir)
        predictions, _ = generate_answers(session, model, columns,
                                          position_ids, rev_vocab)
        with io.open('dev-prediction.txt', 'w', encoding='utf-8') as out:
            for idx in xrange(example_count):
                out.write("%s\n" % unicode(predictions[idx], "utf-8"))
Exemple #4
0
def main(_):
    """Run the restored model over ``FLAGS.dev_path`` and dump the answers.

    The flags are logged and saved under ``FLAGS.log_dir``.  The answers
    returned by ``generate_answers`` are written to
    ``dev-prediction_2.json`` in the current directory.
    """
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flag_file:
        json.dump(FLAGS.__flags, flag_file)

    # prepare_dev takes the directory and the file name separately.
    dev_dir = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_file = os.path.basename(FLAGS.dev_path)
    contexts, questions, uuids, raw_contexts = prepare_dev(
        dev_dir, dev_file, vocab)
    dev_set = (contexts, questions, uuids)

    id_to_word = data_utils.invert_map(vocab)
    model = QASystem(FLAGS, glove_path, id_to_word, False, 0, True)

    with tf.Session() as session:
        initialize_model(session, model, FLAGS.train_dir)
        # The returned pair is not used afterwards; the call is kept as is.
        _start, _end = model.test(session, dev_set)

        predictions = generate_answers(session, model, dev_set, rev_vocab,
                                       raw_contexts, id_to_word)

        with io.open('dev-prediction_2.json', 'w', encoding='utf-8') as out:
            out.write(unicode(json.dumps(predictions, ensure_ascii=False)))
def run_func():
    """Answer ``download/squad/test.json`` and write ``submission.csv``.

    The CSV starts with the header ``Id,Answer``.  Each following row is the
    question id, a comma, and the normalised answer in which every ``" s "``
    has been replaced by ``"s "``.
    """
    cfg = Config()
    vocab_map, vocab_rev = initialize_vocab(cfg.vocab_path)

    source_json = "download/squad/test.json"
    contexts, questions, uuids = prepare_dev(
        os.path.dirname(os.path.abspath(source_json)),
        os.path.basename(source_json),
        vocab_map)

    # One [0, 0] pair per question -- presumably dummy answer spans
    # (TODO confirm against generate_answers).
    dummy_spans = [[0, 0] for _ in xrange(len(questions))]
    model_input = [questions, contexts, dummy_spans]

    glove = get_trimmed_glove_vectors(cfg.embed_path)
    model = QASystem(Encoder(cfg.hidden_state_size),
                     Decoder(cfg.hidden_state_size),
                     glove,
                     cfg)

    rows = ["Id,Answer\n"]

    with tf.Session() as session:
        model.initialize_model(session, cfg.train_dir)
        predictions, _ = generate_answers(session, model, model_input, uuids,
                                          vocab_rev)
        for uuid, text in predictions.items():
            cleaned = normalize_answer(text).replace(" s ", "s ")
            rows.append(uuid + "," + cleaned + "\n")

    # The file is written only after the session has been closed.
    with open('submission.csv', 'wb') as out:
        out.write("".join(rows))
Exemple #6
0
def main(_):
    """Train the QA model from the id files under ``./data/squad``.

    Training data is not loaded here: ``qa.train`` receives the
    ``batch_generator`` callable together with ``info``, the tuple of
    arguments describing the training files and batch settings.
    Validation data is loaded and padded up front.
    """
    # Training files.
    train_questions = "./data/squad/train.ids.question"
    train_contexts = "./data/squad/train.ids.context"
    train_spans = "./data/squad/train.span"

    # Validation files.
    val_questions = "./data/squad/val.ids.question"
    val_contexts = "./data/squad/val.ids.context"
    val_spans = "./data/squad/val.span"

    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")

    # The 'glove' array of the .npz archive, cast to float32.
    glove = np.load(glove_path)['glove'].astype(np.float32)
    val_set = load_and_pad_val_data(val_questions, val_contexts, val_spans)

    _, rev_vocab = initialize_vocab(vocab_file)

    # Arguments bundled for qa.train together with batch_generator.
    # (A debugging loop that drained batch_generator and logged the batch
    # count used to sit here as a disabled string literal.)
    generator_args = (train_questions, train_contexts, train_spans,
                      FLAGS.batch_size, FLAGS.max_length, FLAGS.output_size)

    model = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.output_size),
        FLAGS,
        glove)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flag_file:
        json.dump(FLAGS.__flags, flag_file)

    with tf.Session() as session:
        # Restore from load_train_dir when given, otherwise from train_dir.
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                               or FLAGS.train_dir)
        initialize_model(session, model, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        model.train(session, batch_generator, generator_args, checkpoint_dir,
                    val_set, rev_vocab)
Exemple #7
0
def predict(model_name):
    """Interactively answer questions with the model called ``model_name``.

    Prompts forever; for each question the ten best candidates returned by
    ``answer_question`` are printed as ``confidence: answer (doc)``.
    """
    row_template = '{:.2f}:\t{}    ({})'
    model = QASystem(model_name)

    with tf.Session() as session:
        model.initialize_model(session)
        while True:
            asked = input("Ask a question: ")
            candidates = answer_question(model, session, asked, best_n=10)
            for text, score, source in candidates:
                print(row_template.format(score, text, source))
def main(_):
    """Load the train/val data, build the QA model and train it.

    The TensorFlow session is created before ``QASystem`` so that
    ``FLAGS.ifgpu`` can be set from the list of local devices first.  The
    session stays open for the whole run and is closed when training ends
    or fails.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {}
    num_train = load_data_dq(dataset, 'train', FLAGS.data_dir)
    num_val = load_data_dq(dataset, 'val', FLAGS.data_dir)
    load_data_sa(dataset, 'train', FLAGS.data_dir, num_train)
    load_data_sa(dataset, 'val', FLAGS.data_dir, num_val)
    trim(dataset['train'])
    trim(dataset['val'])

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Session moved upfront to set the ifgpu flag before QASystem.
    # It is deliberately NOT opened with `with ... : pass`: leaving a `with`
    # block closes the session, and it is still needed for
    # initialize_model() and qa.train() below.
    sess = tf.Session()
    try:
        local_device_protos = device_lib.list_local_devices()
        if any(dev.device_type == 'GPU' for dev in local_device_protos):
            FLAGS.ifgpu = True

        qa = QASystem(encoder, decoder, embed_path, rev_vocab)

        # Mirror log records into <log_dir>/log.txt.
        if not os.path.exists(FLAGS.log_dir):
            os.makedirs(FLAGS.log_dir)
        file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        # To debug with tfdbg, wrap the session here:
        #   sess = tfdbg.LocalCLIDebugWrapperSession(sess)
        #   sess.add_tensor_filter("has_inf_or_nan", tfdbg.has_inf_or_nan)

        # Restore from load_train_dir when given, otherwise from train_dir.
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
    finally:
        # Release the session's resources even if setup or training raised.
        sess.close()
Exemple #9
0
def main(_):
    """Run ``qa.train`` on the small ``test.*`` split as a sanity check.

    Questions are padded to 40 tokens and contexts to 600.  The padded data,
    the sequence lengths and the answer spans are passed to ``qa.train``
    with ``small_data_test=True``.
    """
    question_max_len = 40
    context_max_len = 600

    # Preprocess and collect small test data
    q_data, q_lengths = pad_sentences(
        pjoin(FLAGS.data_dir, "test.ids.question"), question_max_len)
    assert not any(
        q_lengths > q_data.shape[1]
    ), 'Some questions have length greater than max question length'

    c_data, c_lengths = pad_sentences(
        pjoin(FLAGS.data_dir, "test.ids.context"), context_max_len)
    assert not any(
        c_lengths > c_data.shape[1]
    ), 'Some contexts have length greater than max context length'

    span_ids = get_answer_span(pjoin(FLAGS.data_dir, "test.span"),
                               context_max_len)
    small_set = [q_data, q_lengths, c_data, c_lengths, span_ids]

    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")
    # The vocabulary is loaded but not used further in this function.
    initialize_vocab(vocab_file)

    enc = Encoder(state_size=FLAGS.state_size,
                  vocab_dim=FLAGS.embedding_size)
    dec = Decoder(state_size=FLAGS.state_size)

    model = QASystem(enc, dec, question_max_len, context_max_len, glove_path,
                     FLAGS.learning_rate, FLAGS.batch_size, FLAGS.dropout,
                     FLAGS.optimizer,
                     FLAGS.max_gradient_norm)  # try without dropout

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flag_file:
        json.dump(FLAGS.__flags, flag_file)

    with tf.Session() as session:
        # Restore from load_train_dir when given, otherwise from train_dir.
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                               or FLAGS.train_dir)
        initialize_model(session, model, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        model.train(session, small_set, checkpoint_dir, small_data_test=True)
Exemple #10
0
def main(_):
    """Interactive demo: read a context and a question, print the answer.

    The model is restored from ``FLAGS.ckpt``.  The loop never ends on its
    own; each round tokenises both inputs with ``nltk.word_tokenize``, asks
    the model for a start/end position and prints the matching context
    tokens.
    """
    banner = '*********************************************************************'
    rule = '-------------------'
    answer_rule = '=========================================='

    def _ask(prompt):
        # Print the framed prompt, then read one line from stdin.
        print(rule)
        print(prompt)
        print(rule)
        return raw_input()

    data_dir = cfg.DATA_DIR
    vocab, _ = initialize_vocab(FLAGS.vocab)

    # gpu setting
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    model = QASystem(Encoder(size=2 * cfg.lstm_num_hidden),
                     Decoder(output_size=2 * cfg.lstm_num_hidden),
                     FLAGS.embed)

    with tf.Session(config=session_config) as session:
        session.run(tf.global_variables_initializer())
        initialize_model(session, model, get_normalized_train_dir(FLAGS.ckpt))

        print(banner)
        print(
            "Welcome! You can use this to explore the behavior of the model.")
        print(banner)

        while True:
            context_text = _ask('Input the context: ')
            question_text = _ask('Input the question: ')

            # Raw tokens are kept so the answer can be printed as text.
            context_tokens = nltk.word_tokenize(context_text)
            context_ids = sentence_to_token_ids(
                context_text, vocab, tokenizer=nltk.word_tokenize)
            question_ids = sentence_to_token_ids(
                question_text, vocab, tokenizer=nltk.word_tokenize)

            context_in = mask_input(context_ids, cfg.context_max_len)
            question_in = mask_input(question_ids, cfg.question_max_len)

            # A batch of one: take element 0 of the returned positions.
            start, end = model.answer(session, [context_in], [question_in])
            span = context_tokens[start[0]:end[0] + 1]

            print(answer_rule)
            print('ANSWER: {}'.format(' '.join(span)))
            print(answer_rule)
Exemple #11
0
def main(_):
    """Train the QA model on the "train" split, evaluating on "val".

    Log output is mirrored to ``<log_dir>/log.txt`` and the flags are saved
    to ``<log_dir>/flags.json``.  Checkpoints are accessed through the fixed
    symlink ``/tmp/cs224n-squad-train`` that points at ``FLAGS.train_dir``.
    """
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    logger = logging.getLogger()

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Each split is transposed from parallel columns into per-example tuples.
    train_data = zip(*load_data(FLAGS.data_dir, "train"))
    val_data = zip(*load_data(FLAGS.data_dir, "val"))
    # NOTE: the "dev" split is loaded but not used below.
    dev_data = zip(*load_data(FLAGS.data_dir, "dev"))

    #model_train_data = train_data + val_data + dev_data
    model_train_data = train_data
    model_eval_data = val_data

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    #
    # lexists() rather than exists(): exists() follows the link and reports
    # False for a dangling symlink (e.g. the previous train_dir was deleted),
    # which would leave the stale link in place and make os.symlink() fail.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Graph().as_default():
        with tf.Session() as sess:
            logger.info("Loading embeddings")
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' +
                                 str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']
            # The two-placeholder format requires a 2-D embedding matrix.
            logger.info("Embeddings loaded with shape: %s %s" %
                        (pretrained_embeddings.shape))

            qa = QASystem(FLAGS,
                          pretrained_embeddings,
                          vocab_dim=len(vocab.keys()))
            initialize_model(sess, qa, train_dir)
            qa.train(sess, model_train_data, model_eval_data)
Exemple #12
0
def main(_):
    """Train the QA model on the token/index files under ``FLAGS.data_dir``.

    Training files are read from ``data_train/`` and validation files from
    ``data_dev/``.  Both are loaded with ``load_dataset`` using
    ``FLAGS.batch_size``.
    """
    def _load_split(folder, prefix):
        # Question (H), paragraph (P) and answer-index files of one split.
        return load_dataset(
            pjoin(FLAGS.data_dir, "%s/%s_tokenH.txt" % (folder, prefix)),
            pjoin(FLAGS.data_dir, "%s/%s_tokenP.txt" % (folder, prefix)),
            pjoin(FLAGS.data_dir, "%s/%s_index.txt" % (folder, prefix)),
            FLAGS.batch_size)

    train_set = _load_split("data_train", "train")
    val_set = _load_split("data_dev", "dev")

    # loads embedding
    # The resolved path is written back onto FLAGS; QASystem only gets FLAGS.
    FLAGS.embed_path = FLAGS.embed_path or pjoin("data", "sgns.merge.word.npz")
    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.txt")
    _, rev_vocab = initialize_vocab(vocab_file)  # one is list and one is dict

    model = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(size=FLAGS.state_size, output_size=FLAGS.output_size),
        FLAGS)

    # log file
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flag_file:
        json.dump(FLAGS.__flags, flag_file)

    # start training
    with tf.Session() as session:
        # Restore from load_train_dir when given, otherwise from train_dir.
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                               or FLAGS.train_dir)
        initialize_model(session, model, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        model.train(session, train_set, val_set, checkpoint_dir, rev_vocab)
Exemple #13
0
def run_func():
    """Interactive loop: read a question and a context, print the answer.

    The model is restored from ``config.train_dir``.  Typing ``exit`` at
    either prompt ends the loop.  Input is split on whitespace, and words
    missing from the vocabulary are mapped to id 2.
    """
    cfg = Config()
    # The datasets are built as in the original script, although the
    # training/evaluation calls that consumed them are disabled.
    train = squad_dataset(cfg.question_train, cfg.context_train,
                          cfg.answer_train)
    dev = squad_dataset(cfg.question_dev, cfg.context_dev, cfg.answer_dev)

    glove_path = cfg.embed_path
    vocab, _ = initialize_vocab(cfg.vocab_path)
    glove = get_trimmed_glove_vectors(glove_path)

    model = QASystem(Encoder(cfg.hidden_size), Decoder(cfg.hidden_size),
                     glove, cfg)

    def _to_ids(text):
        # Whitespace tokens -> vocabulary ids, 2 for unknown words.
        return [vocab.get(word, 2) for word in text.split()]

    with tf.Session() as session:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        model.initialize_model(session, cfg.train_dir)
        # train process (disabled):
        #   qa.train(sess, [train, dev], config.train_dir)
        #   em = qa.evaluate_model(sess, dev)

        # run process
        while True:
            asked = input('please input question: ')
            if asked == 'exit':
                break
            raw_context = input('please input context: ')
            if raw_context == 'exit':
                break

            # Batch of one example; [1, 2] fills the answer slot.
            batch = [[_to_ids(asked)], [_to_ids(raw_context)], [[1, 2]]]
            a_s, a_e = model.answer(session, batch)

            words = raw_context.split()
            if a_e == a_s:
                print("answer: ", words[a_s[0]])
            else:
                print("answer: ", ' '.join(words[a_s[0]:a_e[0] + 1]))
Exemple #14
0
def main(_):
    """Answer the dev set at ``FLAGS.dev_path``; write ``dev-prediction.json``.

    Contexts are padded and cut to ``FLAGS.output_size`` tokens, questions to
    ``FLAGS.question_size``.  The arrays, masks and question ids are passed
    to ``generate_answers`` and its result is dumped as JSON.
    """
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flag_file:
        json.dump(FLAGS.__flags, flag_file)

    # ========= Load Dataset =========
    dev_dir = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_file = os.path.basename(FLAGS.dev_path)
    contexts, questions, uuids = prepare_dev(dev_dir, dev_file, vocab)

    context_ids, ctx_mask = pad(contexts, FLAGS.output_size)
    question_ids, q_mask = pad(questions, FLAGS.question_size)

    # Cut rows that are still longer than the limit after padding.
    for row_idx, row in enumerate(context_ids):
        context_ids[row_idx] = row[:FLAGS.output_size]
    for row_idx, row in enumerate(question_ids):
        question_ids[row_idx] = row[:FLAGS.question_size]

    context_ids = np.array(context_ids)
    question_ids = np.array(question_ids)
    ctx_mask = np.array(ctx_mask)
    q_mask = np.array(q_mask)
    # Need this because minibatches is expecting it this way
    dummy_spans = np.array([(0, 0)] * len(context_ids))

    dev_set = [
        context_ids, question_ids, dummy_spans, ctx_mask, q_mask, uuids
    ]
    glove = initialize_embeddings(glove_path)

    # ========= Model-specific =========
    model = QASystem(pretrained_embeddings=glove, flags=FLAGS)
    with tf.Session() as session:
        initialize_model(session, model,
                         get_normalized_train_dir(FLAGS.train_dir))
        predictions = generate_answers(session, model, dev_set, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as out:
            out.write(unicode(json.dumps(predictions, ensure_ascii=False)))
Exemple #15
0
def main(_):
    """Answer the dev set with an expanded vocabulary.

    ``expand_vocab`` is given the trimmed embeddings plus the untrimmed GloVe
    data and returns a new vocabulary and embedding matrix, which are used
    for the rest of the run.  The answers are written to
    ``dev-prediction.json``.
    """
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flag_file:
        json.dump(FLAGS.__flags, flag_file)

    # ========= Load Dataset =========
    dev_dir = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_file = os.path.basename(FLAGS.dev_path)

    glove = load_glove_embeddings(glove_path)

    # The untrimmed archive holds the vectors and a pickled vocab dict;
    # indexing the 0-d object array with [()] unwraps that dict.
    untrimmed = np.load(pjoin(
        "data", "squad",
        "glove.untrimmed.{}.npz".format(FLAGS.embedding_size)))
    full_glove = untrimmed['glove']
    full_glove_vocab = untrimmed['glove_vocab_dict'][()]

    # expand vocab
    vocab, rev_vocab, glove = expand_vocab(dev_dir, dev_file, vocab, glove,
                                           full_glove, full_glove_vocab)

    contexts, questions, uuids = prepare_dev(dev_dir, dev_file, vocab)
    context_lengths = [len(text.split()) for text in contexts]
    model_input = preprocessing(contexts, questions, FLAGS.context_maxlen,
                                FLAGS.question_maxlen)
    dev_set = (model_input, contexts, context_lengths, uuids)

    # ========= Model-specific =========
    model = QASystem(glove, FLAGS)

    with tf.Session() as session:
        initialize_model(session, model,
                         get_normalized_train_dir(FLAGS.train_dir))
        predictions = generate_answers(session, model, dev_set, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as out:
            out.write(unicode(json.dumps(predictions, ensure_ascii=False)))
Exemple #16
0
def main(_):
    """Generate answers for the validation split with the ``train/test`` model.

    The checkpoint directory is hard-coded.  The output of
    ``generate_answers`` is handed to ``store_result`` together with the
    normalised checkpoint directory.
    """
    #======Fill the model name=============
    model_dir = "train/test"
    #======================================
    initialize_vocab(FLAGS.vocab_path)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    print(vars(FLAGS))

    # ========= Load Dataset =========
    _train_set, val_set = load_and_preprocess_data(
        FLAGS.data_dir,
        FLAGS.max_context_len,
        FLAGS.max_question_len,
        size=FLAGS.train_size)

    # ========= Model-specific =========
    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")
    # Second vocabulary load; this one honours the data_dir fallback.
    _, rev_vocab = initialize_vocab(vocab_file)

    embedding_matrix = tf.constant(load_embeddings(glove_path),
                                   dtype=tf.float32)
    enc = Encoder(FLAGS.state_size, FLAGS.max_context_len,
                  FLAGS.max_question_len, FLAGS.embedding_size,
                  FLAGS.summary_flag, FLAGS.filter_flag)
    dec = Decoder(FLAGS.state_size, FLAGS.max_context_len,
                  FLAGS.max_question_len, FLAGS.output_size,
                  FLAGS.summary_flag)
    model = QASystem(enc, dec, FLAGS, embedding_matrix, rev_vocab)

    with tf.Session() as session:
        model_dir = get_normalized_train_dir(model_dir)
        # initialize_model's return value replaces the model object here.
        model = initialize_model(session, model, model_dir)
        answer_list, answer_dict = generate_answers(session, model, val_set,
                                                    rev_vocab)
        store_result(answer_list, answer_dict, model_dir)
Exemple #17
0
def main(_):
    """Train the QA model on the data returned by ``load_dataset()``.

    The model hyper-parameters are taken from FLAGS and passed to
    ``QASystem`` as keyword arguments.  ``dropout_keep_prob`` is derived as
    ``1.0 - FLAGS.dropout``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    data = load_dataset()

    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")
    # The vocabulary is loaded but not used further in this function.
    initialize_vocab(vocab_file)

    enc = Encoder(state_size=FLAGS.state_size,
                  embedding_size=FLAGS.embedding_size,
                  output_size=FLAGS.output_size)
    dec = Decoder(state_size=FLAGS.state_size, output_size=FLAGS.output_size)

    model = QASystem(enc,
                     dec,
                     embed_path=glove_path,
                     embedding_size=FLAGS.embedding_size,
                     output_size=FLAGS.output_size,
                     optimizer=FLAGS.optimizer,
                     learning_rate=FLAGS.learning_rate,
                     epochs=FLAGS.epochs,
                     batch_size=FLAGS.batch_size,
                     max_gradient_norm=FLAGS.max_gradient_norm,
                     dropout_keep_prob=1.0 - FLAGS.dropout,
                     train_dir=FLAGS.train_dir,
                     state_size=FLAGS.state_size)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")))

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as flag_file:
        json.dump(FLAGS.__flags, flag_file)

    with tf.Session() as session:
        # Restore from load_train_dir when given, otherwise from train_dir.
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                               or FLAGS.train_dir)
        initialize_model(session, model, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        model.train(session, data, checkpoint_dir)
def evaluate(model_name, n=None):
    """Score a model on the TREC question file and print top-k accuracy.

    Each question's ten best answers are written as one row of
    ``<MODELS_DIR>/<model_name>/trec.csv``.  An answer counts as correct
    when the question's regular-expression pattern is found in it.  Running
    accuracy is printed after every question and the final top-1 / top-5 /
    top-10 accuracy at the end.

    Args:
        model_name: name passed to ``QASystem``; also the sub-directory of
            ``config.MODELS_DIR`` that receives ``trec.csv``.
        n: evaluate only ``data[:n]``; ``None`` or ``0`` means every
            question.

    The final ratios are computed over the number of questions actually
    evaluated, so an ``n`` larger than the data set (or a negative ``n``)
    no longer skews them, and an empty evaluation reports 0.0 instead of
    raising ``ZeroDivisionError``.
    """
    with open(config.TREC_PATH, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        # The first 199 rows are skipped; columns 2 and 3 hold the question
        # and its answer pattern.
        data = [(row[2].strip(), row[3].strip())
                for row in list(reader)[199:]]

    if not n:
        n = len(data)
    selected = data[:n]
    evaluated = len(selected)

    qa = QASystem(model_name)

    top_count = 0
    top_5_count = 0
    top_10_count = 0
    with tf.Session() as sess:
        qa.initialize_model(sess)

        with open(os.path.join(config.MODELS_DIR, model_name, 'trec.csv'),
                  'w') as f:
            writer = csv.writer(f)
            for i, (question, answer_pattern) in enumerate(tqdm(selected), 1):
                answers = [
                    answer for answer, _confidence, _doc in answer_question(
                        qa, sess, question, 10)
                ]
                writer.writerow(answers)
                correct = [
                    bool(re.search(answer_pattern, answer))
                    for answer in answers
                ]
                if any(correct[:1]):
                    top_count += 1
                if any(correct[:5]):
                    top_5_count += 1
                if any(correct[:10]):
                    top_10_count += 1
                print('{}: {}, {}, {}'.format(i,
                                              float(top_count) / i,
                                              float(top_5_count) / i,
                                              float(top_10_count) / i))

    # With nothing evaluated every count is 0, so dividing by 1 yields 0.0.
    denominator = float(evaluated) if evaluated else 1.0
    print('Top match: {}'.format(top_count / denominator))
    print('Top 5 match: {}'.format(top_5_count / denominator))
    print('Top 10 match: {}'.format(top_10_count / denominator))
Exemple #19
0
def main(_):
    """Answer the dev-set questions and write them to ``dev-prediction.json``.

    Steps: load the vocabulary, point the canonical checkpoint symlink
    ``/tmp/cs224n-squad-train`` at ``FLAGS.train_dir``, set up file logging
    and dump the flags under ``FLAGS.log_dir``, prepare the dev data from
    ``FLAGS.dev_path``, load the GloVe matrix, restore the model and write
    the generated answers as JSON.

    Args:
        _: Unused positional argument.
    """
    # Load the vocabulary once, with the data_dir fallback applied up front
    # so prepare_dev() and generate_answers() see the same vocabulary.
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embed_path = FLAGS.embed_path or "data/squad/glove.trimmed.{}.npz".format(FLAGS.embedding_size)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    # lexists() also detects a dangling symlink left behind by an earlier run
    # (exists() returns False for it and os.symlink() would then fail).
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)

    context_data, question_data, question_uuid_data = prepare_dev(dev_dirname, dev_filename, vocab)
    dataset = load_data(context_data, question_data, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    embeddingz = np.load(embed_path)
    embeddings = embeddingz['glove']
    embeddingz.close()

    assert embeddings.shape[1] == FLAGS.embedding_size

    qa = QASystem(train_dir, embeddings)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def main(_):
    """Interactive demo: read a context and a question, print the answer span.

    Restores the model from ``FLAGS.ckpt`` and then loops forever, reading a
    context and a question from standard input, tokenizing both with
    ``nltk.word_tokenize``, and printing the tokens of the context between
    the predicted start and end positions.  The loop ends cleanly on
    end-of-input (Ctrl-D) or Ctrl-C while waiting for input.

    Args:
        _: Unused positional argument.
    """
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, FLAGS.embed)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(FLAGS.ckpt)
        initialize_model(sess, qa, load_train_dir)
        print('*********************************************************************')
        print("Welcome! You can use this to explore the behavior of the model.")
        print('*********************************************************************')

        while True:
            try:
                print('-------------------')
                print('Input the context: ')
                print('-------------------')
                sentence = raw_input()
                print('-------------------')
                print('Input the question: ')
                print('-------------------')
                query = raw_input()
            except (EOFError, KeyboardInterrupt):
                # No more input (closed stdin / Ctrl-D) or the user pressed
                # Ctrl-C: leave the loop instead of dying with a traceback.
                break
            # The raw tokens are kept so the answer can be printed as text.
            raw_context = nltk.word_tokenize(sentence)
            context = sentence_to_token_ids(sentence, vocab, tokenizer=nltk.word_tokenize)
            question = sentence_to_token_ids(query, vocab, tokenizer=nltk.word_tokenize)
            context_in = mask_input(context, cfg.context_max_len)
            question_in = mask_input(question, cfg.question_max_len)
            start, end = qa.answer(sess, [context_in], [question_in])
            # A single example was submitted, so use the first prediction.
            answer = ' '.join(raw_context[start[0]: end[0] + 1])
            print('==========================================')
            print('ANSWER: {}'.format(answer))
            print('==========================================')
Exemple #21
0
def main(_):
    """Load the train/validation data, build the QA model and train it.

    Random seeds for TensorFlow and NumPy are fixed before the graph is
    built.  Logging goes to ``<FLAGS.log_dir>/log.txt`` and the flags are
    dumped to ``<FLAGS.log_dir>/flags.json``.  When the environment variable
    ``CS224N_CRIS`` is set the session is created with
    ``intra_op_parallelism_threads=1``.

    Args:
        _: Unused positional argument.
    """
    logging.info("Loading training data")
    dataset_train = load_train_data(FLAGS.data_dir, isValidation=False)
    logging.info("Loading validation data")
    dataset_val = load_train_data(FLAGS.data_dir, isValidation=True)

    logging.info("Building Model Graph")
    tf.set_random_seed(42)
    np.random.seed(43)

    select_test(0)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = SimpleLinearDecoder()  # AnswerPointerDecoder()

    qa = QASystem(encoder, decoder, len(dataset_train[0]))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    logging.info(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    cris_flag = os.environ.get('CS224N_CRIS')

    if cris_flag is not None:
        logging.info('hi cris')
        sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1))
    else:
        sess = tf.Session()

    # Close the session even when model restoration or training raises.
    try:
        with sess.as_default():
            load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
            initialize_model(sess, qa, load_train_dir)

            save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
            qa.train(sess, dataset_train, dataset_val, save_train_dir)
    finally:
        sess.close()
Exemple #22
0
def main(args):
    """Train the QA system on SQuAD and evaluate it on the validation split.

    Loads the GloVe matrix, the train/val samples and the vocabulary.  When
    ``FLAGS.verify_only`` is set, only a few training samples are printed
    and the function returns.  Otherwise the canonical checkpoint symlink
    ``/tmp/cs224n-squad-train`` is pointed at ``FLAGS.train_dir``, logging is
    set up under ``FLAGS.log_dir``, and the model is trained and evaluated.

    Args:
        args: Positional arguments handed to the entry point; not used.
    """
    embed_path = FLAGS.embed_path or "data/squad/glove.trimmed.{}.npz".format(
        FLAGS.embedding_size)
    embeddingz = np.load(embed_path)
    embeddings = embeddingz['glove']
    embeddingz.close()
    assert embeddings.shape[1] == FLAGS.embedding_size

    vocab_len = embeddings.shape[0]

    train = load_squad(FLAGS.data_dir,
                       "train",
                       max_vocab=vocab_len if FLAGS.check_embeddings else 0,
                       max_samples=FLAGS.max_train_samples)
    val = load_squad(FLAGS.data_dir,
                     "val",
                     max_vocab=vocab_len if FLAGS.check_embeddings else 0,
                     max_samples=FLAGS.max_val_samples)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if FLAGS.verify_only:
        print_samples(train, FLAGS.verify_only, rev_vocab)

        return

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    # lexists() also detects a dangling symlink left behind by an earlier run
    # (exists() returns False for it and os.symlink() would then fail).
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    qa = QASystem(train_dir, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)

        qa.train(sess, train)

        qa.evaluate_answer(sess, qa.preprocess_sequence_data(val), log=True)
Exemple #23
0
def main(_):
    """Generate dev-set answers and store them in ``dev-prediction.json``.

    The vocabulary comes from ``FLAGS.vocab_path`` and the dev data from
    ``FLAGS.dev_path``.  ``FLAGS.embed_path`` is filled in with the default
    trimmed GloVe location when it is empty.  Flags are printed and dumped
    to ``<FLAGS.log_dir>/flags.json``; the model is restored from
    ``FLAGS.train_dir``.

    Args:
        _: Unused positional argument.
    """
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    FLAGS.embed_path = (
        FLAGS.embed_path
        or pjoin("data", "squad",
                 "glove.trimmed.{}.npz".format(FLAGS.embedding_size)))

    # Logging: make sure the directory exists, then attach a file handler.
    log_dir = FLAGS.log_dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(log_dir, "log.txt")))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ---- Dev data -------------------------------------------------------
    dev_dir = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_file = os.path.basename(FLAGS.dev_path)
    contexts, questions, question_uuids = prepare_dev(dev_dir, dev_file,
                                                      vocab)
    dataset = dict(val_context=contexts,
                   val_questions=questions,
                   val_question_uuids=question_uuids)

    # ---- Model ----------------------------------------------------------
    qa = QASystem(
        Encoder(size=FLAGS.state_size,
                vocab_dim=FLAGS.embedding_size,
                FLAGS=FLAGS),
        Decoder(FLAGS=FLAGS),
        FLAGS)

    with tf.Session() as sess:
        # The checkpoint directory is used as given (not normalized).
        train_dir = FLAGS.train_dir
        print("train_dir: ", train_dir)
        initialize_model(sess, qa, train_dir)

        print("Generating Answers")
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # The predictions end up in the current working directory.
        print("Writing to json file")
        serialized = json.dumps(answers, ensure_ascii=False)
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(serialized))
Exemple #24
0
def main(_):
    """Train the QA system on the 'train' split and evaluate it on 'val'.

    Flags are printed and dumped to ``<FLAGS.log_dir>/flags.json``.  The
    model is restored from ``FLAGS.load_train_dir`` (falling back to
    ``FLAGS.train_dir``), trained, and then evaluated on 1000 samples.

    Args:
        _: Unused positional argument.
    """
    # Both splits are read from FLAGS.data_dir.
    train_set = initialize_datasets(FLAGS.data_dir, 'train')
    val_set = initialize_datasets(FLAGS.data_dir, 'val')

    embed_path = (
        FLAGS.embed_path
        or pjoin("data", "squad",
                 "glove.trimmed.{}.npz".format(FLAGS.embedding_size)))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    qa = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.output_size),
        embed_path,
        rev_vocab,
        FLAGS)

    # Logging: make sure the directory exists, then attach a file handler.
    log_dir = FLAGS.log_dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(log_dir, "log.txt")))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_set, checkpoint_dir)

        qa.evaluate_answer(sess, val_set, sample=1000, log=True)
Exemple #25
0
def main(_):
    """Train the QA system on the pre-tokenized train/val id files.

    The dataset is a two-element list (train, val), each loaded by
    ``load_dataset`` from a question-ids file, a context-ids file and a span
    file.  The GloVe matrix is read from the ``embedding`` directory.
    ``FLAGS.train_dir`` and ``FLAGS.log_dir`` are created when missing, and
    the flags are printed and dumped to ``<FLAGS.log_dir>/flags.json``.

    Args:
        _: Unused positional argument.
    """
    dataset = [
        load_dataset([
            "embedding/train.ids.question", "embedding/train.ids.context",
            "data/train.span"
        ]),
        load_dataset([
            "embedding/val.ids.question", "embedding/val.ids.context",
            "data/val.span"
        ])
    ]

    embed_path = pjoin("embedding",
                       "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    tf.reset_default_graph()
    encoder = Encoder(FLAGS.num_hidden_unit, tf.contrib.rnn.GRUCell)
    decoder = Decoder(FLAGS.num_hidden_unit, tf.contrib.rnn.BasicLSTMCell)

    # np.load() on an .npz keeps the archive open; read the array and close
    # the archive explicitly so the file handle is not leaked.
    embedding_file = np.load(embed_path)
    try:
        embedding = embedding_file["glove"]
    finally:
        embedding_file.close()

    qa = QASystem(encoder, decoder, embedding, FLAGS.keep_prob)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.epochs, FLAGS.batch_size,
                 FLAGS.train_dir)
def main(_):
    """Build the QA system from FLAGS and train it.

    The dataset is obtained from ``get_dataset``.  ``FLAGS.embed_path`` is
    overwritten with the resolved embedding path.  Flags are printed and
    dumped to ``<FLAGS.log_dir>/flags.json``.  Checkpoint directories are
    used as given (not normalized).

    Args:
        _: Unused positional argument.
    """
    # Datasets are read from FLAGS.data_dir.
    dataset = get_dataset(FLAGS.data_dir,
                          FLAGS.max_question_size,
                          FLAGS.max_paragraph_size)

    embed_path = (
        FLAGS.embed_path
        or pjoin("data", "squad",
                 "glove.trimmed.{}.npz".format(FLAGS.embedding_size)))
    FLAGS.embed_path = embed_path
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    qa = QASystem(
        Encoder(size=FLAGS.state_size,
                vocab_dim=FLAGS.embedding_size,
                FLAGS=FLAGS),
        Decoder(FLAGS=FLAGS),
        FLAGS)

    # Logging: make sure the directory exists, then attach a file handler.
    log_dir = FLAGS.log_dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(log_dir, "log.txt")))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        # NOTE: switch both directories back to get_normalized_train_dir(...)
        # for the final submission.
        load_train_dir = FLAGS.load_train_dir or FLAGS.train_dir
        print("load_train_dir: ", load_train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = FLAGS.train_dir
        print("save_train_dir: ", save_train_dir)
        qa.train(sess, dataset, save_train_dir, rev_vocab)
def main(_):
    """Train the QA system, handing it the paths of the data files.

    The dataset passed to ``qa.train`` is a list of six paths under
    ``FLAGS.data_dir``: question ids, context ids and spans for the train
    split, followed by the same three files for the val split.  Flags are
    printed and dumped to ``<FLAGS.log_dir>/flags.json``.

    Args:
        _: Unused positional argument.
    """
    # Order matters: train question/context/span, then val.
    data_files = ("train.ids.question", "train.ids.context", "train.span",
                  "val.ids.question", "val.ids.context", "val.span")
    dataset = [pjoin(FLAGS.data_dir, file_name) for file_name in data_files]

    embed_path = (
        FLAGS.embed_path
        or pjoin("data", "squad",
                 "glove.trimmed.{}.npz".format(FLAGS.embedding_size)))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    qa = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.paragraph_output_size))

    # Logging: make sure the directory exists, then attach a file handler.
    log_dir = FLAGS.log_dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(log_dir, "log.txt")))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, checkpoint_dir)
Exemple #28
0
def main(_):
    """Train on SQuAD using a ``Config`` object, then report both errors.

    Loads the GloVe matrix, the train/val samples and the vocabulary, builds
    the model, restores it from ``config.load_train_dir`` (falling back to
    ``config.train_dir``), trains on the padded training data and finally
    evaluates on the padded training and validation data.

    Args:
        _: Unused positional argument.
    """
    config = Config()
    dataset = None  # TODO: load the dataset here and adapt model.py to its layout

    embed_path = (
        config.embed_path
        or pjoin("data", "squad",
                 "glove.trimmed.{}.npz".format(config.embed_size)))
    glove_archive = np.load(embed_path)
    embeddings = glove_archive['glove']
    glove_archive.close()

    # The number of embedding rows is handed to load_squad.
    vocab_len = embeddings.shape[0]

    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir,
                       max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir,
                     max_samples=config.max_val_samples)

    print('train size: ', len(train), ' val size: ', len(val))

    vocab_path = config.vocab_path or pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    qa = QASystem(
        Encoder(state_size=config.hidden_size,
                embedding_size=config.embed_size),
        Decoder(state_size=config.hidden_size,
                embedding_size=config.embed_size))

    with tf.Session() as sess:
        # An empty load directory means "build a new model".
        load_train_dir = config.load_train_dir or config.train_dir
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = config.train_dir

        padded_train = qa.pad_sequences(train)
        # The values are unused; unpacking only checks that pad_sequences
        # produced exactly three components.
        ret_q, ret_p, ret_labels = padded_train
        qa.train(sess, padded_train, save_train_dir)
        padded_val = qa.pad_sequences(val)

        print('train error')
        qa.evaluate_answer(sess, padded_train, log=True)

        print('val error')
        qa.evaluate_answer(sess, padded_val, log=True)
Exemple #29
0
def main(FLAGS):
    """Initialize, train and evaluate the QA system, printing progress banners.

    Creates ``./data/weights/`` when missing, loads the vocabulary and the
    GloVe matrix, builds the model, sets up logging under ``FLAGS.log_dir``,
    trains on the 'train' data and evaluates on the 'val' data (both loaded
    with ``debugMode=True``).

    Args:
        FLAGS: Flag container; the attributes read here are ``embed_path``,
            ``embedding_size``, ``vocab_path``, ``data_dir``,
            ``output_size``, ``log_dir``, ``load_train_dir``, ``train_dir``
            and ``evaluate``.
    """
    print(80 * "=")
    print("INITIALIZING")
    print(80 * "=")

    if not os.path.exists('./data/weights/'):
        os.makedirs('./data/weights/')
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    print("Loading Embedding Matrix")
    # np.load() on an .npz keeps the archive open; read the array and close
    # the archive explicitly so the file handle is not leaked.
    embedding_file = np.load(embed_path)
    try:
        embeddings = embedding_file['glove']
    finally:
        embedding_file.close()

    encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size)

    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        print("Building Network ... ")
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        print("Load Training Data")
        dataset = initialize_datasets(FLAGS.data_dir,
                                      dataset='train',
                                      debugMode=True)
        print(80 * "=")
        print("Training")
        print(80 * "=")
        qa.train(sess, dataset, save_train_dir)
        print("Finished Training")
        print("Load Validation Data")
        # The training data is no longer needed, so the name is reused.
        dataset = initialize_datasets(FLAGS.data_dir,
                                      dataset='val',
                                      debugMode=True)
        print(80 * "=")
        print("Evaluation")
        print(80 * "=")
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Exemple #30
0
def main(_):
    """Train the coattention-encoder / dynamic-decoder QA system.

    A ``Config`` is built from the embedding path and the train/val data
    paths (both ``data/squad``) and passed to the encoder, decoder and
    ``QASystem``.  Flags are printed and dumped to
    ``<FLAGS.log_dir>/flags.json``.  Global variables are initialized in the
    session before the model is restored and trained.

    Args:
        _: Unused positional argument.
    """
    embed_path = (
        FLAGS.embed_path
        or pjoin("data", "squad",
                 "glove.trimmed.{}.npz".format(FLAGS.embedding_size)))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Train and validation data live in the same directory.
    train_path = val_path = 'data/squad'
    config = Config(embed_path, train_path, val_path)

    qa = QASystem(EncoderCoattention(config), DecoderDynamic(config), config)

    # Logging: make sure the directory exists, then attach a file handler.
    log_dir = FLAGS.log_dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(log_dir, "log.txt")))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # The initializer op is created after the model graph has been built.
    init_op = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init_op)
        restore_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, checkpoint_dir)
def run_func(model_name):
    """Load the SQuAD train/dev data and train the model named *model_name*.

    Args:
        model_name: Name handed to ``QASystem``.
    """
    train_set = SquadData.load(config.SQUAD_TRAIN_PREFIX,
                               size=config.TRAIN_SIZE)
    dev_set = SquadData.load(config.SQUAD_DEV_PREFIX,
                             size=config.EVAL_SIZE)

    system = QASystem(model_name)

    with tf.Session() as session:
        # Restores a pretrained model when one exists, otherwise creates a
        # new one.
        system.initialize_model(session)
        system.train(session, [train_set, dev_set])
def main(_):
    """Evaluate an ensemble of trained models.

    The checkpoints to combine are listed in ``cfg.model_pathes`` (set them
    in the config module).  Every model is restored in a fresh graph and
    session and evaluated with ``ensemble=True``; its start/end predictions
    for the train and val samples fill one column of the result matrices.
    After the last model, the matrices are reduced with ``bin_count`` and
    the combined predictions are evaluated through ``sendin``.

    Args:
        _: Unused positional argument.
    """
    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab, rev_vocab = initialize_vocab(pjoin(data_dir, cfg.vocab_file))

    # Create the log, cache and figure directories when they are missing.
    for dir_attr in ('log_dir', 'cache_dir', 'fig_dir'):
        if not os.path.exists(getattr(cfg, dir_attr)):
            os.makedirs(getattr(cfg, dir_attr))

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    log_file = pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt')
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)
    # One row per evaluated sample, one column per model.
    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    for model_idx in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=session_config) as sess:
            qa = QASystem(Encoder(size=2 * cfg.lstm_num_hidden),
                          Decoder(output_size=2 * cfg.lstm_num_hidden))
            sess.run(tf.global_variables_initializer())
            load_train_dir = get_normalized_train_dir(model_pathes[model_idx])
            initialize_model(sess, qa, load_train_dir)

            ts, te, vs, ve = qa.evaluate_answer(
                sess, dataset, raw_answers, rev_vocab,
                log=True,
                ensemble=True,
                training=True,
                sample=cfg.num_eval)
            train_s[:, model_idx] = ts
            train_e[:, model_idx] = te
            val_s[:, model_idx] = vs
            val_e[:, model_idx] = ve

            if model_idx != num_m - 1:
                continue

            # Last model: combine all columns and evaluate the result while
            # this model's session is still open.
            train_s = bin_count(train_s)
            train_e = bin_count(train_e)
            val_s = bin_count(val_s)
            val_e = bin_count(val_e)
            qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                               log=True,
                               training=True,
                               sendin=(train_s, train_e, val_s, val_e),
                               sample=cfg.num_eval)
def main(_):
    """Parse arguments, build the QA system, train it and evaluate it.

    Calls ``parse_arg()`` and ``update_config(args, c_time)``, loads the
    masked dataset, answers and raw answers from ``cfg.DATA_DIR``, creates
    the log / cache / figure directories, attaches a timestamped log file,
    and then trains and evaluates the model inside a TensorFlow session.
    When ``args.test`` is set, the debugger is entered at start-up and
    training is called with ``debug_num=100``.

    Args:
        _: Unused positional argument.
    """
    # Timestamp shared by update_config() and the log file name.
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    args = parse_arg()
    update_config(args, c_time)
    # pprint.pprint(cfg)
    logging.info(cfg)
    if args.test:
        pdb.set_trace()

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Only the path is built here; it is handed to QASystem below.
    embed_path = pjoin(data_dir, "glove.trimmed." + str(cfg.embed_size) + ".npz")
    # logging.info('embed size: {} for path {}'.format(cfg.embed_size, embed_path))
    # embedding = np.load(embed_path)['glove']

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)
    file_handler = logging.FileHandler(pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(cfg.train_dir)
        # Log the names of the trainable variables and of the entries in the
        # regularization-loss collection.
        logging.info('=========== trainable varaibles ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized varaibles ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)

        # Load and save directories are both derived from cfg.train_dir.
        save_train_dir = get_normalized_train_dir(cfg.train_dir)
        if args.test:
            # Test mode: same call as below plus debug_num=100.
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     debug_num=100,
                     rev_vocab=rev_vocab)
        else:
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     rev_vocab=rev_vocab)
        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)