def run_func():
    """Answer the questions in ``data/squad/fuse.json`` and dump them as JSON.

    Reads its settings from ``Config()``, prepares the dev data with
    ``prepare_dev``, builds a ``QASystem``, calls ``generate_answers`` and
    writes the returned answers to ``temp/fuse-answer.json``.
    """
    config = Config()

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    dev_path = "data/squad/fuse.json"
    context_data, question_data, question_uuid_data = prepare_dev(
        os.path.dirname(os.path.abspath(dev_path)),
        os.path.basename(dev_path),
        vocab)

    # One placeholder [0, 0] answer per question keeps the
    # [questions, contexts, answers] layout of the dataset list.
    placeholder_spans = [[0, 0] for _ in xrange(len(question_data))]
    dataset = [question_data, context_data, placeholder_spans]

    embeddings = get_trimmed_glove_vectors(config.embed_path)
    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        predictions, _ = generate_answers(sess, qa, dataset,
                                          question_uuid_data, rev_vocab)
        # write to json file to root dir
        with io.open('temp/fuse-answer.json', 'w', encoding='utf-8') as out:
            serialized = json.dumps(predictions, ensure_ascii=False)
            out.write(unicode(serialized))
def run_func2(dataset, config):
    """Predict answers for ``dataset`` and write one per line to a text file.

    Args:
        dataset: iterable of (question, context, answer) triples.
        config: object providing ``vocab_path``, ``embed_path``,
            ``hidden_state_size`` and ``train_dir``.

    The answers returned by ``generate_answers`` are written to
    ``dev-prediction.txt``, indexed by the position of each triple.
    """
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    # Transpose the triples into three parallel sequences.
    questions, contexts, spans = zip(
        *[[ques, ctx, span] for (ques, ctx, span) in dataset])
    model_input = [questions, contexts, spans]

    embeddings = get_trimmed_glove_vectors(config.embed_path)
    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    # The running index of each example doubles as its question id.
    question_uuid_data = list(xrange(len(spans)))

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, _canonical = generate_answers(sess, qa, model_input,
                                               question_uuid_data, rev_vocab)
        # write to json file to root dir
        with io.open('dev-prediction.txt', 'w', encoding='utf-8') as out:
            for idx in xrange(len(spans)):
                decoded = unicode(answers[idx], "utf-8")
                out.write("%s\n" % decoded)
Esempio n. 3
0
def run_func():
    """Answer the questions in ``download/squad/test.json`` and write a CSV.

    Reads its settings from ``Config()``, prepares the data with
    ``prepare_dev``, builds a ``QASystem`` and calls ``generate_answers``.
    The result is written to ``submission.csv`` with an ``Id,Answer`` header
    and one ``<id>,<normalized answer>`` row per returned answer.
    """
    config = Config()
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    dev_path = "download/squad/test.json"
    dev_dirname = os.path.dirname(os.path.abspath(dev_path))
    dev_filename = os.path.basename(dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    # One placeholder [0, 0] answer per question keeps the
    # [questions, contexts, answers] layout of the dataset list.
    answers = [[0, 0] for _ in xrange(len(question_data))]
    dataset = [question_data, context_data, answers]

    embeddings = get_trimmed_glove_vectors(config.embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)

    qa = QASystem(encoder, decoder, embeddings, config)

    # Collect the rows in a list and join once at the end instead of growing
    # a string with "+=" on every answer.
    rows = ["Id,Answer\n"]

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, _ = generate_answers(sess, qa, dataset, question_uuid_data,
                                      rev_vocab)
        for uuid, answer in answers.items():
            # The replace() re-attaches a stand-alone " s " token to the
            # preceding word in the normalized answer.
            cleaned = normalize_answer(answer).replace(" s ", "s ")
            rows.append(uuid + "," + cleaned + "\n")

    with open('submission.csv', 'wb') as out:
        out.write("".join(rows))
Esempio n. 4
0
def main(_):
    """Train a ``QASystem`` on the 'train' split and evaluate it on 'val'.

    Paths and hyper-parameters are read from the global ``FLAGS``. The flag
    values are also dumped to ``<log_dir>/flags.json`` and the root logger
    gets a file handler writing to ``<log_dir>/log.txt``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    train_set = initialize_datasets(FLAGS.data_dir, 'train')
    val_set = initialize_datasets(FLAGS.data_dir, 'val')

    # Explicit flag values win; otherwise fall back to the default locations.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, embed_path, rev_vocab, FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        # Weights may be loaded from a different directory than the one
        # training writes to.
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_set, save_train_dir)

        qa.evaluate_answer(sess, val_set, sample=1000, log=True)
Esempio n. 5
0
def main(_):
    """Train a ``QASystem`` from the id files under ``./data/squad``.

    The training data is not loaded here: the three training file paths,
    together with batch size, max length and output size, are bundled into
    ``info`` and handed to ``qa.train`` along with ``batch_generator``.
    Validation data is loaded up front with ``load_and_pad_val_data``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    # load all in once, maybe better to try batch by batch
    train_question_path = "./data/squad/train.ids.question"
    train_context_path = "./data/squad/train.ids.context"
    train_answer_path = "./data/squad/train.span"

    val_question_path = "./data/squad/val.ids.question"
    val_context_path = "./data/squad/val.ids.context"
    val_answer_path = "./data/squad/val.span"

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")

    # embeddings is a matrix of shape [vocab_size, embedding_size]
    glove = np.load(embed_path)['glove']
    embeddings = glove.astype(np.float32)
    val_data = load_and_pad_val_data(val_question_path, val_context_path,
                                     val_answer_path)

    # vocab is the mapping from word -> token id
    # rev_vocab is the reverse mapping, from id -> word
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # someone posted that the max length of question is 766
    # The batch_generator function itself (not an instance) is passed to
    # qa.train below, together with this argument tuple.
    info = (train_question_path, train_context_path, train_answer_path,
            FLAGS.batch_size, FLAGS.max_length, FLAGS.output_size)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, batch_generator, info, save_train_dir, val_data,
                 rev_vocab)
Esempio n. 6
0
def main(FLAGS):
    """Build, train and evaluate a ``QASystem``, printing progress banners.

    Args:
        FLAGS: flag/config object providing paths (``data_dir``, ``log_dir``,
            ``train_dir``, ...) and model sizes.

    Training uses the 'train' split and evaluation the 'val' split, both
    loaded through ``initialize_datasets`` with ``debugMode=True``.
    """

    def banner(title):
        # Print a title framed by two 80-character rules.
        rule = 80 * "="
        print(rule)
        print(title)
        print(rule)

    banner("INITIALIZING")
    # Do what you need to load datasets from FLAGS.data_dir

    weights_dir = './data/weights/'
    if not os.path.exists(weights_dir):
        os.makedirs(weights_dir)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    print("Loading Embedding Matrix")
    embeddings = np.load(embed_path)['glove']

    encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        print("Building Network ... ")
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        print("Load Training Data")
        train_set = initialize_datasets(FLAGS.data_dir,
                                        dataset='train',
                                        debugMode=True)
        banner("Training")
        qa.train(sess, train_set, save_train_dir)
        print("Finished Training")

        print("Load Validation Data")
        val_set = initialize_datasets(FLAGS.data_dir,
                                      dataset='val',
                                      debugMode=True)
        banner("Evaluation")
        qa.evaluate_answer(sess, val_set, vocab, FLAGS.evaluate, log=True)
Esempio n. 7
0
def main(_):
    """Generate answers for the dev file and write ``dev-prediction.json``.

    The first command-line argument selects the configuration passed to
    ``load_config``. The training directory is exposed through the fixed
    symlink ``/tmp/cs224n-squad-train`` before the model is initialized, the
    flag values are dumped to ``<log_dir>/flags.json``, and the answers
    returned by ``generate_answers`` are written as JSON.

    Raises:
        IndexError / ValueError: if ``sys.argv[1]`` is missing or not an int.
    """

    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.vocab_dim))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    # lexists() is also true for a dangling symlink (e.g. left over from a
    # run whose train_dir has since been deleted); exists() follows the link,
    # reports False, and os.symlink() below would then fail because the link
    # name is already taken.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = (context_data, question_data, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, train_dir)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def main(_):
    """Load the train/val data, build a ``QASystem`` and train it.

    A TensorFlow session is created before the model so that the local
    devices can be inspected and ``FLAGS.ifgpu`` set before ``QASystem`` is
    constructed. The same session is then used for initialization and
    training and is always closed on the way out.
    """

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {}
    num_train = load_data_dq(dataset, 'train', FLAGS.data_dir)
    num_val = load_data_dq(dataset, 'val', FLAGS.data_dir)
    load_data_sa(dataset, 'train', FLAGS.data_dir, num_train)
    load_data_sa(dataset, 'val', FLAGS.data_dir, num_val)
    trim(dataset['train'])
    trim(dataset['val'])

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Session moved upfront to set the ifgpu flag before QASystem.
    # It is created explicitly rather than with "with tf.Session() as sess:
    # pass": leaving a with-block closes the session, and a closed session
    # cannot be used by initialize_model()/qa.train() below.
    sess = tf.Session()
    try:
        local_device_protos = device_lib.list_local_devices()  # 38559755
        if any(dev.device_type == 'GPU' for dev in local_device_protos):
            FLAGS.ifgpu = True

        qa = QASystem(encoder, decoder, embed_path, rev_vocab)

        if not os.path.exists(FLAGS.log_dir):
            os.makedirs(FLAGS.log_dir)
        file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
    finally:
        # Release the session even when setup or training raises.
        sess.close()
Esempio n. 9
0
def main(_):
    """Run ``qa.train`` on the small 'test' split as a sanity check.

    Questions are padded to 40 tokens and contexts to 600 with
    ``pad_sentences``; answer spans come from ``get_answer_span``. The
    resulting list is passed to ``qa.train`` with ``small_data_test=True``.
    """
    question_max_len = 40
    context_max_len = 600

    # Preprocess and collect small test data
    question_file = pjoin(FLAGS.data_dir, "test.ids.question")
    q_data, q_seq_len = pad_sentences(question_file, question_max_len)
    assert not any(
        q_seq_len > q_data.shape[1]
    ), 'Some questions have length greater than max question length'

    context_file = pjoin(FLAGS.data_dir, "test.ids.context")
    c_data, c_seq_len = pad_sentences(context_file, context_max_len)
    assert not any(
        c_seq_len > c_data.shape[1]
    ), 'Some contexts have length greater than max context length'

    span_file = pjoin(FLAGS.data_dir, "test.span")
    span_ids = get_answer_span(span_file, context_max_len)

    test_split = [q_data, q_seq_len, c_data, c_seq_len, span_ids]

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    # try without dropout
    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer, FLAGS.max_gradient_norm)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        qa.train(sess, test_split, save_train_dir, small_data_test=True)
Esempio n. 10
0
def main(_):
    """Generate answers for ``FLAGS.dev_path`` and write them to JSON.

    The dev data is prepared with ``prepare_dev`` and passed to
    ``generate_answers`` as a dict keyed ``val_context``, ``val_questions``
    and ``val_question_uuids``. The result goes to ``dev-prediction.json``.
    """
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    # Store the resolved embedding path back on FLAGS; the model objects
    # below receive FLAGS itself.
    if not FLAGS.embed_path:
        FLAGS.embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    else:
        FLAGS.embed_path = FLAGS.embed_path

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = dict(val_context=context_data,
                   val_questions=question_data,
                   val_question_uuids=question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)
    qa = QASystem(encoder, decoder, FLAGS)

    with tf.Session() as sess:
        # The raw train_dir is used here, not the normalized one.
        train_dir = FLAGS.train_dir
        print("train_dir: ", train_dir)
        initialize_model(sess, qa, train_dir)

        print("Generating Answers")
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        print("Writing to json file")
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as out:
            serialized = json.dumps(answers, ensure_ascii=False)
            out.write(unicode(serialized))
Esempio n. 11
0
def _read_id_rows(path):
    """Return the file at ``path`` as a list of rows of ints.

    Each line is split on whitespace and every token converted with
    ``int``. Rows are real lists (not ``map`` objects), so they can be
    indexed and iterated more than once on Python 2 and Python 3 alike.
    """
    with open(path) as f:
        return [[int(token) for token in line.split()]
                for line in f.read().splitlines()]


def main(_):
    """Load the train/val id files, then train and evaluate a ``QASystem``.

    For each of 'train' and 'val' the files ``<split>.ids.context``,
    ``<split>.ids.question`` and ``<split>.span`` under ``FLAGS.data_dir``
    are read into a ``(contexts, questions, spans)`` tuple stored in
    ``dataset[<split>]``.
    """

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = dict()
    for dataset_type in ['train', 'val']:
        dataset[dataset_type] = tuple(
            _read_id_rows(
                os.path.join(FLAGS.data_dir,
                             "%s.%s" % (dataset_type, suffix)))
            for suffix in ("ids.context", "ids.question", "span"))

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # NOTE(review): `config` is not defined inside this function; presumably
    # it is a module-level object - confirm against the rest of the file.
    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      config=config)
    decoder = Decoder(output_size=FLAGS.output_size, config=config)

    qa = QASystem(encoder, decoder, config=config)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
Esempio n. 12
0
def main(_):
    """Interactive demo: read a context and a question, print the answer.

    Builds the model, runs the global variable initializer, calls
    ``initialize_model`` with the directory derived from ``FLAGS.ckpt`` and
    then loops forever, prompting on stdin for a context and a question and
    printing the context tokens between the predicted start and end index.
    """
    data_dir = cfg.DATA_DIR
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab)

    # gpu setting
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    hidden_size = 2 * cfg.lstm_num_hidden
    encoder = Encoder(size=hidden_size)
    decoder = Decoder(output_size=hidden_size)
    qa = QASystem(encoder, decoder, FLAGS.embed)

    star_rule = '*********************************************************************'
    dash_rule = '-------------------'
    answer_rule = '=========================================='

    def ask(title):
        # Print a framed prompt and return the line typed by the user.
        print(dash_rule)
        print(title)
        print(dash_rule)
        return raw_input()

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        load_train_dir = get_normalized_train_dir(FLAGS.ckpt)
        initialize_model(sess, qa, load_train_dir)

        print(star_rule)
        print(
            "Welcome! You can use this to explore the behavior of the model.")
        print(star_rule)

        while True:
            sentence = ask('Input the context: ')
            query = ask('Input the question: ')

            # Keep the raw tokens so the predicted indices can be mapped
            # back to words.
            raw_context = nltk.word_tokenize(sentence)
            context_ids = sentence_to_token_ids(sentence,
                                                vocab,
                                                tokenizer=nltk.word_tokenize)
            question_ids = sentence_to_token_ids(query,
                                                 vocab,
                                                 tokenizer=nltk.word_tokenize)
            context_in = mask_input(context_ids, cfg.context_max_len)
            question_in = mask_input(question_ids, cfg.question_max_len)

            start, end = qa.answer(sess, [context_in], [question_in])
            answer_tokens = raw_context[start[0]:end[0] + 1]
            answer = ' '.join(answer_tokens)

            print(answer_rule)
            print('ANSWER: {}'.format(answer))
            print(answer_rule)
Esempio n. 13
0
def main(_):
    """Load the tokenized train/dev files and train a ``QASystem``.

    Both splits are read with ``load_dataset`` using ``FLAGS.batch_size``.
    The resolved embedding path is stored back on ``FLAGS`` before the model
    is built from it.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    # use .readlines() to load file ourselves
    # use python generator
    train_q_path = pjoin(FLAGS.data_dir, "data_train/train_tokenH.txt")
    train_p_path = pjoin(FLAGS.data_dir, "data_train/train_tokenP.txt")
    train_a_path = pjoin(FLAGS.data_dir, "data_train/train_index.txt")

    dev_q_path = pjoin(FLAGS.data_dir, "data_dev/dev_tokenH.txt")
    dev_p_path = pjoin(FLAGS.data_dir, "data_dev/dev_tokenP.txt")
    dev_a_path = pjoin(FLAGS.data_dir, "data_dev/dev_index.txt")

    train_set = load_dataset(train_q_path, train_p_path, train_a_path,
                             FLAGS.batch_size)
    dev_set = load_dataset(dev_q_path, dev_p_path, dev_a_path,
                           FLAGS.batch_size)

    # loads embedding
    if not FLAGS.embed_path:
        FLAGS.embed_path = pjoin("data", "sgns.merge.word.npz")
    else:
        FLAGS.embed_path = FLAGS.embed_path
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.txt")
    # one is list and one is dict
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(size=FLAGS.state_size, output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS)

    # log file
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # start training
    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_set, dev_set, save_train_dir, rev_vocab)
Esempio n. 14
0
File: train.py Progetto: truncs/QA
def main(_):
    """Train a ``QASystem`` from the training id files, then evaluate it.

    The dataset handed to the model is a tuple of three file paths
    (context ids, question ids, spans) rather than loaded data. The
    vocabulary size is added to the hyper-parameters before the model is
    built, and the GloVe matrix is passed to ``qa.train`` as float32.
    """
    # TODO maybe pass as loaded dataset abstraction instead of
    # file_paths?
    hparams = create_hparams(FLAGS)

    dataset = (
        FLAGS.data_dir + '/train.ids.context',
        FLAGS.data_dir + '/train.ids.question',
        FLAGS.data_dir + '/train.span',
    )

    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    hparams.add_hparam('vocab_size', len(vocab))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)
    qa = QASystem(encoder, decoder, hparams)

    # Setup embeddings
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    glove = np.load(embed_path)['glove']
    np_embeddings = np.float32(glove)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir, np_embeddings)

        # NOTE(review): evaluation receives the same training file paths
        # that were used for training - confirm this is intended.
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Esempio n. 15
0
def run_func():
    """Interactive loop: read a question and a context, print the answer.

    Builds the model from ``Config()``, calls ``qa.initialize_model`` with
    ``config.train_dir`` and then repeatedly prompts for a question and a
    context. Typing ``exit`` at either prompt ends the loop. Both inputs are
    split on whitespace and mapped to vocabulary ids; words missing from the
    vocabulary are mapped to id 2.
    """
    config = Config()
    train = squad_dataset(config.question_train, config.context_train,
                          config.answer_train)
    dev = squad_dataset(config.question_dev, config.context_dev,
                        config.answer_dev)
    embed_path = config.embed_path
    vocab_path = config.vocab_path
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_size)
    decoder = Decoder(config.hidden_size)

    qa = QASystem(encoder, decoder, embeddings, config)

    # Id used for out-of-vocabulary words.
    # NOTE(review): presumably the UNK token id - confirm against the vocab.
    unknown_id = 2

    with tf.Session() as sess:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        qa.initialize_model(sess, config.train_dir)
        # train process
        # qa.train(sess, [train, dev], config.train_dir)
        # em = qa.evaluate_model(sess, dev)

        # run process
        while True:
            question = input('please input question: ')
            if question == 'exit':
                break
            raw_context = input('please input context: ')
            if raw_context == 'exit':
                break

            # dict.get does a single lookup per word instead of a membership
            # test on vocab.keys() followed by a second indexing.
            question = [vocab.get(x, unknown_id) for x in question.split()]
            context_words = raw_context.split()
            context = [vocab.get(x, unknown_id) for x in context_words]

            # The [1, 2] span is a placeholder answer required by the
            # [questions, contexts, answers] input layout.
            test = [[question], [context], [[1, 2]]]
            a_s, a_e = qa.answer(sess, test)
            if a_e == a_s:
                print("answer: ", context_words[a_s[0]])
            else:
                print("answer: ",
                      ' '.join(context_words[a_s[0]:a_e[0] + 1]))
Esempio n. 16
0
def main(_):
    """Train on the 'train.' and 'val.' data combined, then evaluate on val.

    Model weights are saved under a fresh timestamped directory
    ``results/<YYYYmmdd_HHMMSS>/model.weights/``, not under
    ``FLAGS.train_dir``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    train_set = initialize_datasets(FLAGS.data_dir,
                                    'train.',
                                    debugMode=False)
    val_set = initialize_datasets(FLAGS.data_dir, 'val.', debugMode=False)
    # The validation examples are appended to the training set.
    train_set.extend(val_set)

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # This is taking a long time
    setup_started = datetime.now()
    qa = QASystem(encoder, decoder, embed_path, FLAGS, rev_vocab)
    print('Time to setup the model: ', datetime.now() - setup_started)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        # Get directory to save model
        results_path = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
        save_train_dir = results_path + "model.weights/"
        if not os.path.exists(save_train_dir):
            os.makedirs(save_train_dir)

        qa.train(sess, train_set, save_train_dir)

        qa.evaluate_answer(sess, val_set, rev_vocab, sample=1000, log=True)
Esempio n. 17
0
def main(_):
    """Build a ``QASystem`` from keyword settings taken from FLAGS and train it.

    The dataset comes from ``load_dataset()``. The dropout flag is converted
    to a keep-probability (``1.0 - FLAGS.dropout``) before being passed on.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_dataset()

    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      embedding_size=FLAGS.embedding_size,
                      output_size=FLAGS.output_size)
    decoder = Decoder(state_size=FLAGS.state_size,
                      output_size=FLAGS.output_size)

    qa_args = dict(
        embed_path=embed_path,
        embedding_size=FLAGS.embedding_size,
        output_size=FLAGS.output_size,
        optimizer=FLAGS.optimizer,
        learning_rate=FLAGS.learning_rate,
        epochs=FLAGS.epochs,
        batch_size=FLAGS.batch_size,
        max_gradient_norm=FLAGS.max_gradient_norm,
        dropout_keep_prob=1.0 - FLAGS.dropout,
        train_dir=FLAGS.train_dir,
        state_size=FLAGS.state_size,
    )
    qa = QASystem(encoder, decoder, **qa_args)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(
            FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
Esempio n. 18
0
def main(_):
    """Train and evaluate the encoder/matcher/decoder QA system.

    Loads the train and validation splits from ``FLAGS.data_dir``, builds the
    model, logs the flags, writes the session graph to the ``qa-graph``
    summary directory, trains, and finally evaluates on 500 samples.

    Args:
        _: ignored positional argument.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {
        "train": load_data(FLAGS.data_dir, mode="train"),
        "val": load_data(FLAGS.data_dir, mode="val"),
    }

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    # TODO: expose the hard-coded perspective dimensions below as flags.
    matcher = Matcher(perspective_dim=25,
                      input_size=FLAGS.state_size)
    decoder = Decoder(output_size=FLAGS.output_size,
                      state_size=FLAGS.state_size,
                      n_perspective_dim=50 * 2)

    qa = QASystem(encoder, matcher, decoder,
                  vocab=vocab, vocab_dim=FLAGS.embedding_size,
                  rev_vocab=rev_vocab, embed_path=embed_path)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        # NOTE(review): this runs the global initializer after
        # initialize_model(); if that helper restores a checkpoint, the
        # restored values would be overwritten here -- confirm this is intended.
        tf.global_variables_initializer().run()

        # Close the writer as soon as the graph is added so the event is
        # flushed to disk before the (long) training run and the file handle
        # is not leaked.
        graph_writer = tf.summary.FileWriter("qa-graph")
        try:
            graph_writer.add_graph(sess.graph)
        finally:
            graph_writer.close()

        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, 500, log=True)
Esempio n. 19
0
def main(_):
    """Train and evaluate a QA system that uses pretrained GloVe embeddings.

    Loads the dataset from ``FLAGS.data_dir``, reads the ``glove`` array from
    the embedding archive, builds the encoder/decoder/QASystem, logs the flags,
    then initializes, trains and evaluates the model in a TensorFlow session.

    Args:
        _: ignored positional argument.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_dataset(FLAGS.data_dir)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Read the array inside a context manager so the .npz archive is closed
    # instead of staying open for the whole training run.
    with np.load(embed_path) as glove_archive:
        pretrained_embeddings = glove_archive['glove']

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      pretrained_embeddings=pretrained_embeddings,
                      max_question_length=FLAGS.max_question_length,
                      max_context_length=FLAGS.max_context_length)
    decoder = Decoder(output_size=FLAGS.output_size,
                      size=FLAGS.state_size,
                      max_context_length=FLAGS.max_context_length)

    qa = QASystem(encoder, decoder)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Esempio n. 20
0
def main(_):
    """Train a GRU-encoder / LSTM-decoder QA system on pre-tokenized data.

    Loads the train and validation splits from fixed relative paths, reads the
    trimmed GloVe matrix, builds the model on a fresh default graph, logs the
    flags and trains for ``FLAGS.epochs`` epochs.

    Args:
        _: ignored positional argument.
    """
    train_files = [
        "embedding/train.ids.question", "embedding/train.ids.context",
        "data/train.span"
    ]
    val_files = [
        "embedding/val.ids.question", "embedding/val.ids.context",
        "data/val.span"
    ]
    dataset = [load_dataset(train_files), load_dataset(val_files)]

    embed_path = pjoin("embedding",
                       "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    tf.reset_default_graph()
    encoder = Encoder(FLAGS.num_hidden_unit, tf.contrib.rnn.GRUCell)
    decoder = Decoder(FLAGS.num_hidden_unit, tf.contrib.rnn.BasicLSTMCell)
    # Read the array inside a context manager so the .npz archive is closed
    # rather than left open for the rest of the run.
    with np.load(embed_path) as glove_archive:
        embedding = glove_archive["glove"]

    qa = QASystem(encoder, decoder, embedding, FLAGS.keep_prob)

    # Make sure both the checkpoint and the log directories exist.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.epochs, FLAGS.batch_size,
                 FLAGS.train_dir)
Esempio n. 21
0
def main(_):
    """Train and evaluate a QA system configured through FLAGS.

    Reads the GloVe matrix as float32, loads and preprocesses the data, builds
    the encoder/decoder/QASystem, logs the flags, then initializes, trains and
    evaluates the model in a TensorFlow session.

    Args:
        _: ignored positional argument.
    """
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Keep the archive open only while the matrix is read; previously the
    # whole training run was nested inside this block and held the file open.
    with np.load(embed_path) as data:
        glove_embeddings = np.asfarray(data["glove"], dtype=np.float32)

    dataset = load_and_preprocess_data()

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size,
                      config=FLAGS)
    decoder = Decoder(output_size=FLAGS.output_size, config=FLAGS)

    qa = QASystem(encoder, decoder, embeddings=glove_embeddings, config=FLAGS,
                  vocab=(vocab, rev_vocab))

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, FLAGS.evaluate, log=True)
Esempio n. 22
0
def main(_):
    """Train and evaluate the QA system using a canonical checkpoint path.

    Builds the encoder/decoder/QASystem, points the fixed path
    ``/tmp/cs224n-squad-train`` at ``FLAGS.train_dir`` through a symlink, logs
    the flags, then initializes, trains and evaluates the model.

    Args:
        _: ignored positional argument.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    #
    # lexists() is used instead of exists(): exists() follows the link and
    # reports False for a dangling symlink, which would leave the stale link in
    # place and make os.symlink() below fail because the name already exists.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)

        qa.train(sess, dataset)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Esempio n. 23
0
def run_func():
    """Train the QA system on the SQuAD train split, with dev as second split.

    Everything is driven by a fresh ``Config``: dataset file locations,
    vocabulary and embedding paths, hidden size and the checkpoint directory.
    """
    config = Config()

    train_split = squad_dataset(config.question_train,
                                config.context_train,
                                config.answer_train)
    dev_split = squad_dataset(config.question_dev,
                              config.context_dev,
                              config.answer_dev)

    # The vocabulary is loaded but not used further in this function.
    vocab, rev_vocab = initialize_vocab(config.vocab_path)
    glove_vectors = get_trimmed_glove_vectors(config.embed_path)

    hidden_size = config.hidden_state_size
    qa = QASystem(Encoder(hidden_size), Decoder(hidden_size), glove_vectors,
                  config)

    with tf.Session() as session:
        # initialize_model receives the checkpoint directory; per the original
        # note it loads a pretrained model if one exists, else creates one.
        qa.initialize_model(session, config.train_dir)
        qa.train(session, [train_split, dev_split], config.train_dir)
def main(_):
    """Train a FLAGS-configured QA system using the raw (non-normalized) dirs.

    Loads the dataset, resolves the embedding and vocabulary paths, builds the
    model, logs the flags and trains. The checkpoint directories are taken
    straight from FLAGS rather than through get_normalized_train_dir.

    Args:
        _: ignored positional argument.
    """
    data = get_dataset(FLAGS.data_dir, FLAGS.max_question_size,
                       FLAGS.max_paragraph_size)

    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
        glove_path = pjoin("data", "squad", glove_name)
    # Store the resolved path back on FLAGS; presumably the components that
    # receive FLAGS below read it from there -- verify against their code.
    FLAGS.embed_path = glove_path

    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)
    qa = QASystem(encoder, decoder, FLAGS)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as session:
        # TODO: switch both directories back to get_normalized_train_dir(...)
        # for the final submission.
        restore_dir = FLAGS.load_train_dir or FLAGS.train_dir
        print("load_train_dir: ", restore_dir)
        initialize_model(session, qa, restore_dir)

        checkpoint_dir = FLAGS.train_dir
        print("save_train_dir: ", checkpoint_dir)
        qa.train(session, data, checkpoint_dir, rev_vocab)
Esempio n. 25
0
def main(_):
    """Train and evaluate the QA system for one numbered configuration.

    The configuration number is read from the first command-line argument,
    stored on ``FLAGS.config`` and applied through load_config. Log and flag
    file names carry the configuration number so runs do not overwrite each
    other.

    Args:
        _: ignored positional argument.
    """
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    # Original note on the loaded structure: ((question, context), answer)
    raw = load_data(FLAGS.data_dir)
    train_data = preprocess_dataset(raw['train'], FLAGS.output_size,
                                    FLAGS.question_size)
    val_data = preprocess_dataset(raw['val'], FLAGS.output_size,
                                  FLAGS.question_size)

    # Resolved for parity with the other entry points; not used below.
    embed_path = FLAGS.embed_path
    if not embed_path:
        glove_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
        embed_path = pjoin("data", "squad", glove_name)

    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_name = "log" + '_config_' + str(FLAGS.config) + ".txt"
    logging.getLogger().addHandler(
        logging.FileHandler(pjoin(FLAGS.log_dir, log_name)))

    print(vars(FLAGS))
    flags_name = "flags" + '_config_' + str(FLAGS.config) + ".json"
    with open(os.path.join(FLAGS.log_dir, flags_name), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as session:
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                               or FLAGS.train_dir)
        initialize_model(session, qa, restore_dir)
        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(session, train_data, val_data, checkpoint_dir)
        qa.evaluate_answer(session, train_data, val_data, FLAGS.evaluate,
                           log=True)
def main(_):
    """Train the QA system, handing it the six dataset file paths.

    The dataset passed to ``qa.train`` is a list of paths under
    ``FLAGS.data_dir`` in the order: train question, train context, train
    span, val question, val context, val span.

    Args:
        _: ignored positional argument.
    """
    file_names = (
        "train.ids.question",
        "train.ids.context",
        "train.span",
        "val.ids.question",
        "val.ids.context",
        "val.span",
    )
    dataset = [pjoin(FLAGS.data_dir, name) for name in file_names]

    # Resolved for parity with the other entry points; not used below.
    embed_path = FLAGS.embed_path
    if not embed_path:
        glove_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
        embed_path = pjoin("data", "squad", glove_name)

    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    qa = QASystem(
        Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size),
        Decoder(output_size=FLAGS.paragraph_output_size))

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as session:
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                               or FLAGS.train_dir)
        initialize_model(session, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(session, dataset, checkpoint_dir)
Esempio n. 27
0
def main(_):
    """Train on padded SQuAD samples and report train and validation error.

    Reads the GloVe matrix to obtain the vocabulary size, loads the train and
    val splits with load_squad, builds the model, pads both splits with
    ``qa.pad_sequences``, trains on the padded train split and evaluates on
    both.

    Args:
        _: ignored positional argument.
    """
    config = Config()
    # TODO: placeholder kept from the original; the data actually used is
    # loaded below with load_squad.
    dataset = None

    glove_path = config.embed_path
    if not glove_path:
        glove_name = "glove.trimmed.{}.npz".format(config.embed_size)
        glove_path = pjoin("data", "squad", glove_name)
    archive = np.load(glove_path)
    embeddings = archive['glove']
    archive.close()

    # The first dimension of the embedding matrix is passed on as vocab size.
    vocab_len = embeddings.shape[0]

    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir,
                       max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir,
                     max_samples=config.max_val_samples)

    print('train size: ', len(train), ' val size: ', len(val))

    vocab_file = config.vocab_path
    if not vocab_file:
        vocab_file = pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    encoder = Encoder(state_size=config.hidden_size,
                      embedding_size=config.embed_size)
    decoder = Decoder(state_size=config.hidden_size,
                      embedding_size=config.embed_size)
    qa = QASystem(encoder, decoder)

    with tf.Session() as session:
        # Per the original note: use "" here to build a new model.
        restore_dir = (config.load_train_dir or config.train_dir)
        initialize_model(session, qa, restore_dir)
        checkpoint_dir = config.train_dir

        padded_train = qa.pad_sequences(train)
        # The three parts are not used individually; the unpacking only
        # requires that pad_sequences produced exactly three items.
        ret_q, ret_p, ret_labels = padded_train
        qa.train(session, padded_train, checkpoint_dir)
        padded_val = qa.pad_sequences(val)

        print('train error')
        qa.evaluate_answer(session, padded_train, log=True)

        print('val error')
        qa.evaluate_answer(session, padded_val, log=True)
Esempio n. 28
0
def main(_):
    """Pad the SQuAD id files, then train and evaluate once per epoch.

    Questions are padded to 40 tokens and contexts to 600. The train split is
    additionally checked so that no reported sequence length exceeds the
    padded width. Each epoch trains on the train split and evaluates on the
    validation split.

    Args:
        _: ignored positional argument.
    """
    max_q_len = 40
    max_c_len = 600
    data_dir = FLAGS.data_dir

    # ---- training split ----
    q_train, q_train_len = pad_sentences(
        pjoin(data_dir, "train.ids.question"), max_q_len)
    assert not any(
        q_train_len > q_train.shape[1]
    ), 'Some questions have length greater than max question length'

    c_train, c_train_len = pad_sentences(
        pjoin(data_dir, "train.ids.context"), max_c_len)
    assert not any(
        c_train_len > c_train.shape[1]
    ), 'Some contexts have length greater than max context length'

    span_train = get_answer_span(pjoin(data_dir, "train.span"), max_c_len)

    # ---- validation split (no length assertions here) ----
    q_val, q_val_len = pad_sentences(pjoin(data_dir, "val.ids.question"),
                                     max_q_len)
    c_val, c_val_len = pad_sentences(pjoin(data_dir, "val.ids.context"),
                                     max_c_len)
    span_val = get_answer_span(pjoin(data_dir, "val.span"), max_c_len)

    # Each split is [questions, question lengths, contexts, context lengths,
    # answer spans].
    dataset = {
        'train': [q_train, q_train_len, c_train, c_train_len, span_train],
        'val': [q_val, q_val_len, c_val, c_val_len, span_val],
    }

    glove_path = FLAGS.embed_path
    if not glove_path:
        glove_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
        glove_path = pjoin("data", "squad", glove_name)
    vocab_file = FLAGS.vocab_path
    if not vocab_file:
        vocab_file = pjoin(data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_file)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    # Original note: try without dropout.
    qa = QASystem(encoder, decoder, max_q_len, max_c_len, glove_path,
                  FLAGS.learning_rate, FLAGS.batch_size, FLAGS.dropout,
                  FLAGS.optimizer, FLAGS.max_gradient_norm)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    logging.getLogger().addHandler(logging.FileHandler(log_file))

    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as session:
        restore_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                               or FLAGS.train_dir)
        initialize_model(session, qa, restore_dir)

        checkpoint_dir = get_normalized_train_dir(FLAGS.train_dir)

        for epoch in range(FLAGS.epochs):
            qa.train(session, dataset['train'], checkpoint_dir)
            print('Finish training epoch {}'.format(epoch))
            qa.evaluate_answer(session, dataset['val'])
Esempio n. 29
0
def _read_int_rows(path):
    """Parse a text file of space-separated integers, one example per line.

    Args:
        path: path of the file to read.

    Returns:
        A list holding one list of ints per line of the stripped file content.
    """
    # Use a context manager so the handle is closed once the text is read.
    with open(path, "r") as handle:
        text = handle.read()
    return [[int(tok) for tok in row.strip().split(" ") if tok != '']
            for row in text.strip().split("\n")]


def _load_padded(path, max_len):
    """Read id rows from ``path`` and pad each one with padList.

    Returns:
        The transposed padList results; callers unpack them as
        ``(padded_rows, row_lengths)``.
    """
    return zip(*[padList(row, max_len) for row in _read_int_rows(path)])


def _keep_in_range(spans, limit, columns):
    """Drop every example whose answer start or end index is >= ``limit``.

    Args:
        spans: list of ``[start, end]`` answer index pairs.
        limit: exclusive upper bound for both indices.
        columns: sequences aligned with ``spans`` to filter the same way.

    Returns:
        ``(kept_spans, kept_columns)`` where ``kept_spans`` is a list and
        ``kept_columns`` is a list with one tuple per input column.
    """
    keep = [i for i, span in enumerate(spans)
            if span[0] < limit and span[1] < limit]
    kept_spans = [spans[i] for i in keep]
    kept_columns = [tuple(column[i] for i in keep) for column in columns]
    return kept_spans, kept_columns


def main(_):
    """Evaluate a restored coattention QA model at several sample rates.

    Reads the train and validation paragraphs, questions and answer spans,
    pads them, removes examples whose answer span lies outside the padded
    paragraph, builds the model, restores it, and then runs
    ``qa.evaluate_answer`` ten times on the validation set for each sample
    rate, printing the collected F1 and EM values.

    Args:
        _: ignored positional argument.
    """
    train_prefix = pjoin(FLAGS.data_dir, "train")
    val_prefix = pjoin(FLAGS.data_dir, "val")

    # Paragraphs and questions are padded id rows plus their lengths; answers
    # are start/end indices into the paragraph.
    paras, paraLens = _load_padded(train_prefix + ".ids.context",
                                   FLAGS.paraLen)
    questions, questionLens = _load_padded(train_prefix + ".ids.question",
                                           FLAGS.qLen)
    ans = _read_int_rows(train_prefix + ".span")

    parasVal, paraLensVal = _load_padded(val_prefix + ".ids.context",
                                         FLAGS.paraLen)
    questionsVal, questionLensVal = _load_padded(
        val_prefix + ".ids.question", FLAGS.qLen)
    ansVal = _read_int_rows(val_prefix + ".span")

    # Remove examples whose answer falls outside the padded paragraph. The
    # validation filter checks the validation spans only; it previously read
    # the end index from the training answers by mistake.
    ans, (paras, questions, paraLens, questionLens) = _keep_in_range(
        ans, FLAGS.paraLen, (paras, questions, paraLens, questionLens))
    ansVal, (parasVal, questionsVal, paraLensVal,
             questionLensVal) = _keep_in_range(
                 ansVal, FLAGS.paraLen,
                 (parasVal, questionsVal, paraLensVal, questionLensVal))

    # valDataset has paragraphs and questions as padded id rows, answers as
    # the start and end indices in the paragraph, and the length of each
    # paragraph and question for the validation dataset.
    valDataset = (parasVal, questionsVal, ansVal, paraLensVal, questionLensVal)
    # dataset has the same format for the training data, with valDataset as
    # the last element. It is built here but not passed on in this function.
    dataset = (paras, questions, ans, paraLens, questionLens, valDataset)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    # Alternatives noted in the original: Encoder instead of
    # CoattentionEncoder, ChunkDecoder instead of Decoder.
    encoder = CoattentionEncoder(size=FLAGS.state_size,
                                 vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    # Read the matrix inside a context manager so the archive is closed.
    with np.load(embed_path) as glove_archive:
        embeddings = glove_archive['glove']

    qa = QASystem(encoder, decoder, embeddings, vocab, rev_vocab)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir, saver)
        # The result is unused, but the call is kept because the helper may
        # have side effects on the training directory -- TODO confirm.
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        sample_rates = [1, 2, 5, 10]
        for rate in sample_rates:
            print("rate: ", rate)
            f1_l = []
            em_l = []
            # Repeat the evaluation ten times per rate and collect the scores.
            for _ in range(10):
                f1, em = qa.evaluate_answer(sess,
                                            valDataset,
                                            sample=len(valDataset[0]),
                                            sample_rate=rate)
                f1_l.append(f1)
                em_l.append(em)
            print(rate)
            print(f1_l)
            print(em_l)
Esempio n. 30
0
def main(_):
    '''Evaluate every model listed in the config and then their ensemble.

    Check Config.py to set up the model paths to be ensembled. For each path
    a fresh graph and session are created, the model is restored and
    evaluated, and its four result vectors are stored as one column of the
    train/val start/end matrices. After the last model, the matrices are
    reduced with bin_count and evaluated once more through ``sendin``.

    Args:
        _: ignored positional argument.
    '''

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Make sure the log, cache and figure directories exist.
    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    # Timestamp the log file name so each run gets its own file.
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)
    # One row per evaluated sample, one column per model.
    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        # Each model is built on a fresh default graph in its own session.
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)
            init = tf.global_variables_initializer()
            sess.run(init)
            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            # With ensemble=True the call yields four values, stored below as
            # this model's column of the train/val start/end matrices.
            ts, te, vs, ve = qa.evaluate_answer(sess,
                                                dataset,
                                                raw_answers,
                                                rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                # All models are done: reduce each matrix with bin_count
                # (presumably a per-sample vote across models -- TODO confirm)
                # and evaluate the combined result in the last session.
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess,
                                   dataset,
                                   raw_answers,
                                   rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval)