Code example #1
def test_mask_dataset():
    logging.info("test_mask_dataset")
    dataset = mask_dataset(data_dir,
                           set_names=['train', 'val'],
                           suffixes=['context', 'question'])
    print dataset.keys()
    for k, v in dataset.iteritems():
        print k, len(v), v[0]
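
Note: mask_dataset, data_dir, and the other helpers above come from the jeffrey1hu/qa_system project and are not shown here. Judging from code example #3, each dataset entry (e.g. dataset['train_context'][i]) is a pair of a fixed-length padded token-id sequence and a matching boolean mask. A minimal sketch of that layout, using a hypothetical pad_and_mask helper and a toy max_len of 6 instead of the project's 400:

import numpy as np

def pad_and_mask(token_ids, max_len=6, pad_id=0):
    # Hypothetical helper: truncate/pad one token-id sequence to max_len
    # and build the matching boolean mask (True marks real tokens).
    ids = list(token_ids)[:max_len]
    mask = [True] * len(ids) + [False] * (max_len - len(ids))
    ids = ids + [pad_id] * (max_len - len(ids))
    return ids, mask

# Two toy "context" sequences, analogous to dataset['train_context'].
train_context = [pad_and_mask(s) for s in ([4, 8, 15], [16, 23, 42, 7])]
inputs = np.array([x[0] for x in train_context])  # shape (2, 6)
masks = np.array([x[1] for x in train_context])   # shape (2, 6)
print(inputs.shape, masks.shape)
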
Code example #2
File: model_test.py  Project: jeffrey1hu/qa_system
def main():
    logging.basicConfig(level=logging.INFO)
    file_handler = logging.FileHandler('log_test.txt')
    logging.getLogger().addHandler(file_handler)

    answer = read_answers(data_dir)
    dataset = mask_dataset(data_dir, set_names=set_names, suffixes=suffixes)

    embed_path = pjoin(data_dir, "glove.trimmed.100.npz")
    embedding = np.load(embed_path)['glove']

    test_model(100, dataset['train_context'][:100],
               dataset['train_question'][:100], embedding,
               answer['train_answer'][:100])
Code example #3
File: rnn_test.py  Project: jeffrey1hu/qa_system
def rnn_test():

    embed_path = pjoin(data_dir, "glove.trimmed.100.npz")
    embedding = np.load(embed_path)['glove']

    dataset = mask_dataset(data_dir, set_names=set_names, suffixes=suffixes)
    test_data = dataset['train_context'][:2]
    inputs = [x[0] for x in test_data]
    masks = [x[1] for x in test_data]

    inputs = np.array(inputs)
    print('shape of inputs {}'.format(inputs.shape))
    masks = np.array(masks)
    print('shape of masks {}'.format(masks.shape))

    with tf.Graph().as_default():
        # embedding_tf = tf.Variable(embedding)
        x = tf.placeholder(tf.int32, (None, 400))
        x_m = tf.placeholder(tf.bool, (None, 400))
        l_x = tf.placeholder(tf.int32, (None,))
        print(x)
        print(x_m)
        print(l_x)

        embed = tf.nn.embedding_lookup(embedding, x)
        # x_in = tf.boolean_mask(embed, x_m)
        print('shape of embed {}'.format(embed.shape))
        # print('shape of x_in {}'.format(x_in.shape))

        num_hidden = 5
        lstm_fw_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
        lstm_bw_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
        outputs, outputs_states = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell, lstm_bw_cell, embed,
            sequence_length=sequence_length(x_m), dtype=tf.float64)
        outputs = tf.concat(outputs, axis=2)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            outp, outps = sess.run([outputs, outputs_states],
                                   feed_dict={x: inputs, x_m: masks})
            # print('shape of input embeddings is : {}'.format(xin.shape))
            print("shape of output is :{}".format(outp.shape))
            assert outp.shape == (2, 400, 2 * num_hidden), 'the shape of outp should be {} but it is {}'\
                .format((2, 400, 2 * num_hidden), outp.shape)
            print(outp)
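
rnn_test() calls a sequence_length helper that is defined elsewhere in the project. Assuming it simply counts the True entries in each row of the (batch, max_len) boolean mask to recover per-example lengths, a minimal sketch would be:

import tensorflow as tf

def sequence_length(mask):
    # Assumed behaviour: per-row count of True entries in a boolean mask,
    # giving the unpadded length of each sequence in the batch.
    return tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
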
Code example #4
def main(_):
    '''Check Config.py to set up the model paths to be ensembled.'''

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)
    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)
            init = tf.global_variables_initializer()
            sess.run(init)
            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            ts, te, vs, ve = qa.evaluate_answer(sess,
                                                dataset,
                                                raw_answers,
                                                rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess,
                                   dataset,
                                   raw_answers,
                                   rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval)
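
bin_count is also project code; given that it collapses each (num_eval, num_m) prediction matrix into a single column before the final evaluate_answer call, it presumably takes a per-example majority vote over the ensembled models. A rough sketch under that assumption:

import numpy as np

def bin_count(preds):
    # Assumed behaviour: for each row (one example), return the index
    # predicted by the most models, i.e. a per-example majority vote.
    return np.array([np.bincount(row).argmax() for row in preds])

# e.g. predictions for 3 examples from 4 ensembled models
votes = np.array([[5, 5, 7, 5],
                  [2, 3, 3, 3],
                  [9, 9, 1, 1]], dtype=np.int32)
print(bin_count(votes))  # -> [5 3 1] (ties resolve to the smaller index)
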
Code example #5
def main(_):

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embed_path = pjoin(data_dir,
                       "glove.trimmed." + str(cfg.embed_size) + ".npz")
    # logging.info('embed size: {} for path {}'.format(cfg.embed_size, embed_path))
    # embedding = np.load(embed_path)['glove']

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(cfg.train_dir)
        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)
        qa.train(
            cfg.start_lr,
            sess,
            dataset,
            answers,
            save_train_dir,
            raw_answers=raw_answers,
            # debug_num=1000,
            rev_vocab=rev_vocab)
        qa.evaluate_answer(sess,
                           dataset,
                           raw_answers,
                           rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)
Code example #6
def main(_):
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    num_hidden = cfg.lstm_num_hidden
    data_dir = cfg.DATA_DIR
    embed_path = pjoin(data_dir,
                       "glove.trimmed." + str(cfg.embed_size) + ".npz")
    vocab_path = pjoin(data_dir, cfg.vocab_file)

    dataset = mask_dataset(data_dir, set_names=set_names, suffixes=suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab, rev_vocab = initialize_vocab(vocab_path)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    encoder = Encoder(size=2 * num_hidden)
    decoder = Decoder(output_size=2 * num_hidden)

    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:

        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        tic = time.time()
        qa.train(
            sess,
            dataset,
            answers,
            save_train_dir,
            raw_answers,
            rev_vocab,
            # debug_num=1000
        )
        #
        qa.evaluate_answer(sess,
                           dataset,
                           raw_answers,
                           rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)
        toc = time.time()
        logging.info("Total training process took {} hours".format(
            (toc - tic) / 3600.))
Code example #7
def main(_):
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    args = parse_arg()
    update_config(args, c_time)
    # pprint.pprint(cfg)
    logging.info(cfg)
    if args.test:
        pdb.set_trace()

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    embed_path = pjoin(data_dir, "glove.trimmed." + str(cfg.embed_size) + ".npz")
    # logging.info('embed size: {} for path {}'.format(cfg.embed_size, embed_path))
    # embedding = np.load(embed_path)['glove']

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)
    file_handler = logging.FileHandler(pjoin(cfg.log_dir, 'log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(cfg.train_dir)
        logging.info('=========== trainable variables ============')
        for i in tf.trainable_variables():
            logging.info(i.name)
        logging.info('=========== regularized variables ============')
        for i in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(i.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)
        if args.test:
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     debug_num=100,
                     rev_vocab=rev_vocab)
        else:
            qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                     raw_answers=raw_answers,
                     rev_vocab=rev_vocab)
        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)
Code example #8
def main(_):
    '''Check Config.py to set up the model paths to be ensembled.'''

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)
    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)
            init = tf.global_variables_initializer()
            sess.run(init)
            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            ts, te, vs, ve = qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval)