Ejemplo n.º 1
0
def main(argv):
    """Train a character-level RNN language model (TF 1.x graph mode).

    Reads one text file, builds a vocabulary, trains a multi-layer GRU
    for STEPS batches, checkpoints every 100 steps, and prints sample
    inferences from the current model.

    Args:
        argv: Command-line arguments (passed to the module's argparser).
    """
    argparser = build_argparser()
    args = argparser.parse_args(argv)

    # Read the entire training corpus into memory.
    with open(args.data, encoding=ENCODING) as i_:
        text = i_.read()

    # Build the vocabulary from the single corpus text, then create an
    # endless batch generator over the encoded token ids.
    text_encoder = utils.TextEncoder()
    text_encoder.build_vocab([text], VOCAB_SIZE)
    gen = build_batch_generator(text_encoder.encode(text), BATCH_SIZE, MAX_LEN)

    # (batch, time) int32 placeholders; time dimension left dynamic.
    inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
    targets = tf.placeholder(tf.int32, (None, None), name='targets')

    rnn_cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(HIDDEN_SIZE) for _ in range(N_LAYERS)])
    initial_state = rnn_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
    # rnn_model is defined elsewhere; presumably returns (training decoder
    # output, free-running inference output) — TODO confirm.
    train_outputs, inference_outputs = rnn_model(inputs, MAX_LEN, rnn_cell,
                                                 initial_state, EMB_SIZE,
                                                 VOCAB_SIZE)

    with tf.variable_scope('loss'):
        # All-ones mask: every target position contributes equally
        # (sequences are fixed-length here, so no padding to mask out).
        masks = tf.ones_like(targets, dtype=tf.float32)
        loss = tf.contrib.seq2seq.sequence_loss(train_outputs.rnn_output,
                                                targets, masks)

    with tf.variable_scope('opt'):
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss)

    checkpoint_path = os.path.join(MODEL_DIR, 'model.ckpt')
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sum_loss = 0
        for step in range(1, STEPS + 1):
            inputs_, outputs_ = next(gen)
            loss_, _ = sess.run([loss, train_op],
                                feed_dict={
                                    inputs: inputs_,
                                    targets: outputs_
                                })
            sum_loss += loss_
            if step % 100 == 0:
                saver.save(sess, checkpoint_path, global_step=step)

                # Report mean loss over the last 100 steps, then reset.
                print(step, sum_loss / 100)
                sum_loss = 0
                # Show sample inference results for the most recent inputs_,
                # seeding generation with only the first token of each row.
                inferences = sess.run(inference_outputs.sample_id,
                                      feed_dict={inputs: inputs_[:, :1]})

                # Print the seed token and generated continuation for the
                # first 3 batch rows (`_` is reused as the row index).
                for _ in range(3):
                    print('---- SEED ----')
                    print(text_encoder.decode(inputs_[_, :1]))
                    print('---- OUTPUT ----')
                    print(text_encoder.decode(inferences[_]))
Ejemplo n.º 2
0
def load_data(dataset, opt, vocab, vocabulary_path):
    """Load a dataset and build a BPE text encoder matched to its vocab.

    Args:
        dataset: Dataset name, either ``"atomic"`` or ``"conceptnet"``.
        opt: Options object forwarded to the dataset-specific loader.
        vocab: Vocabulary forwarded to the dataset-specific loader.
        vocabulary_path: Path prefix holding the pretrained BPE files
            (presumably ends with a path separator — confirm at call site).

    Returns:
        A ``(data_loader, text_encoder)`` tuple.

    Raises:
        ValueError: If ``dataset`` is not a recognized dataset name.
    """
    if dataset == "atomic":
        data_loader = load_atomic_data(opt, vocab)
    elif dataset == "conceptnet":
        data_loader = load_conceptnet_data(opt, vocab)
    else:
        # Previously an unknown name fell through and crashed later with
        # UnboundLocalError on data_loader; fail fast with a clear message.
        raise ValueError("Unknown dataset: {}".format(dataset))

    # Initialize TextEncoder from the pretrained BPE files, then override
    # its encoder/decoder maps with the loader's so token ids line up with
    # the loaded data.
    encoder_path = vocabulary_path + "encoder_bpe_40000.json"
    bpe_path = vocabulary_path + "vocab_40000.bpe"
    text_encoder = utils.TextEncoder(encoder_path, bpe_path)
    text_encoder.encoder = data_loader.vocab_encoder
    text_encoder.decoder = data_loader.vocab_decoder

    return data_loader, text_encoder
Ejemplo n.º 3
0
def main(argv):
    """Train and evaluate a GRU text classifier (TF 1.x graph mode).

    Loads train/valid/test CSVs from ``args.data_dir``, builds a vocab
    from the training text, trains for STEPS batches with periodic
    validation and checkpointing, then restores the latest checkpoint
    and reports test loss/accuracy.

    Args:
        argv: Command-line arguments (passed to the module's argparser).
    """
    argparser = build_argparser()
    args = argparser.parse_args(argv)

    train_file = os.path.join(args.data_dir, TRAIN_FILE_NAME)
    valid_file = os.path.join(args.data_dir, VALID_FILE_NAME)
    test_file = os.path.join(args.data_dir, TEST_FILE_NAME)

    # CSVs are expected to have at least 'text' and 'label' columns.
    train_df = pd.read_csv(train_file)
    valid_df = pd.read_csv(valid_file)
    test_df = pd.read_csv(test_file)

    # Sorted so the label -> index mapping is deterministic across runs.
    label_names = sorted(train_df.label.unique())
    print('Labels: {}'.format(label_names))

    # Vocabulary is built from training text only (no leakage from
    # valid/test splits).
    text_encoder = utils.TextEncoder()
    text_encoder.build_vocab(train_df.text, VOCAB_SIZE)

    train_x, train_y = build_datamart(train_df, label_names, text_encoder,
                                      MAX_LEN)
    valid_x, valid_y = build_datamart(valid_df, label_names, text_encoder,
                                      MAX_LEN)
    test_x, test_y = build_datamart(test_df, label_names, text_encoder,
                                    MAX_LEN)

    train_gen = build_generator(train_x, train_y, BATCH_SIZE)

    # Fixed-length (batch, MAX_LEN) inputs; scalar label per example.
    inputs = tf.placeholder(tf.int32, (None, MAX_LEN), name='inputs')
    labels = tf.placeholder(tf.int32, (None, ), name='labels')

    rnn_cell = tf.contrib.rnn.GRUCell(HIDDEN_SIZE)
    # rnn_model is defined elsewhere; returns prediction, softmax, loss,
    # accuracy and the training op — TODO confirm its contract.
    pred, softmax, loss, acc, train_op = rnn_model(inputs, labels, rnn_cell,
                                                   len(label_names), EMB_SIZE,
                                                   text_encoder.vocab_size,
                                                   HIDDEN_SIZE)

    checkpoint_path = os.path.join(MODEL_DIR, 'model.ckpt')
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        loss_sum_ = 0
        for step in range(1, STEPS + 1):
            # Fetch a training batch.
            batch_x, batch_y = next(train_gen)
            loss_, _ = sess.run([loss, train_op],
                                feed_dict={
                                    inputs: batch_x,
                                    labels: batch_y
                                })
            loss_sum_ += loss_

            # Every 100 steps, check accuracy on the validation data.
            if step % 100 == 0:
                pred_, valid_loss_, acc_ = sess.run([pred, loss, acc],
                                                    feed_dict={
                                                        inputs: valid_x,
                                                        labels: valid_y
                                                    })
                # Training loss is averaged over the last 100 steps.
                avg_loss_ = loss_sum_ / 100
                loss_sum_ = 0
                print(
                    'step: {}, train loss: {}, valid loss: {}, acc: {}'.format(
                        step, avg_loss_, valid_loss_, acc_))
                saver.save(sess, checkpoint_path, global_step=step)

    # Test: restore the latest checkpoint in a fresh session and evaluate.
    # NOTE(review): get_checkpoint_state returns None when no checkpoint
    # exists (e.g. STEPS < 100), which would crash below — confirm STEPS.
    ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
    with tf.Session() as sess:
        saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path)
        pred_, loss_, acc_ = sess.run([pred, loss, acc],
                                      feed_dict={
                                          inputs: test_x,
                                          labels: test_y
                                      })
        print('loss: {}, acc: {}'.format(loss_, acc_))
Ejemplo n.º 4
0
def main(argv):
    """Train and run a GRU seq2seq model (TF 1.x graph mode).

    Builds separate input/output vocabularies from a training CSV,
    optionally resumes from an existing checkpoint, trains for STEPS
    batches with periodic checkpointing and sample inference, then (if
    test data is given) restores the latest checkpoint and decodes the
    full test set.

    Args:
        argv: Command-line arguments (passed to the module's argparser).
    """
    argparser = build_argparser()
    args = argparser.parse_args(argv)
    train_df = pd.read_csv(args.train_data)

    # Separate encoders for source and target sides; the CSV is expected
    # to have 'input' and 'output' text columns.
    input_text_encoder = utils.TextEncoder()
    input_text_encoder.build_vocab(train_df.input, INPUT_VOCAB_SIZE)
    output_text_encoder = utils.TextEncoder()
    output_text_encoder.build_vocab(train_df.output, OUTPUT_VOCAB_SIZE)
    train_gen = build_batch_generator(
        train_df.input.map(input_text_encoder.encode),
        train_df.output.map(output_text_encoder.encode), INPUT_MAX_LEN,
        OUTPUT_MAX_LEN, BATCH_SIZE)

    # Test inputs are encoded with the *input* vocabulary only.
    if args.test_data:
        test_df = pd.read_csv(args.test_data)
        test_inputs_, test_len_ = build_test_data(
            test_df.input.map(input_text_encoder.encode),
            INPUT_MAX_LEN,
        )

    encoder_cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(HIDDEN_SIZE) for _ in range(N_LAYERS)])

    decoder_cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(HIDDEN_SIZE) for _ in range(N_LAYERS)])

    # (batch, time) id sequences plus per-example true lengths.
    encoder_inputs = tf.placeholder(tf.int32, (None, None),
                                    name='encoder_inputs')
    encoder_len = tf.placeholder(tf.int32, (None, ), name='encoder_len')
    decoder_inputs = tf.placeholder(tf.int32, (None, None),
                                    name='decoder_inputs')
    decoder_len = tf.placeholder(tf.int32, (None, ), name='decoder_len')
    targets = tf.placeholder(tf.int32, (None, None), name='targets')

    # seq2seq is defined elsewhere; presumably returns (teacher-forced
    # training output, free-running inference output) — TODO confirm.
    train_outputs, inference_outputs = seq2seq(encoder_inputs, encoder_len,
                                               encoder_cell, decoder_inputs,
                                               decoder_len, OUTPUT_MAX_LEN,
                                               decoder_cell)

    with tf.variable_scope('loss'):
        # Mask out padded decoder positions so they don't contribute to
        # the loss.
        masks = tf.sequence_mask(decoder_len,
                                 OUTPUT_MAX_LEN,
                                 dtype=tf.float32,
                                 name='masks')
        loss = tf.contrib.seq2seq.sequence_loss(
            logits=train_outputs.rnn_output, targets=targets, weights=masks)
        tf.summary.scalar('train_loss', loss)
    with tf.variable_scope('opt'):
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss)

    checkpoint_path = os.path.join(MODEL_DIR, 'model.ckpt')
    saver = tf.train.Saver()
    if not args.no_train:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # Resume from an existing checkpoint when one is available.
            ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
            if ckpt:
                print('Use checkpoint file: ' + ckpt.model_checkpoint_path)
                saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path)
            sum_loss = 0
            for step in range(1, STEPS + 1):
                encoder_inputs_, encoder_len_, decoder_inputs_, decoder_outputs_, decoder_len_ = next(
                    train_gen)
                loss_, _ = sess.run(
                    [loss, train_op],
                    feed_dict={
                        encoder_inputs: encoder_inputs_,
                        encoder_len: encoder_len_,
                        decoder_inputs: decoder_inputs_,
                        decoder_len: decoder_len_,
                        targets: decoder_outputs_
                    })
                sum_loss += loss_
                if step % 100 == 0:
                    saver.save(sess, checkpoint_path, global_step=step)

                    # Report mean loss over the last 100 steps, then reset.
                    print(step, sum_loss / 100)
                    sum_loss = 0
                    # Show sample inference results for the most recent
                    # training batch; decoding starts from a single <BOS>
                    # token per example.
                    train_inferences = sess.run(
                        inference_outputs.sample_id,
                        feed_dict={
                            encoder_inputs:
                            encoder_inputs_,
                            encoder_len:
                            encoder_len_,
                            decoder_inputs: [[
                                utils.TextEncoder.RESERVED_TOKENS.index(
                                    '<BOS>')
                            ]] * len(encoder_inputs_)
                        })
                    # Print source/decoded pairs for the first 3 batch rows
                    # (`_` is reused as the row index).
                    for _ in range(3):
                        print('---- INPUT (TRAIN) ----')
                        print(input_text_encoder.decode(encoder_inputs_[_]))
                        print('---- OUTPUT ----')
                        print(output_text_encoder.decode(train_inferences[_]))

                    # Also inspect results for just the first 3 test examples.
                    if args.test_data:
                        test_inferences = sess.run(
                            inference_outputs.sample_id,
                            feed_dict={
                                encoder_inputs:
                                test_inputs_[:3],
                                encoder_len:
                                test_len_[:3],
                                decoder_inputs: [[
                                    utils.TextEncoder.RESERVED_TOKENS.index(
                                        '<BOS>')
                                ]] * len(test_inputs_[:3])
                            })
                        for _ in range(len(test_inferences)):
                            print('---- INPUT (TEST) ----')
                            print(input_text_encoder.decode(test_inputs_[_]))
                            print('---- OUTPUT ----')
                            print(
                                output_text_encoder.decode(test_inferences[_]))
    # Final decoding pass: restore the latest checkpoint in a fresh
    # session and decode the entire test set.
    if args.test_data:
        ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
        print('Use checkpoint file: ' + ckpt.model_checkpoint_path)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path)
            inferences = sess.run(
                inference_outputs.sample_id,
                feed_dict={
                    encoder_inputs:
                    test_inputs_,
                    encoder_len:
                    test_len_,
                    decoder_inputs:
                    [[utils.TextEncoder.RESERVED_TOKENS.index('<BOS>')]] *
                    len(test_inputs_)
                })
            for _ in range(len(test_inputs_)):
                print('---- INPUT ----')
                print(input_text_encoder.decode(test_inputs_[_]))
                print('---- OUTPUT ----')
                print(output_text_encoder.decode(inferences[_]))