Example #1
0
def _words_without_markers(idx2word, indexes):
    """Translate word indexes to words, dropping the '_SOS_'/'_EOS_' markers."""
    words = (idx2word[index] for index in indexes)
    return [word for word in words if word not in ('_SOS_', '_EOS_')]


def train(train_set, test_set, idx2word_history, word2idx_history, idx2word_target, word2idx_target):
    """Build the dialogue encoder/decoder graph, train it and print sample predictions.

    Every utterance of a history is encoded along the word axis, the resulting
    utterance states are encoded along the utterance axis, and the target
    sequence is decoded starting from the final history state.

    Args:
        train_set: Mapping with 'features' and 'targets' arrays. 'features' is
            indexed as [example, utterance, word] and 'targets' as
            [example, word]; the static graph sizes are read from these shapes.
        test_set: Mapping with the same layout, used for the periodic
            evaluation and for the predictions printed at the end.
        idx2word_history: Index-to-word lookup for the history vocabulary; its
            length is used as the encoder vocabulary size.
        word2idx_history: Not used by this function.
        idx2word_target: Index-to-word lookup for the target vocabulary; its
            length is used as the decoder vocabulary size.
        word2idx_target: Not used by this function.

    Side effects:
        Reads hyper-parameters from the module-level ``FLAGS``, writes
        summaries to ``./log``, saves a checkpoint to ``.rnn-model.ckpt`` and
        prints progress and predictions to stdout.
    """
    # The scopes around these shape look-ups are kept exactly as before so the
    # sequence of scope entries seen by TensorFlow does not change.
    with tf.variable_scope("history_length"):
        history_length = train_set['features'].shape[1]

    encoder_lstm_size = 16*4
    encoder_embedding_size = 16*8
    encoder_vocabulary_length = len(idx2word_history)
    with tf.variable_scope("encoder_sequence_length"):
        encoder_sequence_length = train_set['features'].shape[2]

    decoder_lstm_size = 16*4
    decoder_embedding_size = 16*4
    decoder_vocabulary_length = len(idx2word_target)
    with tf.variable_scope("decoder_sequence_length"):
        decoder_sequence_length = train_set['targets'].shape[1]

    # inference model
    with tf.name_scope('model'):
        features = tf.placeholder("int32", name='features')
        targets = tf.placeholder("int32", name='true_targets')
        # NOTE: no op in this function consumes this placeholder; it is kept so
        # the graph still contains the 'use_dropout_prob' node.
        use_dropout_prob = tf.placeholder("float32", name='use_dropout_prob')

        with tf.variable_scope("batch_size"):
            batch_size = tf.shape(features)[0]

        encoder_embedding = embedding(
                input=features,
                length=encoder_vocabulary_length,
                size=encoder_embedding_size,
                name='encoder_embedding'
        )

        with tf.name_scope("UtterancesEncoder"):
            with tf.name_scope("RNNForwardUtteranceEncoderCell_1"):
                cell_fw_1 = LSTMCell(
                        num_units=encoder_lstm_size,
                        input_size=encoder_embedding_size,
                        use_peepholes=True
                )
                initial_state_fw_1 = cell_fw_1.zero_state(batch_size, tf.float32)

            with tf.name_scope("RNNBackwardUtteranceEncoderCell_1"):
                cell_bw_1 = LSTMCell(
                        num_units=encoder_lstm_size,
                        input_size=encoder_embedding_size,
                        use_peepholes=True
                )
                initial_state_bw_1 = cell_bw_1.zero_state(batch_size, tf.float32)

            with tf.name_scope("RNNForwardUtteranceEncoderCell_2"):
                cell_fw_2 = LSTMCell(
                        num_units=encoder_lstm_size,
                        input_size=cell_fw_1.output_size + cell_bw_1.output_size,
                        use_peepholes=True
                )
                initial_state_fw_2 = cell_fw_2.zero_state(batch_size, tf.float32)

            # the input data has these dimensions
            # [
            #   #batch,
            #   #utterance in a history (a dialogue),
            #   #word in an utterance (a sentence),
            #   embedding dimension
            # ]

            # encode all utterances along the word axis
            encoder_states_2d = []

            for utterance in range(history_length):
                # The same layers are applied to every utterance: they are
                # created for the first one and requested with reuse=True after.
                encoder_outputs, _ = brnn(
                        cell_fw=cell_fw_1,
                        cell_bw=cell_bw_1,
                        inputs=[encoder_embedding[:, utterance, word, :] for word in range(encoder_sequence_length)],
                        initial_state_fw=initial_state_fw_1,
                        initial_state_bw=initial_state_bw_1,
                        name='RNNUtteranceBidirectionalLayer',
                        reuse=True if utterance > 0 else None
                )

                _, encoder_states = rnn(
                        cell=cell_fw_2,
                        inputs=encoder_outputs,
                        initial_state=initial_state_fw_2,
                        name='RNNUtteranceForwardEncoder',
                        reuse=True if utterance > 0 else None
                )

                # Keep only the last state and give it an explicit utterance
                # axis so all utterances can be concatenated along axis 1.
                encoder_states = tf.concat(1, tf.expand_dims(encoder_states[-1], 1))
                encoder_states_2d.append(encoder_states)

            encoder_states_2d = tf.concat(1, encoder_states_2d)

        with tf.name_scope("HistoryEncoder"):
            # encode all histories along the utterance axis
            with tf.name_scope("RNNFrowardHistoryEncoderCell_1"):
                cell_fw_1 = LSTMCell(
                        num_units=encoder_lstm_size,
                        input_size=cell_fw_2.state_size,
                        use_peepholes=True
                )
                initial_state_fw_1 = cell_fw_1.zero_state(batch_size, tf.float32)

            with tf.name_scope("RNNBackwardHistoryEncoderCell_1"):
                cell_bw_1 = LSTMCell(
                        num_units=encoder_lstm_size,
                        input_size=cell_fw_2.state_size,
                        use_peepholes=True
                )
                # The zero state must come from the backward cell itself, not
                # from the utterance-level cell_fw_2 that is still in scope.
                initial_state_bw_1 = cell_bw_1.zero_state(batch_size, tf.float32)

            with tf.name_scope("RNNFrowardHistoryEncoderCell_2"):
                cell_fw_2 = LSTMCell(
                        num_units=encoder_lstm_size,
                        input_size=cell_fw_1.output_size + cell_bw_1.output_size,
                        use_peepholes=True
                )
                initial_state_fw_2 = cell_fw_2.zero_state(batch_size, tf.float32)

            encoder_outputs, _ = brnn(
                    cell_fw=cell_fw_1,
                    cell_bw=cell_bw_1,
                    inputs=[encoder_states_2d[:, utterance, :] for utterance in range(history_length)],
                    initial_state_fw=initial_state_fw_1,
                    initial_state_bw=initial_state_bw_1,
                    name='RNNHistoryBidirectionalLayer',
                    reuse=None
            )

            _, encoder_states = rnn(
                    cell=cell_fw_2,
                    inputs=encoder_outputs,
                    initial_state=initial_state_fw_2,
                    name='RNNHistoryForwardEncoder',
                    reuse=None
            )

        with tf.name_scope("Decoder"):
            # Non-trainable scalar that starts at 1.0 and is multiplied by
            # FLAGS.use_inputs_prob_decay after every completed epoch.
            use_inputs_prob = tf.Variable(1.0, name='use_inputs_prob', trainable=False)
            use_inputs_prob_decay_op = use_inputs_prob.assign(use_inputs_prob * FLAGS.use_inputs_prob_decay)

            with tf.name_scope("RNNDecoderCell"):
                cell = LSTMCell(
                        num_units=decoder_lstm_size,
                        input_size=decoder_embedding_size,
                        use_peepholes=True,
                )

            # decoding starts from the last state of the history encoder
            final_encoder_state = encoder_states[-1]

            _, _, decoder_outputs_softmax = rnn_decoder(
                    cell=cell,
                    inputs=[targets[:, word] for word in range(decoder_sequence_length)],
                    initial_state=final_encoder_state,
                    embedding_size=decoder_embedding_size,
                    embedding_length=decoder_vocabulary_length,
                    sequence_length=decoder_sequence_length,
                    name='RNNDecoder',
                    reuse=False,
                    use_inputs_prob=use_inputs_prob
            )

            targets_give_features = tf.concat(1, decoder_outputs_softmax)

    if FLAGS.print_variables:
        for v in tf.trainable_variables():
            print(v.name)

    with tf.name_scope('loss'):
        one_hot_labels = dense_to_one_hot(targets, decoder_vocabulary_length)
        loss = tf.reduce_mean(- one_hot_labels * tf.log(targets_give_features), name='loss')
        # Add a mean-square penalty for every variable whose name contains one
        # of the markers (once per matching marker, as before).
        for v in tf.trainable_variables():
            for n in ['/W_', '/W:', '/B:']:
                if n in v.name:
                    print('Regularization using', v.name)
                    loss += FLAGS.regularization * tf.reduce_mean(tf.pow(v, 2))
        tf.scalar_summary('loss', loss)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(one_hot_labels, 2), tf.argmax(targets_give_features, 2))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
        tf.scalar_summary('accuracy', accuracy)

    # with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    with tf.Session() as sess:
        # Merge all the summaries and write them out to ./log
        merged = tf.merge_all_summaries()
        writer = tf.train.SummaryWriter('./log', sess.graph_def)
        # NOTE(review): the saver is created before the optimizer-related
        # variables below exist - confirm that excluding them from the
        # checkpoint is intended.
        saver = tf.train.Saver()

        # training
        tvars = tf.trainable_variables()
        learning_rate = tf.Variable(float(FLAGS.learning_rate), trainable=False)

        optimizer = AdamPlusOptimizer(
                learning_rate=learning_rate,
                beta1=FLAGS.beta1,
                beta2=FLAGS.beta2,
                epsilon=FLAGS.epsilon,
                pow=FLAGS.pow,
                use_locking=False,
                name='trainer')

        learning_rate_decay_op = learning_rate.assign(learning_rate * FLAGS.decay)
        global_step = tf.Variable(0, trainable=False)
        gradients = tf.gradients(loss, tvars)

        clipped_gradients, _ = tf.clip_by_global_norm(gradients, FLAGS.max_gradient_norm)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars), global_step=global_step)

        tf.initialize_all_variables().run()

        # prepare batch indexes; slicing past the end of the data simply
        # yields a shorter final batch
        train_set_size = train_set['features'].shape[0]
        print('Train set size:', train_set_size)
        batch_size = FLAGS.batch_size
        print('Batch size:', batch_size)
        batch_indexes = [[i, i + batch_size] for i in range(0, train_set_size, batch_size)]
        print('#Batches:', len(batch_indexes))

        test_feed = {features: test_set['features'], targets: test_set['targets']}
        # Evaluate roughly 100 times per run: at least every epoch and at
        # most every 100 epochs.
        eval_every = max(min(int(FLAGS.max_epochs / 100), 100), 1)

        previous_accuracies = []
        previous_losses = []
        for epoch in range(FLAGS.max_epochs):
            print('Batch: ', end=' ', flush=True)
            for b, (start, stop) in enumerate(batch_indexes):
                print(b, end=' ', flush=True)
                sess.run(
                        train_op,
                        feed_dict={
                            features: train_set['features'][start:stop],
                            targets: train_set['targets'][start:stop],
                        }
                )
            print()
            shuffle(batch_indexes)

            if epoch % eval_every == 0:
                summary, lss, acc = sess.run([merged, loss, accuracy], feed_dict=test_feed)
                writer.add_summary(summary, epoch)
                print()
                print('Epoch: {epoch}'.format(epoch=epoch))
                print(' - accuracy        = {acc:f}'.format(acc=acc))
                print(' - loss            = {lss:f}'.format(lss=lss))
                print(' - learning rate   = {lr:f}'.format(lr=learning_rate.eval()))
                print(' - use inputs prob = {uip:f}'.format(uip=use_inputs_prob.eval()))
                print()

                # decrease learning rate if the loss is worse than all of the
                # last 3 recorded losses
                if len(previous_losses) > 2 and lss > max(previous_losses[-3:]):
                    sess.run(learning_rate_decay_op)
                previous_losses.append(lss)

                # stop when the accuracy threshold is reached or when the best
                # accuracy was not matched within the last 20 evaluations
                previous_accuracies.append(acc)
                if acc > 0.9999 or max(previous_accuracies) > max(previous_accuracies[-20:]):
                    break

            sess.run(use_inputs_prob_decay_op)

        save_path = saver.save(sess, ".rnn-model.ckpt")
        print()
        print("Model saved in file: %s" % save_path)
        print()

        print('Shape of targets:', test_set['targets'].shape)
        print('Predictions')
        # Use fresh names for the evaluated arrays so the graph tensor and the
        # 'features' placeholder are not overwritten.
        predictions = sess.run(targets_give_features, feed_dict=test_feed)
        predictions_argmax = np.argmax(predictions, 2)
        print('Shape of predictions:', predictions.shape)
        print('Argmax predictions')
        print()

        # Show about ten evenly spaced test histories.
        n_histories = predictions_argmax.shape[0]
        for history in range(0, n_histories, max(int(n_histories/10), 1)):
            print('History', history)

            for j, utterance_indexes in enumerate(test_set['features'][history]):
                utterance = _words_without_markers(idx2word_history, utterance_indexes)
                print('U {j}: {c:80}'.format(j=j, c=' '.join(utterance)))

            prediction = _words_without_markers(idx2word_target, predictions_argmax[history])
            print('P  : {t:80}'.format(t=' '.join(prediction)))

            target = _words_without_markers(idx2word_target, test_set['targets'][history])
            print('T  : {t:80}'.format(t=' '.join(target)))
            print()
Example #2
0
    def __init__(self, data, FLAGS):
        """Build the max-pooling encoder / LSTM decoder graph and expose its tensors.

        Args:
            data: Object providing ``train_set``, ``test_set``,
                ``idx2word_history``, ``word2idx_history``, ``idx2word_target``
                and ``word2idx_target``. The static graph sizes are read from
                the shapes of ``train_set['features']`` and
                ``train_set['targets']``.
            FLAGS: Object providing ``print_variables`` and ``regularization``.
        """
        train_features = data.train_set['features']
        train_targets = data.train_set['targets']

        # The scopes around the shape look-ups are entered in the same order as
        # the rest of the graph construction expects.
        with tf.variable_scope("history_length"):
            history_length = train_features.shape[1]

        encoder_embedding_size = 32 * 4
        encoder_vocabulary_length = len(data.idx2word_history)
        with tf.variable_scope("encoder_sequence_length"):
            encoder_sequence_length = train_features.shape[2]

        decoder_lstm_size = 16 * 2
        decoder_embedding_size = 16 * 2
        decoder_vocabulary_length = len(data.idx2word_target)
        with tf.variable_scope("decoder_sequence_length"):
            decoder_sequence_length = train_targets.shape[1]

        # inference model
        with tf.name_scope('model'):
            features = tf.placeholder("int32", name='features')
            targets = tf.placeholder("int32", name='true_targets')
            # Declared for the graph only; nothing in this method consumes it.
            use_dropout_prob = tf.placeholder("float32", name='use_dropout_prob')

            with tf.variable_scope("batch_size"):
                batch_size = tf.shape(features)[0]

            encoder_embedding = embedding(
                input=features,
                length=encoder_vocabulary_length,
                size=encoder_embedding_size,
                name='encoder_embedding',
            )

            with tf.name_scope("UtterancesEncoder"):
                # Plain max-pooling over axis 2, kept as a size-one axis
                # (presumably the word axis - the layout produced by
                # `embedding` is not visible here).
                # An earlier variant with conv2d layers ([1, 3, emb, emb]
                # filters) followed by max_pool is disabled.
                encoded_utterances = tf.reduce_max(encoder_embedding, [2], keep_dims=True)

            with tf.name_scope("HistoryEncoder"):
                # Max over axes 1 and 2 collapses each history to one vector.
                # Earlier variants with conv2d layers ([3, 1, emb, emb]
                # filters) and up to three relu(linear(...)) projections are
                # disabled.
                encoded_history = tf.reduce_max(encoded_utterances, [1, 2])

            with tf.name_scope("Decoder"):
                use_inputs_prob = tf.placeholder("float32", name='use_inputs_prob')

                with tf.name_scope("RNNDecoderCell"):
                    # The cell input is sized for the decoder embedding plus
                    # the encoded history passed as static input below.
                    cell = LSTMCell(
                        num_units=decoder_lstm_size,
                        input_size=decoder_embedding_size + encoder_embedding_size,
                        use_peepholes=True,
                    )
                    initial_state = cell.zero_state(batch_size, tf.float32)

                # The encoded history is handed to the decoder as a static
                # input; the decoder state itself starts from zeros.
                final_encoder_state = encoded_history
                decoder_inputs = [targets[:, word] for word in range(decoder_sequence_length)]

                decoder_states, decoder_outputs, decoder_outputs_softmax = rnn_decoder(
                    cell=cell,
                    inputs=decoder_inputs,
                    static_input=final_encoder_state,
                    initial_state=initial_state,
                    embedding_size=decoder_embedding_size,
                    embedding_length=decoder_vocabulary_length,
                    sequence_length=decoder_sequence_length,
                    name='RNNDecoder',
                    reuse=False,
                    use_inputs_prob=use_inputs_prob,
                )

                targets_given_features = tf.concat(1, decoder_outputs_softmax)

        if FLAGS.print_variables:
            for variable in tf.trainable_variables():
                print(variable.name)

        with tf.name_scope('loss'):
            one_hot_labels = dense_to_one_hot(targets, decoder_vocabulary_length)
            loss = tf.reduce_mean(- one_hot_labels * tf.log(targets_given_features), name='loss')
            # One mean-square penalty per (variable, matching marker) pair.
            for variable in tf.trainable_variables():
                for marker in ['/W_', '/W:', '/B:']:
                    if marker not in variable.name:
                        continue
                    print('Regularization using', variable.name)
                    loss += FLAGS.regularization * tf.reduce_mean(tf.pow(variable, 2))
            tf.scalar_summary('loss', loss)

        with tf.name_scope('accuracy'):
            true_indexes = tf.argmax(one_hot_labels, 2)
            predicted_indexes = tf.argmax(targets_given_features, 2)
            correct_prediction = tf.equal(true_indexes, predicted_indexes)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
            tf.scalar_summary('accuracy', accuracy)

        # data handles
        self.data = data
        self.train_set = data.train_set
        self.test_set = data.test_set
        self.idx2word_history = data.idx2word_history
        self.word2idx_history = data.word2idx_history
        self.idx2word_target = data.idx2word_target
        self.word2idx_target = data.word2idx_target

        # static sizes
        self.history_length = history_length
        self.encoder_sequence_length = encoder_sequence_length

        # graph inputs and outputs
        self.features = features
        self.targets = targets
        self.batch_size = batch_size
        self.use_inputs_prob = use_inputs_prob
        self.targets_given_features = targets_given_features
        self.loss = loss
        self.accuracy = accuracy
Example #3
0
def train(train_set, test_set, idx2word, word2idx):
    """Build a small LSTM encoder/decoder graph, train it and print test predictions.

    Args:
        train_set: Mapping with 'features' and 'targets' arrays; the second
            dimension of each gives the encoder / decoder sequence length.
        test_set: Mapping with the same keys, used for the periodic evaluation
            and for the predictions printed at the end.
        idx2word: Index-to-word lookup; its length is used as the vocabulary
            size of both encoder and decoder.
        word2idx: Not used by this function.

    Side effects:
        Reads ``FLAGS.learning_rate`` and ``FLAGS.max_epochs``, writes
        summaries to ``./log``, saves a checkpoint to ``model.ckpt`` and
        prints progress and predictions to stdout.
    """
    encoder_lstm_size = 5
    encoder_embedding_size = 5
    encoder_vocabulary_length = len(idx2word)
    encoder_sequence_length = train_set['features'].shape[1]

    decoder_lstm_size = 5
    decoder_embedding_size = 5
    decoder_vocabulary_length = len(idx2word)
    decoder_sequence_length = train_set['targets'].shape[1]

    # inference model
    with tf.name_scope('model'):
        source = tf.placeholder("int32", name='input')
        reference = tf.placeholder("int32", name='true_output')

        with tf.variable_scope("batch_size"):
            batch_size = tf.shape(source)[0]

        encoder_embedding = embedding(
            input=source,
            length=encoder_vocabulary_length,
            size=encoder_embedding_size,
            name='encoder_embedding',
        )

        with tf.name_scope("RNNEncoderCell"):
            encoder_cell = LSTMCell(
                num_units=encoder_lstm_size,
                input_size=encoder_embedding_size,
                use_peepholes=False,
            )
            initial_state = encoder_cell.zero_state(batch_size, tf.float32)

        # one encoder input per position along axis 1 of the embedded batch
        encoder_inputs = [encoder_embedding[:, step, :] for step in range(encoder_sequence_length)]
        encoder_outputs, encoder_states = rnn(
            cell=encoder_cell,
            inputs=encoder_inputs,
            initial_state=initial_state,
            name='RNNForwardEncoder',
        )

        # the decoder starts from the last encoder state
        final_encoder_state = encoder_states[-1]

        with tf.name_scope("RNNDecoderCell"):
            decoder_cell = LSTMCell(
                num_units=decoder_lstm_size,
                input_size=decoder_embedding_size,
                use_peepholes=False,
            )

        decoder_states, decoder_outputs, decoder_outputs_softmax = rnn_decoder(
            cell=decoder_cell,
            initial_state=final_encoder_state,
            embedding_size=decoder_embedding_size,
            embedding_length=decoder_vocabulary_length,
            sequence_length=decoder_sequence_length,
            name='RNNDecoder',
        )

        p_o_i = tf.concat(1, decoder_outputs_softmax)

    with tf.name_scope('loss'):
        one_hot_labels = dense_to_one_hot(reference, decoder_vocabulary_length)
        loss = tf.reduce_mean(-one_hot_labels * tf.log(p_o_i), name='loss')
        tf.scalar_summary('loss', loss)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(one_hot_labels, 2), tf.argmax(p_o_i, 2))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
        tf.scalar_summary('accuracy', accuracy)

    with tf.Session() as sess:
        # Merge all the summaries and write them out to ./log
        merged = tf.merge_all_summaries()
        writer = tf.train.SummaryWriter('./log', sess.graph_def)
        saver = tf.train.Saver()

        # training: every step feeds the complete training set
        train_op = tf.train.AdamOptimizer(FLAGS.learning_rate, name='trainer').minimize(loss)
        tf.initialize_all_variables().run()

        train_feed = {source: train_set['features'], reference: train_set['targets']}
        test_feed = {source: test_set['features'], reference: test_set['targets']}
        # report about 100 times per run, but at least every epoch
        report_every = max(int(FLAGS.max_epochs / 100), 1)

        for epoch in range(FLAGS.max_epochs):
            sess.run(train_op, feed_dict=train_feed)

            if epoch % report_every != 0:
                continue

            summary, lss, acc = sess.run([merged, loss, accuracy], feed_dict=test_feed)
            writer.add_summary(summary, epoch)
            print()
            print('Epoch: {epoch}'.format(epoch=epoch))
            print(' - accuracy = {acc}'.format(acc=acc))
            print(' - loss     = {lss}'.format(lss=lss))

        save_path = saver.save(sess, "model.ckpt")
        print()
        print("Model saved in file: %s" % save_path)
        print()

        print('Test features')
        print(test_set['features'])
        print('Test targets')
        print('Shape of targets:', test_set['targets'].shape)
        print(test_set['targets'])
        print('Predictions')
        probabilities = sess.run(p_o_i, feed_dict=test_feed)
        predicted_indexes = np.argmax(probabilities, 2)
        print('Shape of predictions:', probabilities.shape)
        print('Argmax predictions')
        print(predicted_indexes)
        print()
        # one output line per test example, without the sequence markers
        for row in range(predicted_indexes.shape[0]):
            for column in range(predicted_indexes.shape[1]):
                word = idx2word[predicted_indexes[row, column]]
                if word in ['_SOS_', '_EOS_']:
                    continue
                print(word, end=' ')
            print()
Example #4
0
    def __init__(self, data, FLAGS):
        """Build the hierarchical LSTM encoder / LSTM decoder graph and expose its tensors.

        Every utterance of a history is encoded along the word axis, the
        resulting utterance states are encoded along the utterance axis, and
        the final history state is passed to the decoder as a static input.

        Args:
            data: Object providing ``train_set``, ``test_set``,
                ``idx2word_history``, ``word2idx_history``, ``idx2word_target``
                and ``word2idx_target``. The static graph sizes are read from
                the shapes of ``train_set["features"]`` and
                ``train_set["targets"]``.
            FLAGS: Object providing ``print_variables`` and ``regularization``.
        """
        with tf.variable_scope("history_length"):
            history_length = data.train_set["features"].shape[1]

        encoder_lstm_size = 16
        encoder_embedding_size = 16 * 2
        encoder_vocabulary_length = len(data.idx2word_history)
        with tf.variable_scope("encoder_sequence_length"):
            encoder_sequence_length = data.train_set["features"].shape[2]

        decoder_lstm_size = 16
        decoder_embedding_size = 16
        decoder_vocabulary_length = len(data.idx2word_target)
        with tf.variable_scope("decoder_sequence_length"):
            decoder_sequence_length = data.train_set["targets"].shape[1]

        # inference model
        with tf.name_scope("model"):
            features = tf.placeholder("int32", name="features")
            targets = tf.placeholder("int32", name="true_targets")
            # NOTE: no op in this method consumes this placeholder; it is kept
            # so the graph still contains the "use_dropout_prob" node.
            use_dropout_prob = tf.placeholder("float32", name="use_dropout_prob")

            with tf.variable_scope("batch_size"):
                batch_size = tf.shape(features)[0]

            encoder_embedding = embedding(
                input=features, length=encoder_vocabulary_length, size=encoder_embedding_size, name="encoder_embedding"
            )

            with tf.name_scope("UtterancesEncoder"):
                with tf.name_scope("RNNForwardUtteranceEncoderCell_1"):
                    cell_fw_1 = LSTMCell(
                        num_units=encoder_lstm_size, input_size=encoder_embedding_size, use_peepholes=True
                    )
                    initial_state_fw_1 = cell_fw_1.zero_state(batch_size, tf.float32)

                with tf.name_scope("RNNBackwardUtteranceEncoderCell_1"):
                    cell_bw_1 = LSTMCell(
                        num_units=encoder_lstm_size, input_size=encoder_embedding_size, use_peepholes=True
                    )
                    initial_state_bw_1 = cell_bw_1.zero_state(batch_size, tf.float32)

                with tf.name_scope("RNNForwardUtteranceEncoderCell_2"):
                    cell_fw_2 = LSTMCell(
                        num_units=encoder_lstm_size,
                        input_size=cell_fw_1.output_size + cell_bw_1.output_size,
                        use_peepholes=True,
                    )
                    initial_state_fw_2 = cell_fw_2.zero_state(batch_size, tf.float32)

                # the input data has these dimensions
                # [
                #   #batch,
                #   #utterance in a history (a dialogue),
                #   #word in an utterance (a sentence),
                #   embedding dimension
                # ]

                # encode all utterances along the word axis
                encoder_states_2d = []

                for utterance in range(history_length):
                    # The same layers are applied to every utterance: they are
                    # created for the first one and requested with reuse=True
                    # afterwards.
                    encoder_outputs, _ = brnn(
                        cell_fw=cell_fw_1,
                        cell_bw=cell_bw_1,
                        inputs=[encoder_embedding[:, utterance, word, :] for word in range(encoder_sequence_length)],
                        initial_state_fw=initial_state_fw_1,
                        initial_state_bw=initial_state_bw_1,
                        name="RNNUtteranceBidirectionalLayer",
                        reuse=True if utterance > 0 else None,
                    )

                    _, encoder_states = rnn(
                        cell=cell_fw_2,
                        inputs=encoder_outputs,
                        initial_state=initial_state_fw_2,
                        name="RNNUtteranceForwardEncoder",
                        reuse=True if utterance > 0 else None,
                    )

                    # Keep only the last state and give it an explicit
                    # utterance axis so all utterances can be concatenated
                    # along axis 1.
                    encoder_states = tf.concat(1, tf.expand_dims(encoder_states[-1], 1))
                    encoder_states_2d.append(encoder_states)

                encoder_states_2d = tf.concat(1, encoder_states_2d)

            with tf.name_scope("HistoryEncoder"):
                # encode all histories along the utterance axis
                with tf.name_scope("RNNForwardHistoryEncoderCell_1"):
                    cell_fw_1 = LSTMCell(
                        num_units=encoder_lstm_size, input_size=cell_fw_2.state_size, use_peepholes=True
                    )
                    initial_state_fw_1 = cell_fw_1.zero_state(batch_size, tf.float32)

                with tf.name_scope("RNNBackwardHistoryEncoderCell_1"):
                    cell_bw_1 = LSTMCell(
                        num_units=encoder_lstm_size, input_size=cell_fw_2.state_size, use_peepholes=True
                    )
                    # The zero state must come from the backward cell itself,
                    # not from the utterance-level cell_fw_2 still in scope.
                    initial_state_bw_1 = cell_bw_1.zero_state(batch_size, tf.float32)

                with tf.name_scope("RNNForwardHistoryEncoderCell_2"):
                    cell_fw_2 = LSTMCell(
                        num_units=encoder_lstm_size,
                        input_size=cell_fw_1.output_size + cell_bw_1.output_size,
                        use_peepholes=True,
                    )
                    initial_state_fw_2 = cell_fw_2.zero_state(batch_size, tf.float32)

                encoder_outputs, _ = brnn(
                    cell_fw=cell_fw_1,
                    cell_bw=cell_bw_1,
                    inputs=[encoder_states_2d[:, utterance, :] for utterance in range(history_length)],
                    initial_state_fw=initial_state_fw_1,
                    initial_state_bw=initial_state_bw_1,
                    name="RNNHistoryBidirectionalLayer",
                    reuse=None,
                )

                _, encoder_states = rnn(
                    cell=cell_fw_2,
                    inputs=encoder_outputs,
                    initial_state=initial_state_fw_2,
                    name="RNNHistoryForwardEncoder",
                    reuse=None,
                )

            with tf.name_scope("Decoder"):
                use_inputs_prob = tf.placeholder("float32", name="use_inputs_prob")

                with tf.name_scope("RNNDecoderCell"):
                    # The cell input is sized for the decoder embedding plus
                    # the history state passed as static input below.
                    cell = LSTMCell(
                        num_units=decoder_lstm_size,
                        input_size=decoder_embedding_size + cell_fw_2.state_size,
                        use_peepholes=True,
                    )
                    initial_state = cell.zero_state(batch_size, tf.float32)

                # The last history-encoder state is handed to the decoder as a
                # static input; the decoder state itself starts from zeros.
                final_encoder_state = encoder_states[-1]

                _, _, decoder_outputs_softmax = rnn_decoder(
                    cell=cell,
                    inputs=[targets[:, word] for word in range(decoder_sequence_length)],
                    static_input=final_encoder_state,
                    initial_state=initial_state,
                    embedding_size=decoder_embedding_size,
                    embedding_length=decoder_vocabulary_length,
                    sequence_length=decoder_sequence_length,
                    name="RNNDecoder",
                    reuse=False,
                    use_inputs_prob=use_inputs_prob,
                )

                targets_given_features = tf.concat(1, decoder_outputs_softmax)

        if FLAGS.print_variables:
            for v in tf.trainable_variables():
                print(v.name)

        with tf.name_scope("loss"):
            one_hot_labels = dense_to_one_hot(targets, decoder_vocabulary_length)
            loss = tf.reduce_mean(-one_hot_labels * tf.log(targets_given_features), name="loss")
            # Add a mean-square penalty for every variable whose name contains
            # one of the markers (once per matching marker).
            for v in tf.trainable_variables():
                for n in ["/W_", "/W:", "/B:"]:
                    if n in v.name:
                        print("Regularization using", v.name)
                        loss += FLAGS.regularization * tf.reduce_mean(tf.pow(v, 2))
            tf.scalar_summary("loss", loss)

        with tf.name_scope("accuracy"):
            correct_prediction = tf.equal(tf.argmax(one_hot_labels, 2), tf.argmax(targets_given_features, 2))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            tf.scalar_summary("accuracy", accuracy)

        # data handles
        self.data = data
        self.train_set = data.train_set
        self.test_set = data.test_set
        self.idx2word_history = data.idx2word_history
        self.word2idx_history = data.word2idx_history
        self.idx2word_target = data.idx2word_target
        self.word2idx_target = data.word2idx_target

        # static sizes and graph tensors used by the training code
        self.history_length = history_length
        self.encoder_sequence_length = encoder_sequence_length
        self.features = features
        self.targets = targets
        self.batch_size = batch_size
        self.use_inputs_prob = use_inputs_prob
        self.targets_given_features = targets_given_features
        self.loss = loss
        self.accuracy = accuracy