def _build_word_projections(self):
    """Build the decoder output projection and the word-embedding matrix.

    Creates a Dense output projection (rnn_size -> softmax vocabulary),
    stores it on ``self.decoder_output_layer``, then creates a trainable
    embedding variable of shape ``[softmax_size, rnn_word_size]``.  For
    word-level tokens the embedding variable is pinned to the CPU
    (word vocabularies are typically too large for GPU placement).

    Returns:
        tf.Variable: the embedding map, also stored as
        ``self._word_embed_map``.
    """
    import contextlib  # local import: only nullcontext is needed here

    c = self._config
    softmax_size = self._softmax_size

    # Output projection from the RNN hidden state to the softmax vocab.
    dec_out_layer = Dense(softmax_size, name='output_projection')
    dec_out_layer.build(c.rnn_size)
    self.decoder_output_layer = dec_out_layer

    print('INFO: Building separate embedding matrix.')
    # Pin word-level embeddings to the CPU; otherwise let TF choose.
    # (Single creation path replaces the former duplicated if/else call.)
    place_var_on_cpu = c.token_type == 'word'
    device_ctx = (tf.device('/cpu:0') if place_var_on_cpu
                  else contextlib.nullcontext())
    with device_ctx:
        self._word_embed_map = tf.get_variable(
            name='embedding_map',
            shape=[softmax_size, c.rnn_word_size],
            dtype=tf.float32,
            trainable=True)
    return self._word_embed_map
    def __init__(self, features: dict, hyperparameters: dict, is_train: bool):
        """Build a TF1 graph for grid-LSTM image captioning with visual concepts.

        Encoder: projected grid features are scanned by one shared Grid2LSTM
        cell in four directions (inputs reversed along the grid axes) and the
        four output sequences are summed.  Decoder: a two-layer LSTM; the
        first layer's initial state comes from running the projected
        visual-concept vector through it once, the second layer attends
        (Bahdanau) over the grid-LSTM outputs.

        Training: shuffled TFRecord batches, padding-masked per-word
        cross-entropy minimized with RMSProp.  Inference: beam search of
        width 20 over tiled encoder outputs and states.

        NOTE(review): several local names spell "deocder" (sic); the typo is
        consistent within this method and harmless, but worth cleaning up.
        """
        batch_size = hyperparameters['batch_size']
        grid_size = hyperparameters['grid_size']
        vocab_size = hyperparameters['vocab_size']
        word_emb_size = hyperparameters['word_emb_size']
        grid_emb_size = hyperparameters['grid_emb_size']
        grid_feat_dim = hyperparameters['grid_feat_dim']

        # Create the projection layer and the word embedding matrix:
        # (the transposed Dense kernel doubles as the embedding matrix)
        init = tf.constant_initializer(
            0.01 *
            numpy.random.uniform(-1, 1, size=(vocab_size, word_emb_size)))
        projection_layer = Dense(vocab_size,
                                 use_bias=True,
                                 kernel_initializer=init)
        projection_layer.build((vocab_size, word_emb_size))
        emb = tf.transpose(projection_layer.trainable_weights[0]
                           )  # word_emb_size x vocab_size

        # Retrieve information from hyperparameters and a batch of tf records:
        if is_train:
            dropout_keep_rate = hyperparameters['dropout_keep_rate']
            grid_feat_batch, visual_concept_batch, caption_batch, target_batch = tf.train.shuffle_batch(
                [
                    features['grid_feat'], features['visual_concept'],
                    features['caption'], features['target']
                ],
                batch_size=batch_size,
                capacity=20000,
                min_after_dequeue=200)

            # The caption_batch and target_batch are sparse matrices;
            # we need to convert them to dense matrices (padding id 0):
            caption_batch_dense = tf.sparse_tensor_to_dense(
                sp_input=caption_batch, default_value=0)  # B x max_len
            target_batch_dense = tf.sparse_tensor_to_dense(
                sp_input=target_batch, default_value=0)  # B x max_len

            # embedding the ids of the captions:
            caption_batch_dense_emb = tf.nn.embedding_lookup(
                emb, caption_batch_dense)  # B x max_len x word_emb_size
        else:
            # at the test time we do not have a caption, only image features
            # and visual concepts
            grid_feat_batch = features['grid_feat']
            visual_concept_batch = features['visual_concept']
            dropout_keep_rate = 1.0

        lstm_decoder_first_layer = tf.nn.rnn_cell.LSTMCell(
            num_units=word_emb_size)

        # Project the visual-concept vector into the word-embedding space.
        W_visual_concept = tf.Variable(
            0.01 * tf.random_normal([VISUAL_CONCEPT_SIZE, word_emb_size]))
        b_visual_concept = tf.Variable(tf.zeros([word_emb_size]))
        visual_concept_proj = tf.tensordot(
            visual_concept_batch, W_visual_concept,
            [[1], [0]]) + b_visual_concept  # B x word_emb_size
        visual_concept_proj = tf.reshape(
            visual_concept_proj,
            [-1, 1, word_emb_size])  # B x 1 x word_emb_size

        # One RNN step over the projected concepts yields the initial state
        # of the decoder's first layer.
        _, deocder_first_layer_init_state = tf.nn.dynamic_rnn(
            lstm_decoder_first_layer, visual_concept_proj, dtype=tf.float32)
        # init_st is B x word_emb_size

        self.__grid_feat_batch = grid_feat_batch  # B x (grid_size * grid_size * grid_feat_dim)

        grid_feat_batch = tf.reshape(
            grid_feat_batch, [-1, grid_size * grid_size, grid_feat_dim
                              ])  # B x (grid_size * grid_size) x grid_feat_dim

        # project image features into a space of dimension grid_emb_size using
        # a densely-connected layer:
        W_grid = tf.Variable(0.01 *
                             tf.random_normal([grid_feat_dim, grid_emb_size]))
        b_grid = tf.Variable(tf.zeros([grid_emb_size]))
        feat_batch_proj = tf.tensordot(
            grid_feat_batch, W_grid,
            [[2], [0]]) + b_grid  # B x (grid_size * grid_size) x grid_emb_size
        feat_batch_proj = tf.nn.dropout(feat_batch_proj,
                                        keep_prob=dropout_keep_rate)

        # apply a Grid LSTM to the image features.  The SAME cell is reused
        # for all four scan directions (weights shared, scopes rnn0..rnn3).
        grid_lstm_cell = grid_rnn.Grid2LSTMCell(grid_emb_size,
                                                use_peepholes=True,
                                                output_is_tuple=True,
                                                state_is_tuple=True)
        # top_left_to_bottom_right:
        grid_lstm_outputs_top_left_to_bottom_right, _ = tf.nn.dynamic_rnn(
            grid_lstm_cell,
            feat_batch_proj,
            sequence_length=grid_size * grid_size *
            tf.ones([batch_size], dtype=tf.int32),
            dtype=tf.float32,
            scope='rnn0')

        temp = tf.reshape(feat_batch_proj,
                          [-1, grid_size, grid_size, grid_emb_size
                           ])  # B x grid_size x grid_size x grid_emb_size
        # top_right_to_bottom_left:
        feat_batch_proj_rev1 = tf.reverse(temp, axis=[1])
        feat_batch_proj_rev1 = tf.reshape(
            feat_batch_proj_rev1, [-1, grid_size * grid_size, grid_emb_size])
        grid_lstm_outputs_top_right_to_bottom_left, _ = tf.nn.dynamic_rnn(
            grid_lstm_cell,
            feat_batch_proj_rev1,
            sequence_length=grid_size * grid_size *
            tf.ones([batch_size], dtype=tf.int32),
            dtype=tf.float32,
            scope='rnn1')
        # bottom_left_to_top_right:
        feat_batch_proj_rev2 = tf.reverse(temp, axis=[2])
        feat_batch_proj_rev2 = tf.reshape(
            feat_batch_proj_rev2, [-1, grid_size * grid_size, grid_emb_size])
        grid_lstm_outputs_bottom_left_to_top_right, _ = tf.nn.dynamic_rnn(
            grid_lstm_cell,
            feat_batch_proj_rev2,
            sequence_length=grid_size * grid_size *
            tf.ones([batch_size], dtype=tf.int32),
            dtype=tf.float32,
            scope='rnn2')
        # bottom_right_to_top_left:
        feat_batch_proj_rev3 = tf.reverse(temp, axis=[1, 2])
        feat_batch_proj_rev3 = tf.reshape(
            feat_batch_proj_rev3, [-1, grid_size * grid_size, grid_emb_size])
        grid_lstm_outputs_bottom_right_to_top_left, _ = tf.nn.dynamic_rnn(
            grid_lstm_cell,
            feat_batch_proj_rev3,
            sequence_length=grid_size * grid_size *
            tf.ones([batch_size], dtype=tf.int32),
            dtype=tf.float32,
            scope='rnn3')

        # Sum the four directional scans into one encoder representation.
        grid_lstm_outputs = grid_lstm_outputs_top_left_to_bottom_right + \
        grid_lstm_outputs_top_right_to_bottom_left + \
        grid_lstm_outputs_bottom_left_to_top_right + \
        grid_lstm_outputs_bottom_right_to_top_left

        # NOTE(review): the 512 here is hard-coded, while the second decoder
        # LSTM below is built with word_emb_size units -- this only works if
        # word_emb_size == 512; confirm and derive one from the other.
        deocder_second_layer_init_state = tf.nn.rnn_cell.LSTMStateTuple(
            c=tf.zeros([batch_size, 512], dtype=tf.float32),
            h=tf.zeros([batch_size, 512], dtype=tf.float32))

        attention_depth = 512
        if not is_train:
            # at the test time we need to tile the encoder_outputs:
            beam_width = 20
            # Grid2LSTM was built with output_is_tuple=True, so take the
            # first element of the summed output tuple as the memory.
            encoder_outputs = grid_lstm_outputs[0]
            tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
                encoder_outputs, multiplier=beam_width)
            encoder_final_state = deocder_second_layer_init_state
            tiled_encoder_second_layer_final_state = tf.contrib.seq2seq.tile_batch(
                encoder_final_state, multiplier=beam_width)

            sequence_length = grid_size * grid_size * tf.ones([batch_size],
                                                              dtype=tf.int64)
            tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
                sequence_length, multiplier=beam_width)
            encoder_final_state = deocder_first_layer_init_state
            tiled_encoder_first_layer_final_state = tf.contrib.seq2seq.tile_batch(
                encoder_final_state, multiplier=beam_width)

        else:
            tiled_encoder_outputs = grid_lstm_outputs[0]
            tiled_sequence_length = None

        # define decoder two-layer LSTM (first layer gets the visual concepts
        # and the second layer gets the image features via an attention mechanism):
        cells = [lstm_decoder_first_layer]
        lstm_decoder_second_layer = tf.nn.rnn_cell.LSTMCell(word_emb_size)
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=attention_depth,
            memory=tiled_encoder_outputs,
            memory_sequence_length=tiled_sequence_length)
        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            lstm_decoder_second_layer,
            attention_mechanism,
            alignment_history=True,
            attention_layer_size=word_emb_size)
        cells.append(attention_cell)
        decoder_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
        if is_train:
            # training the decoder:
            # https://www.tensorflow.org/api_guides/python/contrib.seq2seq
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=caption_batch_dense_emb,
                sequence_length=tf.shape(caption_batch_dense_emb)[1] *
                tf.ones([batch_size], dtype=tf.int32),
                time_major=False)

            decoder_initial_state2 = attention_cell.zero_state(
                dtype=tf.float32, batch_size=batch_size * 1)
            decoder_initial_state2 = decoder_initial_state2.clone(
                cell_state=deocder_second_layer_init_state)

            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=training_helper,
                initial_state=(deocder_first_layer_init_state,
                               decoder_initial_state2),
                output_layer=projection_layer)

            # NOTE(review): "AttentionWrapperState" below is a LOCAL variable
            # holding the final decoder state; it shadows the TF class name.
            training_decoder_output, AttentionWrapperState, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=training_decoder,
                impute_finished=False,
                maximum_iterations=100)
            training_logits = training_decoder_output.rnn_output

            # the sum of the negative log likelihood of the correct word at
            # each time step is chosen as the loss:
            lgt = training_logits
            # positions with target id 0 are padding and excluded from the loss
            mask = tf.cast(target_batch_dense > 0, dtype=tf.float32)
            cost = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=target_batch_dense, logits=lgt)
            cost_mask = tf.multiply(mask, cost)
            cost_mask_sum = tf.reduce_sum(cost_mask, 1)
            cross_entropy = tf.reduce_mean(cost_mask_sum)

            # the loss is minimized:
            learning_rate = hyperparameters['learning_rate']
            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                                  decay=0.9)
            train_step = optimizer.minimize(cross_entropy)
            # NOTE(review): [1][4] is assumed to index the alignment-history
            # TensorArray of the attention (second) cell inside the MultiRNN
            # state -- confirm against this TF version's field order.
            ind_tensor = tf.range(AttentionWrapperState[1][4].size())
            att_w = AttentionWrapperState[1][4].gather(indices=ind_tensor,
                                                       name=None)
            info = [att_w, lgt]
            self.__train_step = train_step
            self.__info = info
            self.__cross_entropy = cross_entropy
            self.__logits = lgt
            self.__all_att_weights = att_w
            self.__caption_batch = caption_batch
        if not is_train:
            # at the test time use beam search to generate a caption for an image:
            true_batch_size = batch_size
            decoder_initial_state = attention_cell.zero_state(
                dtype=tf.float32, batch_size=true_batch_size * beam_width)
            decoder_initial_state = decoder_initial_state.clone(
                cell_state=tiled_encoder_second_layer_final_state)
            initial_state = (tiled_encoder_first_layer_final_state,
                             decoder_initial_state)
            # start token id 0, end token id 1 (fixed vocabulary convention)
            start_tokens = tf.zeros([batch_size], dtype=tf.int32)
            end_token = tf.constant(1, dtype=tf.int32)
            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                decoder_cell,
                emb,
                start_tokens,
                end_token,
                initial_state,
                beam_width,
                output_layer=projection_layer,
                length_penalty_weight=0.0,
                coverage_penalty_weight=0.0,
                reorder_tensor_arrays=True)
            outputs, AttentionWrapperState, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, maximum_iterations=25)
            ids = outputs.predicted_ids
            self.__ids = ids
            # get the attention weights for image regions:
            # NOTE(review): [0][1][5] is assumed to reach the alignment
            # history inside the beam-search cell state -- verify.
            self.__all_att_weights = AttentionWrapperState[0][1][5]
        self.__W_emb = emb
        self.__visual_concept_batch = visual_concept_batch
    def __init__(self, features: dict, hyperparameters: dict, is_train: bool):
        """Build a TF1 graph for attention-based image captioning.

        Grid features are densely projected (then layer-normalized and
        dropped out) and serve as the attention memory for a single-layer
        LSTM decoder wrapped in Bahdanau attention.  Training minimizes the
        padding-masked per-word cross-entropy with Adam; inference runs a
        beam search of width 20 over tiled memories and states.
        """
        batch_size = hyperparameters['batch_size']
        grid_size = hyperparameters['grid_size']
        vocab_size = hyperparameters['vocab_size']
        word_emb_size = hyperparameters['word_emb_size']

        # Create the projection layer and the word embedding matrix:
        # (the transposed Dense kernel doubles as the embedding matrix)
        init = tf.constant_initializer(
            0.01 *
            numpy.random.uniform(-1, 1, size=(vocab_size, word_emb_size)))
        projection_layer = Dense(vocab_size,
                                 use_bias=True,
                                 kernel_initializer=init)
        projection_layer.build((vocab_size, word_emb_size))
        emb = tf.transpose(projection_layer.trainable_weights[0])

        # Retrieve information from hyperparameters and a batch of tf records:
        if is_train:
            grid_feat_batch, caption_batch, target_batch = tf.train.shuffle_batch(
                [
                    features['grid_feat'], features['caption'],
                    features['target']
                ],
                batch_size=batch_size,
                capacity=20000,
                min_after_dequeue=200)
            dropout_keep_rate = hyperparameters['dropout_keep_rate']

            # The caption_batch and target_batch are sparse matrices;
            # we need to convert them to dense matrices (padding id 0):
            caption_batch_dense = tf.sparse_tensor_to_dense(
                sp_input=caption_batch, default_value=0, validate_indices=True)
            target_batch_dense = tf.sparse_tensor_to_dense(
                sp_input=target_batch, default_value=0, validate_indices=True)

            # embedding the ids of the captions:
            caption_batch_dense_emb = tf.nn.embedding_lookup(
                emb, caption_batch_dense)  # B x max_len x word_emb_size
        else:
            # at the test time we do not have a caption, only image features
            grid_feat_batch = features['grid_feat']
            dropout_keep_rate = 1.0

        grid_emb_size = hyperparameters['grid_emb_size']
        grid_feat_dim = hyperparameters['grid_feat_dim']

        self.__grid_feat_batch = grid_feat_batch  # B x (grid_size * grid_size * 1024)
        grid_feat_batch = tf.reshape(
            grid_feat_batch, [-1, grid_size * grid_size, grid_feat_dim
                              ])  # B x (grid_size * grid_size) x grid_feat_dim

        # project image region features into a space of dimension word_emb_size
        # using a densely-connected layer:
        W_grid = tf.Variable(0.01 *
                             tf.random_normal([grid_feat_dim, grid_emb_size]))
        b_grid = tf.Variable(tf.zeros([grid_emb_size]))
        feat_batch_proj = tf.tensordot(
            grid_feat_batch, W_grid,
            [[2], [0]]) + b_grid  # B x (grid_size * grid_size) x grid_emb_size
        # Adds a Layer Normalization layer:
        feat_batch_proj = tf.contrib.layers.layer_norm(feat_batch_proj)
        feat_batch_proj = tf.nn.dropout(feat_batch_proj,
                                        keep_prob=dropout_keep_rate)

        # NOTE(review): the 512 here is hard-coded, while the decoder LSTM
        # below uses word_emb_size units -- this only works if
        # word_emb_size == 512; confirm and derive one from the other.
        init_st = tf.nn.rnn_cell.LSTMStateTuple(c=tf.zeros([batch_size, 512],
                                                           dtype=tf.float32),
                                                h=tf.zeros([batch_size, 512],
                                                           dtype=tf.float32))
        attention_depth = word_emb_size
        if not is_train:
            # at the test time we need to tile the encoder_outputs:
            beam_width = 20
            encoder_outputs = feat_batch_proj
            tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
                encoder_outputs, multiplier=beam_width)

            sequence_length = grid_size * grid_size * tf.ones([batch_size],
                                                              dtype=tf.int64)
            tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
                sequence_length, multiplier=beam_width)

            tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
                init_st, multiplier=beam_width)
        else:
            tiled_encoder_outputs = feat_batch_proj
            tiled_sequence_length = None

        # define decoder LSTM with an attention mechanism (Bahdanau):
        lstm = tf.nn.rnn_cell.LSTMCell(word_emb_size)
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=attention_depth,
            memory=tiled_encoder_outputs,
            memory_sequence_length=tiled_sequence_length)
        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            lstm,
            attention_mechanism,
            alignment_history=True,
            attention_layer_size=word_emb_size)
        attention_cell = tf.contrib.rnn.DropoutWrapper(
            attention_cell, output_keep_prob=dropout_keep_rate)
        if is_train:
            # training the decoder:
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=caption_batch_dense_emb,
                sequence_length=tf.shape(caption_batch_dense_emb)[1] *
                tf.ones([batch_size], dtype=tf.int32),
                time_major=False)

            decoder_initial_state2 = attention_cell.zero_state(
                dtype=tf.float32, batch_size=batch_size * 1)
            decoder_initial_state2 = decoder_initial_state2.clone(
                cell_state=init_st)

            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=attention_cell,
                helper=training_helper,
                initial_state=decoder_initial_state2,
                output_layer=projection_layer)

            # NOTE(review): "AttentionWrapperState" below is a LOCAL variable
            # holding the final decoder state; it shadows the TF class name.
            training_decoder_output, AttentionWrapperState, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=training_decoder,
                impute_finished=False,
                maximum_iterations=100)
            training_logits = training_decoder_output.rnn_output

            # the sum of the negative log likelihood of the correct word at
            # each time step is chosen as the loss:
            lgt = training_logits
            # positions with target id 0 are padding and excluded from the loss
            mask = tf.cast(target_batch_dense > 0, dtype=tf.float32)
            cost = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=target_batch_dense, logits=lgt)
            cost_mask = tf.multiply(mask, cost)
            cost_mask_sum = tf.reduce_sum(cost_mask, 1)
            cross_entropy = tf.reduce_mean(cost_mask_sum)

            # the loss is minimized
            learning_rate = hyperparameters['learning_rate']
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            train_step = optimizer.minimize(cross_entropy)
            # NOTE(review): index [4] is assumed to be the alignment-history
            # TensorArray of the AttentionWrapper state -- confirm against
            # this TF version's field order.
            ind_tensor = tf.range(AttentionWrapperState[4].size())

            # get the attention weights for image regions
            att_w = AttentionWrapperState[4].gather(indices=ind_tensor,
                                                    name=None)

            info = [lgt]
            self.__train_step = train_step
            self.__info = info
            self.__cross_entropy = cross_entropy
            self.__logits = lgt
            self.__all_att_weights = att_w
            self.__caption_batch = caption_batch
        if not is_train:
            # at the test time use beam search to generate a caption for an image:
            true_batch_size = batch_size
            decoder_initial_state = attention_cell.zero_state(
                dtype=tf.float32, batch_size=true_batch_size * beam_width)
            decoder_initial_state = decoder_initial_state.clone(
                cell_state=tiled_encoder_final_state)
            initial_state = decoder_initial_state
            # start token id 0, end token id 1 (fixed vocabulary convention)
            start_tokens = tf.zeros([batch_size], dtype=tf.int32)
            end_token = tf.constant(1, dtype=tf.int32)
            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                attention_cell,
                emb,
                start_tokens,
                end_token,
                initial_state,
                beam_width,
                output_layer=projection_layer,
                length_penalty_weight=0.0,
                coverage_penalty_weight=0.0,
                reorder_tensor_arrays=True)
            outputs, AttentionWrapperState, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, maximum_iterations=25)
            ids = outputs.predicted_ids
            self.__ids = ids
            # NOTE(review): [0][5] is assumed to reach the alignment history
            # inside the beam-search cell state -- verify.
            self.__all_att_weights = AttentionWrapperState[0][5]
        self.__W_emb = emb
    def __init__(self, features: dict, hyperparameters: dict, is_train: bool):
        """Build a show-and-tell style captioning graph.

        The image feature vector is projected into the word-embedding space
        and run once through the decoder LSTM; the resulting state seeds
        caption generation.  Training minimizes the padding-masked per-word
        cross-entropy with RMSProp; at test time a beam search of width
        BEAM_WIDTH produces caption ids and their scores.
        """
        batch_size = hyperparameters['batch_size']
        feat_dim = hyperparameters['feat_dim']
        vocab_size = hyperparameters['vocab_size']
        word_emb_size = hyperparameters['word_emb_size']
        model = hyperparameters['model']

        # The Dense output projection doubles as the word-embedding matrix:
        # its transposed kernel maps word ids to embedding vectors.
        emb_init = tf.constant_initializer(
            0.01 *
            numpy.random.uniform(-1, 1, size=(vocab_size, word_emb_size)))
        output_projection = Dense(vocab_size,
                                  use_bias=True,
                                  kernel_initializer=emb_init,
                                  activation=None,
                                  name='emb_matrix')
        output_projection.build((vocab_size, word_emb_size))
        word_embeddings = tf.transpose(output_projection.trainable_weights[0])

        if is_train:
            learning_rate = hyperparameters['learning_rate']
            dropout_keep_rate = hyperparameters['dropout_keep_rate']
            (caption_batch, target_batch, _data_type, _im_id,
             image_feats) = tf.train.shuffle_batch(
                 [features['caption'], features['target'],
                  features['data_type'], features['im_id'],
                  features['feat']],
                 batch_size=batch_size,
                 capacity=2000,
                 min_after_dequeue=200)
            # Captions and targets arrive as sparse tensors; densify them
            # (padding id is 0).
            dense_captions = tf.sparse_tensor_to_dense(
                sp_input=caption_batch,
                default_value=0,
                validate_indices=True,
                name=None)  # B x max_len
            dense_targets = tf.sparse_tensor_to_dense(
                sp_input=target_batch,
                default_value=0,
                validate_indices=True,
                name=None)  # B x max_len
            # Embed the caption word ids.
            embedded_captions = tf.nn.embedding_lookup(
                word_embeddings, dense_captions)  # B x max_len x word_emb_size
        else:
            # Test time: no captions, only image features.
            dropout_keep_rate = 1.0
            image_feats = features['feat']

        # Densely project the image features into the embedding space.
        img_proj_w = tf.Variable(
            0.01 * tf.random_normal([feat_dim, word_emb_size]))
        img_proj_b = tf.Variable(tf.zeros([word_emb_size]))
        projected_feats = tf.tensordot(
            image_feats, img_proj_w, [[1], [0]]) + img_proj_b  # B x word_emb_size
        projected_feats = tf.reshape(
            projected_feats, [-1, 1, word_emb_size])  # B x 1 x word_emb_size
        projected_feats = tf.nn.dropout(projected_feats,
                                        keep_prob=dropout_keep_rate)

        # "Encode" the image: one step through the decoder cell yields the
        # decoder's initial state.
        if model == 'icg':
            lstm = tf.nn.rnn_cell.LSTMCell(num_units=word_emb_size)
        if model == 'icg_deep':
            first_cell = tf.nn.rnn_cell.LSTMCell(num_units=word_emb_size)
            second_cell = tf.nn.rnn_cell.LSTMCell(num_units=word_emb_size)
            lstm = tf.contrib.rnn.MultiRNNCell([first_cell, second_cell],
                                               state_is_tuple=True)
        _, encoder_final_state = tf.nn.dynamic_rnn(lstm,
                                                   projected_feats,
                                                   dtype=tf.float32)
        if is_train:
            # Teacher-forced decoding over the ground-truth captions.
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=embedded_captions,
                sequence_length=tf.shape(embedded_captions)[1] *
                tf.ones([batch_size], dtype=tf.int32),
                time_major=False)
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=lstm,
                helper=helper,
                initial_state=encoder_final_state,
                output_layer=output_projection)

            decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=train_decoder,
                impute_finished=False,
                maximum_iterations=100)
            logits = decoder_output.rnn_output
            probs = tf.nn.softmax(logits)  # B x max_len x vocab_size

            # Loss: negative log-likelihood of the correct word at every
            # non-padding (target id > 0) time step.
            word_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=dense_targets, logits=logits)
            padding_mask = tf.cast(dense_targets > 0, dtype=tf.float32)
            masked_losses = tf.multiply(padding_mask, word_losses)
            per_caption_loss = tf.reduce_sum(masked_losses, 1)  # B x 1
            cross_entropy = tf.reduce_mean(per_caption_loss)  # 1

            # Minimize the loss with RMSProp.
            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                                  decay=0.9)
            self.__train_step = optimizer.minimize(cross_entropy)
            self.__cross_entropy = cross_entropy
            self.__logits = logits
            self.__probs = probs
            self.__caption_batch = caption_batch
        else:
            # Test time: beam-search a caption from the tiled encoder state.
            tiled_state = tf.contrib.seq2seq.tile_batch(
                encoder_final_state, multiplier=BEAM_WIDTH)
            # Start token id 0, end token id 1 (fixed vocabulary convention).
            start_tokens = tf.zeros([tf.shape(image_feats)[0]],
                                    dtype=tf.int32)
            end_token = tf.constant(1, dtype=tf.int32)
            beam_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                lstm,
                word_embeddings,
                start_tokens,
                end_token,
                tiled_state,
                BEAM_WIDTH,
                output_layer=output_projection,
                length_penalty_weight=0.0,
                coverage_penalty_weight=0.0,
                reorder_tensor_arrays=True)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                beam_decoder, maximum_iterations=25)
            self.__ids = outputs.predicted_ids
            # outputs[1] is the raw BeamSearchDecoderOutput carrying scores.
            self.__scores = outputs[1].scores
        self.__feat_batch = image_feats
        self.__W_emb = word_embeddings
        self.__info = encoder_final_state
# Esempio n. 5 (Example 5)
# 0
    def __init__(self,
                 config,
                 batch_size,
                 decoder_input,
                 latent_variables,
                 embedding,
                 output_len,
                 vocab_size,
                 go_idx,
                 eos_idx,
                 is_training=True,
                 ru=False):
        """Build an RNN decoder conditioned on latent variables.

        A two-layer cell stack (cell type per config.RNN_CELL: 'lnlstm',
        'lstm' or 'gru'; dropout-wrapped during training) is optionally
        wrapped in a LenControlWrapper (when config.LEN_EMB_SIZE > 0) and
        then in an AlignmentWrapper that injects the latent variables.
        Training decodes with scheduled-sampling teacher forcing; otherwise
        with greedy embedding decoding from go_idx to eos_idx.  Also builds
        an auxiliary bag-of-words head over the latent variables.

        Args:
            config: hyperparameter object (RNN_CELL, DEC_RNN_SIZE,
                DROPOUT_KEEP, LEN_EMB_SIZE, NUM_LEN_EMB, SAMP_PROB,
                BOW_SIZE).
            batch_size: batch size (int or scalar tensor).
            decoder_input: word ids fed during teacher forcing.
            latent_variables: conditioning vectors for the decoder.
            embedding: word-embedding matrix used for input lookup.
            output_len: per-example decode lengths.
            vocab_size: output vocabulary size.
            go_idx: start-of-sequence token id.
            eos_idx: end-of-sequence token id.
            is_training: enables dropout and scheduled sampling.
            ru: unused in this constructor -- TODO confirm it can be removed.
        """
        self.config = config
        with tf.name_scope("decoder_input"):
            self.batch_size = batch_size
            self.decoder_input = decoder_input
            self.latent_variables = latent_variables
            self.embedding = embedding
            self.output_len = output_len
            self.vocab_size = vocab_size
            self.go_idx = go_idx
            self.eos_idx = eos_idx
            self.is_training = is_training

        with tf.variable_scope("Length_Control"):
            # Embeddings for the remaining-length signal, only when length
            # control is enabled.
            if self.config.LEN_EMB_SIZE > 0:
                self.len_embeddings = tf.get_variable(
                    name="len_embeddings",
                    shape=(self.config.NUM_LEN_EMB, self.config.LEN_EMB_SIZE),
                    dtype=tf.float32,
                    initializer=tf.random_normal_initializer(stddev=0.1))

        def create_cell():
            # Build one decoder cell of the configured type; unsupported
            # types are logged (and fall through with `cell` unbound).
            if self.config.RNN_CELL == 'lnlstm':
                cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                    self.config.DEC_RNN_SIZE)
            elif self.config.RNN_CELL == 'lstm':
                cell = tf.contrib.rnn.BasicLSTMCell(self.config.DEC_RNN_SIZE)
            elif self.config.RNN_CELL == 'gru':
                cell = tf.contrib.rnn.GRUCell(self.config.DEC_RNN_SIZE)
            else:
                logger.error('rnn_cell {} not supported'.format(
                    self.config.RNN_CELL))
            if self.is_training:
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    cell, output_keep_prob=self.config.DROPOUT_KEEP)
            return cell

        cell = tf.nn.rnn_cell.MultiRNNCell([create_cell() for _ in range(2)])

        projection_layer = Dense(self.vocab_size)
        projection_layer.build(self.config.DEC_RNN_SIZE)
        # NOTE(review): beam ids are computed from the bare stacked cell,
        # BEFORE the Len/Alignment wrappers are applied below -- confirm
        # this is intended.
        self.beam_ids = self.get_beam_ids(cell, projection_layer)

        # Wrap the cell: optional length control, then latent-variable
        # alignment.  Each wrapper captures the zero state of the cell it
        # wraps, so the order of these statements matters.
        if self.config.LEN_EMB_SIZE > 0:
            initial_state = cell.zero_state(self.batch_size, dtype=tf.float32)
            cell = LenControlWrapper(cell,
                                     self.output_len,
                                     self.len_embeddings,
                                     initial_cell_state=initial_state)
        initial_state = cell.zero_state(self.batch_size, dtype=tf.float32)
        cell = AlignmentWrapper(cell,
                                latent_variables,
                                initial_cell_state=initial_state)
        initial_state = cell.zero_state(self.batch_size, dtype=tf.float32)

        if self.is_training:
            decoder_emb_inputs = tf.nn.embedding_lookup(
                self.embedding, self.decoder_input)
            helper = seq2seq.ScheduledEmbeddingTrainingHelper(
                decoder_emb_inputs, self.output_len, self.embedding,
                self.config.SAMP_PROB)
        else:
            helper = seq2seq.GreedyEmbeddingHelper(self.embedding,
                                                   self.go_input(),
                                                   self.eos_idx)

        # output_layer=None: raw rnn_output is exposed and the projection
        # kernel/bias are published below, so the projection is presumably
        # applied by the caller (e.g. for a sampled softmax) -- TODO confirm.
        decoder = seq2seq.BasicDecoder(cell,
                                       helper,
                                       initial_state=initial_state,
                                       output_layer=None)
        outputs, _, seq_len = seq2seq.dynamic_decode(
            decoder, maximum_iterations=tf.reduce_max(self.output_len))
        self.rnn_output = outputs.rnn_output
        self.proj_weights = projection_layer.kernel
        self.proj_bias = projection_layer.bias

        # Auxiliary bag-of-words objective head over the latent variables.
        bow_h = tf.layers.dense(self.latent_variables,
                                self.config.BOW_SIZE,
                                activation=tf.tanh)
        if self.is_training:
            bow_h = tf.nn.dropout(bow_h, self.config.DROPOUT_KEEP)

        self.bow_logits = tf.layers.dense(bow_h,
                                          self.vocab_size,
                                          name="bow_logits")
# Esempio n. 6 (Example 6)
# 0
def get_model(config, embeddings=None, num_words=None, stitch_inputs=None):
    """Build a TF1 RNN language-model graph and return its input/output dicts.

    Args:
        config: dict of hyperparameters. Keys read here: 'embedding_size',
            'rnn_size', 'num_layers', 'learning_rate', and optional
            'rnn_cell', 'embedding_trainable', 'sampled_softmax',
            'swap_memory', 'optimizer', 'weight_decay', 'momentum',
            'aggregation_method', 'gradient_clipping'.
        embeddings: optional pretrained embedding matrix (array-like,
            shape [vocab, embedding_size]). When None, a fresh trainable
            matrix of shape [num_words, embedding_size] is created.
        num_words: vocabulary size; required when `embeddings` is None
            (and for the sampled-softmax class count, if available).
        stitch_inputs: optional dict with prebuilt 'x', 'y', 'seq_length'
            tensors. When given, no placeholders and no optimizer are
            created; instead per-sequence 'losses'/'perplexities' are added.

    Returns:
        (inputs, outputs) dicts. `inputs` holds placeholders/tensors plus
        keep-prob and learning-rate controls; `outputs` holds 'total_loss',
        'num_tokens', 'loss', 'perplexity', and either 'train_op' (training
        mode) or 'losses'/'perplexities' (stitch mode).
    """
    inputs = dict()
    outputs = dict()

    # Either create fresh placeholders or reuse externally supplied tensors.
    if stitch_inputs is None:
        inputs['x'] = tf.placeholder(tf.int32, shape=[None, None], name="x")
        inputs['y'] = tf.placeholder(tf.int32, shape=[None, None], name="y")
        inputs['seq_length'] = tf.placeholder(tf.int32,
                                              shape=[None],
                                              name="seq_length")
    else:
        inputs['x'] = stitch_inputs['x']
        inputs['y'] = stitch_inputs['y']
        inputs['seq_length'] = stitch_inputs['seq_length']

    # NOTE: the two branches use different TF variable names ("embedding"
    # vs "embeddings"); kept as-is for checkpoint compatibility.
    if embeddings is None:
        logging.info('initialize embeddings')
        embeddings = tf.get_variable(
            name="embedding",
            shape=[num_words, config['embedding_size']],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.1),
            trainable=True)
    else:
        logging.info('use pretrained embeddings')
        logging.info('embeddings trainable: {}'.format(
            config.get('embedding_trainable', False)))
        embeddings = tf.get_variable(
            "embeddings",
            shape=embeddings.shape,
            initializer=tf.constant_initializer(embeddings),
            trainable=config.get('embedding_trainable', False))

    # Dropout keep-probs default to 1.0 (no dropout) for evaluation;
    # training feeds override them.
    inputs['input_keep_prob'] = tf.placeholder_with_default(
        tf.constant(1, dtype=tf.float32), shape=[], name="input_keep_prob")
    inputs['output_keep_prob'] = tf.placeholder_with_default(
        tf.constant(1, dtype=tf.float32), shape=[], name="output_keep_prob")
    inputs['learning_rate'] = tf.placeholder_with_default(tf.constant(
        config['learning_rate'], dtype=tf.float32),
                                                          shape=[],
                                                          name="learning_rate")
    batch_size = tf.shape(inputs['x'])[0]

    def create_cell():
        """Build one dropout-wrapped RNN cell per config['rnn_cell']."""
        rnn_cell_type = config.get('rnn_cell', 'lnlstm')
        if rnn_cell_type == 'lstm':
            logging.info('Use LSTMBlockCell cell')
            _cell = tf.contrib.rnn.LSTMBlockCell(config['rnn_size'])
        else:
            logging.info('Use LayerNormBasicLSTMCell cell')
            _cell = tf.contrib.rnn.LayerNormBasicLSTMCell(config['rnn_size'])
        _cell = tf.nn.rnn_cell.DropoutWrapper(
            _cell,
            input_keep_prob=inputs['input_keep_prob'],
            output_keep_prob=inputs['output_keep_prob'])
        return _cell

    cells = [create_cell() for _ in range(config['num_layers'])]
    cell = tf.nn.rnn_cell.MultiRNNCell(cells)

    # Teacher-forced decoding over the input sequence (language modeling).
    x_embedded = tf.nn.embedding_lookup(embeddings, inputs['x'])
    helper = seq2seq.TrainingHelper(x_embedded, inputs['seq_length'])

    projection_layer = Dense(embeddings.shape[0],
                             name='projection_layer',
                             use_bias=True,
                             dtype=tf.float32)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    mask = sequence_mask(inputs['seq_length'], dtype=tf.float32)

    decoder = seq2seq.BasicDecoder(cell, helper, initial_state=initial_state)
    decode_output, _, _ = seq2seq.dynamic_decode(decoder,
                                                 impute_finished=True,
                                                 swap_memory=config.get(
                                                     'swap_memory', False))

    if config.get('sampled_softmax', 0) > 0:
        # Build the projection variables without applying the layer; the
        # sampled loss consumes the raw rnn_output plus kernel/bias directly.
        projection_layer.build(input_shape=decode_output.rnn_output.shape)

        # Fall back to the embedding matrix's row count when the caller
        # supplied pretrained embeddings without num_words.
        num_classes = (num_words if num_words is not None
                       else int(embeddings.shape[0]))

        def _sampled_loss(labels, logits):
            # `logits` here is actually the decoder's hidden output; the
            # sampled-softmax op performs the (sampled) projection itself.
            return tf.nn.sampled_softmax_loss(
                tf.transpose(projection_layer.kernel),
                projection_layer.bias,
                tf.expand_dims(labels, -1),
                logits,
                num_sampled=config['sampled_softmax'],
                num_classes=num_classes)

        softmax_loss_function = _sampled_loss
        logits_input = decode_output.rnn_output
    else:
        softmax_loss_function = None
        logits_input = projection_layer(decode_output.rnn_output)

    # Token-level losses; averaging is done manually below so that the
    # perplexity normalizes by the true (masked) token count.
    losses = seq2seq.sequence_loss(logits_input,
                                   inputs['y'],
                                   mask,
                                   softmax_loss_function=softmax_loss_function,
                                   average_across_batch=False,
                                   average_across_timesteps=False)

    outputs['total_loss'] = tf.reduce_sum(losses)
    outputs['num_tokens'] = tf.reduce_sum(mask)

    outputs['loss'] = outputs['total_loss'] / outputs['num_tokens']
    outputs['perplexity'] = tf.exp(outputs['loss'], name='perplexity')

    if stitch_inputs is not None:
        # Per-sequence losses/perplexities for downstream scoring.
        losses = seq2seq.sequence_loss(logits_input,
                                       inputs['y'],
                                       mask,
                                       average_across_batch=False)
        outputs['losses'] = tf.identity(losses, name='losses')
        outputs['perplexities'] = tf.exp(losses, name='perplexities')

    if stitch_inputs is None:
        with tf.variable_scope('Optimizer'):
            optimizer_name = config.get('optimizer', 'sgd')
            if optimizer_name == 'adam':
                logging.info('use adam optimizer')
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=inputs['learning_rate'])
            else:
                logging.info('use sgd optimizer')
                optimizer = tf.contrib.opt.MomentumWOptimizer(
                    weight_decay=config['weight_decay'],
                    learning_rate=inputs['learning_rate'],
                    momentum=config['momentum'])
            if config.get('aggregation_method', 'default') == 'experimental':
                logging.info('use gradient aggregation method: experimental')
                gradient_var_pairs = optimizer.compute_gradients(
                    outputs['total_loss'],
                    var_list=tf.trainable_variables(),
                    aggregation_method=tf.AggregationMethod.
                    EXPERIMENTAL_ACCUMULATE_N)
            else:
                logging.info('use gradient aggregation method: default')
                gradient_var_pairs = optimizer.compute_gradients(
                    outputs['total_loss'], var_list=tf.trainable_variables())
            # Drop variables with no gradient, then clip by global norm.
            # (renamed from `vars`, which shadowed the builtin)
            grad_vars = [x[1] for x in gradient_var_pairs if x[0] is not None]
            gradients = [x[0] for x in gradient_var_pairs if x[0] is not None]
            gc = config.get('gradient_clipping', 120.0)
            gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=gc)
            outputs['train_op'] = optimizer.apply_gradients(
                zip(gradients, grad_vars))

    return inputs, outputs