Beispiel #1
0
 def Grid2LSTM(x):
     """Run a bidirectional Grid2LSTM over ``x`` and join both directions.

     Two identically configured ``grid_rnn.Grid2LSTMCell`` instances (one per
     direction) are each wrapped in ``rnn.DropoutWrapper`` and passed to
     ``tf.nn.bidirectional_dynamic_rnn``. The forward/backward outputs are
     concatenated along axis 2.

     ``n_hidden``, ``self``, ``rnn`` and ``grid_rnn`` are not defined in this
     function; they are resolved from the surrounding scope.

     :param x: inputs handed to ``tf.nn.bidirectional_dynamic_rnn``
         (presumably batch-major ``(batch, time, features)`` -- TODO confirm).
     :return: the (forward, backward) outputs concatenated along axis 2.
     """

     def _build_direction_cell():
         # One tied, peephole Grid2LSTM cell with dropout on its outputs.
         grid_cell = grid_rnn.Grid2LSTMCell(n_hidden,
                                            forget_bias=1.0,
                                            use_peepholes=True,
                                            state_is_tuple=True,
                                            tied=True)
         return rnn.DropoutWrapper(grid_cell,
                                   output_keep_prob=self.keep_prob2)

     # The forward wrapper is built before the backward one, as before.
     forward_cell = _build_direction_cell()
     backward_cell = _build_direction_cell()

     direction_outputs, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,
                                                            backward_cell,
                                                            x,
                                                            dtype=tf.float32)
     return tf.concat(direction_outputs, 2)
def LSTM_Model(input_data, config):
    """Build a three-layer stacked Grid2LSTM network and project its last output.

    The input is transposed with permutation ``[1, 0, 2]``, flattened into rows
    of ``config.features`` values, passed through a sigmoid-activated affine
    layer (``config.W['hidden']`` / ``config.biases['hidden']``) and split into
    ``config.timesteps`` pieces along axis 0. Those pieces feed a static RNN
    made of three ``Grid2LSTMCell`` layers; the last RNN output is projected
    with ``config.W['output']`` and ``config.biases['output']``.

    :param input_data: input tensor (presumably ``(batch, timesteps,
        features)`` -- TODO confirm against the caller).
    :param config: object exposing ``features``, ``timesteps``,
        ``hidden_nums``, ``hidden_two`` and the dicts ``W`` and ``biases`` with
        ``'hidden'`` and ``'output'`` entries.
    :return: ``matmul(last_rnn_output, W['output']) + biases['output']``.
    """
    # Input pipeline: reorder axes, flatten, apply the hidden layer, split.
    reordered = tf.transpose(input_data, [1, 0, 2])
    flat_rows = tf.reshape(reordered, [-1, config.features])
    hidden_linear = tf.matmul(flat_rows,
                              config.W['hidden']) + config.biases['hidden']
    hidden_activated = tf.nn.sigmoid(hidden_linear)
    step_inputs = tf.split(hidden_activated, config.timesteps, 0)

    # Three stacked Grid2LSTM layers: hidden_nums, then hidden_two twice.
    layer_widths = (config.hidden_nums, config.hidden_two, config.hidden_two)
    grid_layers = [
        grid_rnn.Grid2LSTMCell(num_units=width,
                               forget_bias=1.0,
                               state_is_tuple=True)
        for width in layer_widths
    ]
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(cells=grid_layers)

    step_outputs, _ = tf.nn.static_rnn(cell=stacked_cell,
                                       inputs=step_inputs,
                                       dtype=tf.float32)

    # Only the final time step is projected to the output space.
    final_step = step_outputs[-1]
    return tf.matmul(final_step, config.W['output']) + config.biases['output']
    def __init__(self, features: dict, hyperparameters: dict, is_train: bool):
        """Build the captioning graph: Grid LSTM encoder plus attention decoder.

        Args:
            features: dict holding the tensors ``'grid_feat'`` and
                ``'visual_concept'`` and, when training, ``'caption'`` and
                ``'target'``.
            hyperparameters: dict providing ``'batch_size'``, ``'grid_size'``,
                ``'vocab_size'``, ``'word_emb_size'``, ``'grid_emb_size'`` and
                ``'grid_feat_dim'``; training additionally reads
                ``'dropout_keep_rate'`` and ``'learning_rate'``.
            is_train: when true, builds the training graph (shuffled batches,
                ``TrainingHelper`` decoding, RMSProp step); otherwise builds
                the beam-search decoding graph.
        """
        batch_size = hyperparameters['batch_size']
        grid_size = hyperparameters['grid_size']
        vocab_size = hyperparameters['vocab_size']
        word_emb_size = hyperparameters['word_emb_size']
        grid_emb_size = hyperparameters['grid_emb_size']
        grid_feat_dim = hyperparameters['grid_feat_dim']

        # Create the projection layer and the word embedding matrix. The
        # embedding is the transpose of the projection kernel, so both share
        # the same weights.
        init = tf.constant_initializer(
            0.01 *
            numpy.random.uniform(-1, 1, size=(vocab_size, word_emb_size)))
        projection_layer = Dense(vocab_size,
                                 use_bias=True,
                                 kernel_initializer=init)
        projection_layer.build((vocab_size, word_emb_size))
        emb = tf.transpose(projection_layer.trainable_weights[0]
                           )  # word_emb_size x vocab_size

        # Retrieve information from hyperparameters and a batch of tf records:
        if is_train:
            dropout_keep_rate = hyperparameters['dropout_keep_rate']
            grid_feat_batch, visual_concept_batch, caption_batch, target_batch = tf.train.shuffle_batch(
                [
                    features['grid_feat'], features['visual_concept'],
                    features['caption'], features['target']
                ],
                batch_size=batch_size,
                capacity=20000,
                min_after_dequeue=200)

            # The caption_batch and target_batch are sparse matrices;
            # we need to convert them to dense matrices:
            caption_batch_dense = tf.sparse_tensor_to_dense(
                sp_input=caption_batch, default_value=0)  # B x max_len
            target_batch_dense = tf.sparse_tensor_to_dense(
                sp_input=target_batch, default_value=0)  # B x max_len

            # embedding the ids of the captions:
            caption_batch_dense_emb = tf.nn.embedding_lookup(
                emb, caption_batch_dense)  # B x max_len x word_emb_size
        else:
            # at the test time we do not have a caption, only image features
            # and visual concepts; dropout is disabled (keep rate 1.0)
            grid_feat_batch = features['grid_feat']
            visual_concept_batch = features['visual_concept']
            dropout_keep_rate = 1.0

        lstm_decoder_first_layer = tf.nn.rnn_cell.LSTMCell(
            num_units=word_emb_size)

        # Project the visual concepts to word_emb_size and run them through
        # the first decoder layer as a length-1 sequence; the resulting final
        # state becomes that layer's initial state for decoding.
        W_visual_concept = tf.Variable(
            0.01 * tf.random_normal([VISUAL_CONCEPT_SIZE, word_emb_size]))
        b_visual_concept = tf.Variable(tf.zeros([word_emb_size]))
        visual_concept_proj = tf.tensordot(
            visual_concept_batch, W_visual_concept,
            [[1], [0]]) + b_visual_concept  # B x word_emb_size
        visual_concept_proj = tf.reshape(
            visual_concept_proj,
            [-1, 1, word_emb_size])  # B x 1 x word_emb_size

        _, deocder_first_layer_init_state = tf.nn.dynamic_rnn(
            lstm_decoder_first_layer, visual_concept_proj, dtype=tf.float32)
        # init_st is B x word_emb_size

        self.__grid_feat_batch = grid_feat_batch  # B x (grid_size * grid_size * grid_feat_dim)

        grid_feat_batch = tf.reshape(
            grid_feat_batch, [-1, grid_size * grid_size, grid_feat_dim
                              ])  # B x (grid_size * grid_size) x grid_feat_dim

        # project image features into a space of dimension grid_emb_size using
        # a densely-connected layer:
        W_grid = tf.Variable(0.01 *
                             tf.random_normal([grid_feat_dim, grid_emb_size]))
        b_grid = tf.Variable(tf.zeros([grid_emb_size]))
        feat_batch_proj = tf.tensordot(
            grid_feat_batch, W_grid,
            [[2], [0]]) + b_grid  # B x (grid_size * grid_size) x grid_emb_size
        feat_batch_proj = tf.nn.dropout(feat_batch_proj,
                                        keep_prob=dropout_keep_rate)

        # apply a Grid LSTM to the image features; the same cell object is run
        # four times, each time under its own variable scope ('rnn0'..'rnn3')
        grid_lstm_cell = grid_rnn.Grid2LSTMCell(grid_emb_size,
                                                use_peepholes=True,
                                                output_is_tuple=True,
                                                state_is_tuple=True)
        # top_left_to_bottom_right:
        grid_lstm_outputs_top_left_to_bottom_right, _ = tf.nn.dynamic_rnn(
            grid_lstm_cell,
            feat_batch_proj,
            sequence_length=grid_size * grid_size *
            tf.ones([batch_size], dtype=tf.int32),
            dtype=tf.float32,
            scope='rnn0')

        temp = tf.reshape(feat_batch_proj,
                          [-1, grid_size, grid_size, grid_emb_size
                           ])  # B x grid_size x grid_size x grid_emb_size
        # top_right_to_bottom_left:
        feat_batch_proj_rev1 = tf.reverse(temp, axis=[1])
        feat_batch_proj_rev1 = tf.reshape(
            feat_batch_proj_rev1, [-1, grid_size * grid_size, grid_emb_size])
        grid_lstm_outputs_top_right_to_bottom_left, _ = tf.nn.dynamic_rnn(
            grid_lstm_cell,
            feat_batch_proj_rev1,
            sequence_length=grid_size * grid_size *
            tf.ones([batch_size], dtype=tf.int32),
            dtype=tf.float32,
            scope='rnn1')
        # bottom_left_to_top_right:
        feat_batch_proj_rev2 = tf.reverse(temp, axis=[2])
        feat_batch_proj_rev2 = tf.reshape(
            feat_batch_proj_rev2, [-1, grid_size * grid_size, grid_emb_size])
        grid_lstm_outputs_bottom_left_to_top_right, _ = tf.nn.dynamic_rnn(
            grid_lstm_cell,
            feat_batch_proj_rev2,
            sequence_length=grid_size * grid_size *
            tf.ones([batch_size], dtype=tf.int32),
            dtype=tf.float32,
            scope='rnn2')
        # bottom_right_to_top_left:
        feat_batch_proj_rev3 = tf.reverse(temp, axis=[1, 2])
        feat_batch_proj_rev3 = tf.reshape(
            feat_batch_proj_rev3, [-1, grid_size * grid_size, grid_emb_size])
        grid_lstm_outputs_bottom_right_to_top_left, _ = tf.nn.dynamic_rnn(
            grid_lstm_cell,
            feat_batch_proj_rev3,
            sequence_length=grid_size * grid_size *
            tf.ones([batch_size], dtype=tf.int32),
            dtype=tf.float32,
            scope='rnn3')

        # NOTE(review): if dynamic_rnn returns Python tuples here (the cell is
        # built with output_is_tuple=True), `+` concatenates the tuples rather
        # than summing tensors, and the `[0]` used below would then select
        # only the first direction's output -- confirm this is intended.
        grid_lstm_outputs = grid_lstm_outputs_top_left_to_bottom_right + \
        grid_lstm_outputs_top_right_to_bottom_left + \
        grid_lstm_outputs_bottom_left_to_top_right + \
        grid_lstm_outputs_bottom_right_to_top_left

        # The second decoder layer starts from an all-zero state. Its width
        # has to match lstm_decoder_second_layer (an LSTMCell with
        # word_emb_size units), so it is derived from word_emb_size rather
        # than a literal 512.
        deocder_second_layer_init_state = tf.nn.rnn_cell.LSTMStateTuple(
            c=tf.zeros([batch_size, word_emb_size], dtype=tf.float32),
            h=tf.zeros([batch_size, word_emb_size], dtype=tf.float32))

        attention_depth = 512
        if not is_train:
            # at the test time we need to tile the encoder_outputs (and the
            # initial states) beam_width times for beam search:
            beam_width = 20
            encoder_outputs = grid_lstm_outputs[0]
            tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
                encoder_outputs, multiplier=beam_width)
            encoder_final_state = deocder_second_layer_init_state
            tiled_encoder_second_layer_final_state = tf.contrib.seq2seq.tile_batch(
                encoder_final_state, multiplier=beam_width)

            sequence_length = grid_size * grid_size * tf.ones([batch_size],
                                                              dtype=tf.int64)
            tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
                sequence_length, multiplier=beam_width)
            encoder_final_state = deocder_first_layer_init_state
            tiled_encoder_first_layer_final_state = tf.contrib.seq2seq.tile_batch(
                encoder_final_state, multiplier=beam_width)

        else:
            tiled_encoder_outputs = grid_lstm_outputs[0]
            tiled_sequence_length = None

        # define decoder two-layer LSTM (first layer gets the visual concepts
        # and the second layer gets the image features via an attention mechanism):
        cells = [lstm_decoder_first_layer]
        lstm_decoder_second_layer = tf.nn.rnn_cell.LSTMCell(word_emb_size)
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=attention_depth,
            memory=tiled_encoder_outputs,
            memory_sequence_length=tiled_sequence_length)
        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            lstm_decoder_second_layer,
            attention_mechanism,
            alignment_history=True,
            attention_layer_size=word_emb_size)
        cells.append(attention_cell)
        decoder_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
        if is_train:
            # training the decoder:
            # https://www.tensorflow.org/api_guides/python/contrib.seq2seq
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=caption_batch_dense_emb,
                sequence_length=tf.shape(caption_batch_dense_emb)[1] *
                tf.ones([batch_size], dtype=tf.int32),
                time_major=False)

            # Start the attention wrapper from its zero state, but with the
            # wrapped LSTM's cell state replaced by the explicit initial state.
            decoder_initial_state2 = attention_cell.zero_state(
                dtype=tf.float32, batch_size=batch_size * 1)
            decoder_initial_state2 = decoder_initial_state2.clone(
                cell_state=deocder_second_layer_init_state)

            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=training_helper,
                initial_state=(deocder_first_layer_init_state,
                               decoder_initial_state2),
                output_layer=projection_layer)

            training_decoder_output, AttentionWrapperState, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=training_decoder,
                impute_finished=False,
                maximum_iterations=100)
            training_logits = training_decoder_output.rnn_output

            # the sum of the negative log likelihood of the correct word at
            # each time step is chosen as the loss; positions whose target id
            # is 0 are masked out:
            lgt = training_logits
            mask = tf.cast(target_batch_dense > 0, dtype=tf.float32)
            cost = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=target_batch_dense, logits=lgt)
            cost_mask = tf.multiply(mask, cost)
            cost_mask_sum = tf.reduce_sum(cost_mask, 1)
            cross_entropy = tf.reduce_mean(cost_mask_sum)

            # the loss is minimized:
            learning_rate = hyperparameters['learning_rate']
            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                                  decay=0.9)
            train_step = optimizer.minimize(cross_entropy)
            # Element [1][4] of the final decoder state is gathered in full
            # (presumably the second layer's alignment history -- TODO confirm
            # against the AttentionWrapperState field order).
            ind_tensor = tf.range(AttentionWrapperState[1][4].size())
            att_w = AttentionWrapperState[1][4].gather(indices=ind_tensor,
                                                       name=None)
            info = [att_w, lgt]
            self.__train_step = train_step
            self.__info = info
            self.__cross_entropy = cross_entropy
            self.__logits = lgt
            self.__all_att_weights = att_w
            self.__caption_batch = caption_batch
        else:
            # at the test time use beam search to generate a caption for an image:
            true_batch_size = batch_size
            decoder_initial_state = attention_cell.zero_state(
                dtype=tf.float32, batch_size=true_batch_size * beam_width)
            decoder_initial_state = decoder_initial_state.clone(
                cell_state=tiled_encoder_second_layer_final_state)
            initial_state = (tiled_encoder_first_layer_final_state,
                             decoder_initial_state)
            # Decoding starts from token id 0 and stops at token id 1.
            start_tokens = tf.zeros([batch_size], dtype=tf.int32)
            end_token = tf.constant(1, dtype=tf.int32)
            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                decoder_cell,
                emb,
                start_tokens,
                end_token,
                initial_state,
                beam_width,
                output_layer=projection_layer,
                length_penalty_weight=0.0,
                coverage_penalty_weight=0.0,
                reorder_tensor_arrays=True)
            outputs, AttentionWrapperState, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, maximum_iterations=25)
            ids = outputs.predicted_ids
            self.__ids = ids
            # get the attention weights for image regions:
            self.__all_att_weights = AttentionWrapperState[0][1][5]
        self.__W_emb = emb
        self.__visual_concept_batch = visual_concept_batch
Beispiel #4
0
    def inference(self, batch_x):
        """Build the convolution + bidirectional RNN graph for a batch.

        The images are logged to an image summary, passed through four
        ``self.convLayer`` stages, rearranged into a list of per-column
        tensors and fed to ``tf.contrib.rnn.static_bidirectional_rnn``.

        Args:
            batch_x: two-element sequence ``[imageInputs, seq_len]``;
                ``seq_len`` is forwarded as the RNN ``sequence_length``.

        Returns:
            The bidirectional RNN outputs reshaped to
            ``[-1, 2 * self.hidden]``.

        Raises:
            Exception: if ``self.rnn_cell`` names an unsupported cell type, or
                if ``self.insertLastState`` is set for a cell type whose name
                does not end in ``"LSTM"``.
        """
        imageInputs, seq_len = batch_x
        tf.summary.image("images", imageInputs)
        with tf.name_scope('convLayers'):
            conv1 = self.convLayer(imageInputs,
                                   32,
                                   scopeN="l1",
                                   keep_prob=self.keep_prob,
                                   maxPool=[2, 1])
            conv2 = self.convLayer(conv1,
                                   64,
                                   scopeN="l2",
                                   keep_prob=self.keep_prob,
                                   maxPool=[2, 1])
            conv3 = self.convLayer(conv2,
                                   128,
                                   size_window=3,
                                   scopeN="l3",
                                   keep_prob=self.keep_prob,
                                   maxPool=None)
            conv4 = self.convLayer(conv3,
                                   256,
                                   size_window=2,
                                   scopeN="l4",
                                   keep_prob=self.keep_prob,
                                   maxPool=None)

        with tf.name_scope('preprocess'):
            hh, ww, chanels = conv4.get_shape().as_list()[1:4]
            assert ww == self.width, 'image does not have to become smaller in width'
            assert chanels == 256

            # Move the width axis to the front so that each image column
            # becomes one RNN time step.
            h_pool2_flat = tf.transpose(conv4, [2, 0, 1, 3])
            h_pool2_flat = tf.reshape(h_pool2_flat,
                                      [ww, self.batch_size, hh * chanels])

            # Reshape to (n_steps*batch_size, n_input)
            x = tf.reshape(h_pool2_flat, [-1, hh * chanels])
            # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
            x = tf.split(x, ww, 0)

        with tf.name_scope('BRNN'):
            if self.initializer == "graves":
                myInitializer = tf.truncated_normal_initializer(
                    mean=0., stddev=.075, seed=None, dtype=tf.float32)
            else:
                # The previous code called tf.truncated_normal(shape, ...)
                # with an undefined `shape`, which raised NameError and would
                # have produced a tensor instead of an initializer.
                myInitializer = tf.truncated_normal_initializer(
                    mean=0., stddev=0.1, seed=None, dtype=tf.float32)

            if self.rnn_cell == "LSTM":
                cell = tf.contrib.rnn.LSTMCell(self.hidden,
                                               state_is_tuple=True,
                                               initializer=myInitializer)
            elif self.rnn_cell == "BasicLSTM":
                cell = tf.contrib.rnn.BasicLSTMCell(self.hidden,
                                                    forget_bias=1.0,
                                                    state_is_tuple=True)
            elif self.rnn_cell == "GRU":
                cell = tf.contrib.rnn.GRUCell(self.hidden)
            elif self.rnn_cell == "LSTMGRID2":
                cell = grid_rnn.Grid2LSTMCell(self.hidden,
                                              use_peepholes=True,
                                              forget_bias=1.0)
            elif self.rnn_cell == "LSTMGRID":
                cell = grid_rnn.GridLSTMCell(self.hidden,
                                             use_peepholes=True,
                                             forget_bias=1.0)
            elif self.rnn_cell == "GRUGRID2":
                cell = grid_rnn.Grid2GRUCell(self.hidden)
            else:
                raise Exception("model type not supported: {}".format(
                    self.rnn_cell))
            if self.keep_prob != 1 and self.train_b:
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    cell=cell, output_keep_prob=self.keep_prob)

            # Tuple states are only used for cell types whose name ends in
            # "LSTM" ("LSTM" and "BasicLSTM").
            is_lstm = self.rnn_cell[-4:] == "LSTM"

            # NOTE(review): the same cell object is used for every layer and
            # for both directions. Depending on the TensorFlow version this
            # may share weights or be rejected -- confirm this is intended.
            stackf = tf.nn.rnn_cell.MultiRNNCell([cell] * (self.layers),
                                                 state_is_tuple=is_lstm)
            stackb = tf.nn.rnn_cell.MultiRNNCell([cell] * (self.layers),
                                                 state_is_tuple=is_lstm)

            self.reset_state_stackf = stackf.zero_state(self.batch_size,
                                                        dtype=tf.float32)

            self.reset_state_stackb = stackb.zero_state(self.batch_size,
                                                        dtype=tf.float32)
            if self.insertLastState:
                if not is_lstm:
                    raise Exception(
                        "model type not supported for insertion of last state: {}"
                        .format(self.rnn_cell))

                # One distinct placeholder per state slot. The previous
                # `[placeholder] * 2` put the SAME placeholder in both slots
                # of a layer, so the two halves of each LSTMStateTuple could
                # never be fed different values.
                self.state_stackb = [[
                    tf.placeholder(tf.float32, [self.batch_size, self.hidden])
                    for _ in range(2)
                ] for _ in range(self.layers)]
                self.state_stackf = [[
                    tf.placeholder(tf.float32, [self.batch_size, self.hidden])
                    for _ in range(2)
                ] for _ in range(self.layers)]

                self.rnn_tuple_statef = tuple([
                    tf.nn.rnn_cell.LSTMStateTuple(layer_state[0],
                                                  layer_state[1])
                    for layer_state in self.state_stackf
                ])

                self.rnn_tuple_stateb = tuple([
                    tf.nn.rnn_cell.LSTMStateTuple(layer_state[0],
                                                  layer_state[1])
                    for layer_state in self.state_stackb
                ])
                print("insertion of last state")
                #todo: try just initial states
                initial_fw = self.rnn_tuple_statef
                initial_bw = self.rnn_tuple_stateb
            else:
                print("no insertion of last state")
                initial_fw = self.reset_state_stackf
                initial_bw = self.reset_state_stackb

            # Both modes run the same bidirectional RNN; only the initial
            # states differ (fed placeholders vs. zero states).
            outputs, self.state_fw, self.state_bw = tf.contrib.rnn.static_bidirectional_rnn(
                stackf,
                stackb,
                x,
                sequence_length=seq_len,
                dtype=tf.float32,
                initial_state_fw=initial_fw,
                initial_state_bw=initial_bw)
            y_predict = tf.reshape(outputs, [-1, 2 * self.hidden])
        return y_predict