Example #1
    def _aon(self, inputs, is_train):
        """
        """
        assert inputs.get_shape()[1:] == (26, 26, 256)

        with tf.variable_scope("aon"):
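            # Shared CNN: applied to the raw features, and with reused
            # weights to the same features rotated 90 degrees.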
            hfeatures = self._shared_cnn(inputs, is_train, reuse=False)
            vfeatures = self._shared_cnn(tf.contrib.image.rotate(
                inputs, math.pi / 2),
                                         is_train,
                                         reuse=True)

            # Encode horizontal and vertical views as time-major sequences.
            hfeatures = tf.transpose(hfeatures,
                                     perm=[1, 0, 2],
                                     name='h_time_major')
            hfeatures = rnn_layer(hfeatures, None, self.rnn_size, 'hbdrnn')
            vfeatures = tf.transpose(vfeatures,
                                     perm=[1, 0, 2],
                                     name='v_time_major')
            vfeatures = rnn_layer(vfeatures, None, self.rnn_size, 'vbdrnn')

            # Stack the four reading directions (each RNN output plus its
            # time-reversed counterpart), then move batch first.
            features = (hfeatures, tf.reverse(hfeatures, axis=[0])) + \
                       (vfeatures, tf.reverse(vfeatures, axis=[0]))
            features = tf.stack(features, axis=1)            # [T, 4, N, C]
            features = tf.transpose(features, [2, 1, 0, 3])  # [N, 4, T, C]

            # Clue network: predicts per-step weights over the four directions.
            clue = self._clue_network(inputs, is_train)

        return features, clue
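For reference, the stack-and-transpose above turns two time-major sequences into one batch-major, four-direction tensor. A minimal shape sketch, with 23 time steps, batch size 2, and 512 channels assumed to match the asserts in Example #2:

    import tensorflow as tf

    h = tf.ones([23, 2, 512])  # horizontal features, time-major
    v = tf.ones([23, 2, 512])  # vertical features, time-major

    directions = (h, tf.reverse(h, axis=[0]), v, tf.reverse(v, axis=[0]))
    stacked = tf.stack(directions, axis=1)           # [23, 4, 2, 512]
    features = tf.transpose(stacked, [2, 1, 0, 3])   # [2, 4, 23, 512]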
Example #2
    def get_logits(self, image, is_train, **kwargs):
        """
        """
        image = tf.reshape(image, [-1, 100, 100, 1])

        # BCNN: basal convolutional backbone
        features = self._bcnn(image, is_train)
        assert features.get_shape()[1:] == (26, 26, 256)

        # AON: four-direction features and placement clue
        features, clue = self._aon(features, is_train)
        assert features.get_shape()[1:] == (4, 23, 512)
        assert clue.get_shape()[1:] == (4, 23, 1)

        # FG: filter gate, fuses the four directions with the clue weights
        features = tf.reduce_sum(features * clue, axis=1)
        features = tf.nn.tanh(features)
        assert features.get_shape()[1:] == (23, 512)

        # LSTM encoder and attention decoder
        features = tf.transpose(features, [1, 0, 2], name='time_major')
        features = rnn_layer(features, None, self.rnn_size, 'lstm')
        logits, weights = attention_decoder(features, kwargs['label'],
                                            len(self.out_charset),
                                            self.rnn_size, is_train,
                                            self.FLAGS.label_maxlen)

        sequence_length = None  # decoding runs a fixed label_maxlen steps

        return logits, sequence_length
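The FG step broadcasts the [batch, 4, time, 1] clue against the [batch, 4, time, channels] direction features and sums out the direction axis. A minimal sketch with shapes assumed from the asserts above (batch size 2 is arbitrary, and the uniform clue stands in for the network's output):

    import tensorflow as tf

    features = tf.ones([2, 4, 23, 512])  # four-direction features
    clue = tf.fill([2, 4, 23, 1], 0.25)  # stand-in for the clue weights
    fused = tf.nn.tanh(tf.reduce_sum(features * clue, axis=1))
    # fused.shape == (2, 23, 512), matching the assert above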
Example #3
    def get_logits(self, image, is_train, **kwargs):
        """
        """
        # Every example is assigned the full (padded) batch width.
        widths = tf.ones(tf.shape(image)[0],
                         dtype=tf.int32) * tf.shape(image)[2]
        features, sequence_length = self._convnet_layers(
            image, widths, is_train)
        features = tf.transpose(features, perm=[1, 0, 2], name='time_major')
        attention_states = rnn_layer(features,
                                     sequence_length,
                                     self.rnn_size,
                                     scope="rnn")
        logits, weights = attention_decoder(attention_states, kwargs['label'],
                                            len(self.out_charset),
                                            self.rnn_size, is_train,
                                            self.FLAGS.label_maxlen)

        return logits, sequence_length
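The widths tensor simply replicates the batch's shared (padded) width once per example, which _convnet_layers presumably downsamples into per-example sequence lengths. A tiny sketch with an assumed NHWC batch:

    import tensorflow as tf

    image = tf.ones([3, 32, 100, 1])  # batch of 3, height 32, width 100
    widths = tf.ones(tf.shape(image)[0], dtype=tf.int32) * tf.shape(image)[2]
    # widths evaluates to [100, 100, 100]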
Example #4
    def get_logits(self, image, is_train, **kwargs):
        """
        """
        # ResNet
        widths = tf.ones(tf.shape(image)[0],
                         dtype=tf.int32) * tf.shape(image)[2]
        features, sequence_length = self._convnet_layers(
            image, widths, is_train)

        # LSTM encoder
        with tf.variable_scope("rnn"):
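            # Collapse the feature-map height with an 8-tall VALID max-pool,
            # then squeeze it so the RNN sees [batch, time, channels].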
            rnn_inputs = tf.nn.max_pool(features, (1, 8, 1, 1), (1, 1, 1, 1),
                                        'VALID',
                                        data_format='NHWC')
            rnn_inputs = tf.squeeze(rnn_inputs, axis=[1])
            rnn_inputs = tf.transpose(rnn_inputs,
                                      perm=[1, 0, 2],
                                      name='time_major')
            holistic_features = rnn_layer(rnn_inputs,
                                          sequence_length,
                                          self.rnn_size,
                                          scope='holistic')
            holistic_feature = dense_layer(holistic_features[-1],
                                           self.FLAGS.rnn_size,
                                           name='holistic_projection')

        # 2D LSTM decoder: attends over the full 2D conv feature map
        logits, weights = self.twodim_attention_decoder(
            holistic_feature, features, kwargs['label'], len(self.out_charset),
            self.FLAGS.rnn_size, is_train, self.FLAGS.label_maxlen)
        logits = tf.reshape(
            logits, [-1, self.FLAGS.label_maxlen,
                     len(self.out_charset) + 1])

        sequence_length = None  # decoding runs a fixed label_maxlen steps
        self.attention_weights = tf.expand_dims(weights, axis=1)

        return logits, sequence_length
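The height-collapsing pool in the encoder assumes the ResNet leaves a feature map of height 8. A minimal sketch of that step alone (batch 2, width 25, and 512 channels are arbitrary):

    import tensorflow as tf

    features = tf.ones([2, 8, 25, 512])  # assumed NHWC conv features
    pooled = tf.nn.max_pool(features, (1, 8, 1, 1), (1, 1, 1, 1), 'VALID')
    rnn_inputs = tf.squeeze(pooled, axis=[1])         # [2, 25, 512]
    rnn_inputs = tf.transpose(rnn_inputs, [1, 0, 2])  # time-major [25, 2, 512]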