def _aon(self, inputs, is_train):
    """Arbitrary-orientation network (AON): extracts four-direction feature
    sequences and a character placement clue from the shared feature maps.
    """
    assert inputs.get_shape()[1:] == (26, 26, 256)

    with tf.variable_scope("aon"):
        # Shared CNN applied to the original and the 90°-rotated feature maps.
        hfeatures = self._shared_cnn(inputs, is_train, reuse=False)
        vfeatures = self._shared_cnn(
            tf.contrib.image.rotate(inputs, math.pi / 2), is_train, reuse=True)

        # Horizontal and vertical bidirectional RNNs over time-major inputs.
        hfeatures = tf.transpose(hfeatures, perm=[1, 0, 2], name='h_time_major')
        hfeatures = rnn_layer(hfeatures, None, self.rnn_size, 'hbdrnn')
        vfeatures = tf.transpose(vfeatures, perm=[1, 0, 2], name='v_time_major')
        vfeatures = rnn_layer(vfeatures, None, self.rnn_size, 'vbdrnn')

        # Stack the four directional sequences (horizontal, reversed horizontal,
        # vertical, reversed vertical) and move to batch-major order.
        features = (hfeatures, tf.reverse(hfeatures, axis=[0]),
                    vfeatures, tf.reverse(vfeatures, axis=[0]))
        features = tf.stack(features, axis=1)
        features = tf.transpose(features, [2, 1, 0, 3])

        # Character placement clue network.
        clue = self._clue_network(inputs, is_train)

    return features, clue
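# Illustrative sketch (not part of the model): a minimal NumPy analogue of the
# four-direction stacking above. The horizontal and vertical RNN outputs, plus
# their time-reversed copies, are stacked on a new direction axis and moved to
# batch-major order. The shapes are assumed from the asserts in get_logits.
import numpy as np

steps, batch, depth = 23, 2, 512
h_seq = np.random.randn(steps, batch, depth)   # horizontal features, time-major
v_seq = np.random.randn(steps, batch, depth)   # vertical features, time-major

stacked = np.stack([h_seq, h_seq[::-1], v_seq, v_seq[::-1]],
                   axis=1)                      # (steps, 4, batch, depth)
batch_major = np.transpose(stacked, (2, 1, 0, 3))  # (batch, 4, steps, depth)
assert batch_major.shape == (batch, 4, steps, depth)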
def get_logits(self, image, is_train, **kwargs):
    """Builds the recognition graph: BCNN -> AON -> filter gate (FG) ->
    LSTM encoder -> attention decoder, and returns the output logits.
    """
    image = tf.reshape(image, [-1, 100, 100, 1])

    # BCNN
    features = self._bcnn(image, is_train)
    assert features.get_shape()[1:] == (26, 26, 256)

    # AON
    features, clue = self._aon(features, is_train)
    assert features.get_shape()[1:] == (4, 23, 512)
    assert clue.get_shape()[1:] == (4, 23, 1)

    # FG: combine the four directional features, weighted by the clue.
    features = tf.reduce_sum(features * clue, axis=1)
    features = tf.nn.tanh(features)
    assert features.get_shape()[1:] == (23, 512)

    # LSTM encoder over the fused, time-major feature sequence.
    features = tf.transpose(features, [1, 0, 2], name='time_major')
    features = rnn_layer(features, None, self.rnn_size, 'lstm')

    # Attention decoder
    logits, weights = attention_decoder(features, kwargs['label'],
                                        len(self.out_charset), self.rnn_size,
                                        is_train, self.FLAGS.label_maxlen)
    sequence_length = None

    return logits, sequence_length
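# Illustrative sketch (not part of the model): a minimal NumPy analogue of the
# filter gate (FG) above. The clue softly weights the four directional feature
# sequences, which are summed over the direction axis and squashed with tanh.
# Shapes follow the asserts in get_logits; the direction-wise normalisation of
# the clue is an assumption about what _clue_network produces.
import numpy as np

batch, directions, steps, depth = 2, 4, 23, 512
aon_features = np.random.randn(batch, directions, steps, depth)
clue_weights = np.random.rand(batch, directions, steps, 1)
clue_weights /= clue_weights.sum(axis=1, keepdims=True)  # normalise over directions

fused = np.tanh((aon_features * clue_weights).sum(axis=1))  # weighted sum + tanh
assert fused.shape == (batch, steps, depth)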
def get_logits(self, image, is_train, **kwargs):
    """Builds the recognition graph: convnet features -> bidirectional RNN
    encoder -> attention decoder, and returns logits and sequence lengths.
    """
    # Every image is assumed to span its full width.
    widths = tf.ones(tf.shape(image)[0],
                     dtype=tf.int32) * tf.shape(image)[2]
    features, sequence_length = self._convnet_layers(image, widths, is_train)

    # RNN encoder over the width-wise feature sequence (time-major).
    features = tf.transpose(features, perm=[1, 0, 2], name='time_major')
    attention_states = rnn_layer(features, sequence_length, self.rnn_size,
                                 scope="rnn")

    # Attention decoder
    logits, weights = attention_decoder(attention_states, kwargs['label'],
                                        len(self.out_charset), self.rnn_size,
                                        is_train, self.FLAGS.label_maxlen)

    return logits, sequence_length
def get_logits(self, image, is_train, **kwargs):
    """Builds the recognition graph: ResNet backbone -> holistic LSTM encoder
    over height-pooled features -> 2D attention decoder, returning the logits.
    """
    # ResNet
    widths = tf.ones(tf.shape(image)[0],
                     dtype=tf.int32) * tf.shape(image)[2]
    features, sequence_length = self._convnet_layers(image, widths, is_train)

    # LSTM encoder
    with tf.variable_scope("rnn"):
        # Collapse the feature-map height with a max-pool, then feed the
        # resulting width-wise sequence to the holistic encoder.
        rnn_inputs = tf.nn.max_pool(features, (1, 8, 1, 1), (1, 1, 1, 1),
                                    'VALID', data_format='NHWC')
        rnn_inputs = tf.squeeze(rnn_inputs, axis=[1])
        rnn_inputs = tf.transpose(rnn_inputs, perm=[1, 0, 2],
                                  name='time_major')
        holistic_features = rnn_layer(rnn_inputs, sequence_length,
                                      self.rnn_size, scope='holistic')
        # Project the last encoder state into the holistic feature.
        holistic_feature = dense_layer(holistic_features[-1],
                                       self.FLAGS.rnn_size,
                                       name='holistic_projection')

    # 2D LSTM decoder
    logits, weights = self.twodim_attention_decoder(
        holistic_feature, features, kwargs['label'], len(self.out_charset),
        self.FLAGS.rnn_size, is_train, self.FLAGS.label_maxlen)
    logits = tf.reshape(
        logits, [-1, self.FLAGS.label_maxlen, len(self.out_charset) + 1])
    sequence_length = None
    self.attention_weights = tf.expand_dims(weights, axis=1)

    return logits, sequence_length
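# Illustrative sketch (not part of the model): a minimal NumPy analogue of the
# height pooling above, assuming the convnet feature maps are 8 rows tall so
# that the (1, 8, 1, 1) VALID max-pool collapses the height axis to 1 (which
# the tf.squeeze(axis=[1]) call implies).
import numpy as np

n, h, w, c = 2, 8, 25, 512
feature_maps = np.random.randn(n, h, w, c)

pooled = feature_maps.max(axis=1)               # (n, w, c): height collapsed
time_major = np.transpose(pooled, (1, 0, 2))    # (w, n, c) for the encoder RNN
assert time_major.shape == (w, n, c)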