def predict(self, preprocessed_inputs, true_image_shapes):
        """Predicts unpostprocessed tensors from input tensor.

    This function takes an input batch of images and runs it through the forward
    pass of the network to yield unpostprocessed predictions.

    A side effect of calling the predict method is that self._anchors is
    populated with a box_list.BoxList of anchors.  These anchors must be
    constructed before the postprocess or loss functions can be called.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] image tensor.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros.

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) preprocessed_inputs: the [batch, height, width, channels] image
          tensor.
        2) box_encodings: 4-D float tensor of shape [batch_size, num_anchors,
          box_code_dimension] containing predicted boxes.
        3) class_predictions_with_background: 3-D float tensor of shape
          [batch_size, num_anchors, num_classes+1] containing class predictions
          (logits) for each of the anchors.  Note that this tensor *includes*
          background class predictions (at class index 0).
        4) feature_maps: a list of tensors where the ith tensor has shape
          [batch, height_i, width_i, depth_i].
        5) anchors: 2-D float tensor of shape [num_anchors, 4] containing
          the generated anchors in normalized coordinates.
    """
        with tf.variable_scope(None, self._extract_features_scope,
                               [preprocessed_inputs]):
            feature_maps = self._feature_extractor.extract_features(
                preprocessed_inputs)
        feature_map_spatial_dims = self._get_feature_map_spatial_dims(
            feature_maps)
        image_shape = shape_utils.combined_static_and_dynamic_shape(
            preprocessed_inputs)
        self._anchors = box_list_ops.concatenate(
            self._anchor_generator.generate(feature_map_spatial_dims,
                                            im_height=image_shape[1],
                                            im_width=image_shape[2]))
        prediction_dict = self._box_predictor.predict(
            feature_maps, self._anchor_generator.num_anchors_per_location())
        box_encodings = tf.squeeze(tf.concat(prediction_dict['box_encodings'],
                                             axis=1),
                                   axis=2)
        class_predictions_with_background = tf.concat(
            prediction_dict['class_predictions_with_background'], axis=1)
        predictions_dict = {
            'preprocessed_inputs': preprocessed_inputs,
            'box_encodings': box_encodings,
            'class_predictions_with_background':
            class_predictions_with_background,
            'feature_maps': feature_maps,
            'anchors': self._anchors.get()
        }
        return predictions_dict
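Every example on this page calls shape_utils.combined_static_and_dynamic_shape. For reference, here is a minimal sketch of what that helper does in the TF Object Detection API: it returns a Python list whose entries are plain ints for statically known dimensions and scalar tensors for unknown ones. It is included only as context, not as part of Example #1.

import tensorflow as tf

def combined_static_and_dynamic_shape(tensor):
    """Returns a list of static (int) or dynamic (scalar tensor) dimensions."""
    static_shape = tensor.shape.as_list()
    dynamic_shape = tf.shape(tensor)
    return [static_dim if static_dim is not None else dynamic_shape[index]
            for index, static_dim in enumerate(static_shape)]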
Example #2
    def _batch_decode_oriented(self, oriented_box_encodings):
        """Decodes a batch of oriented box encodings with respect to the anchors.

    Args:
      oriented_box_encodings: A float32 tensor of shape
        [batch_size, num_anchors, box_code_size] containing box encodings.

    Returns:
      decoded_boxes: A float32 tensor of shape
        [batch_size, num_anchors, 4, 2] containing the decoded boxes.
    """
        combined_shape = shape_utils.combined_static_and_dynamic_shape(
            oriented_box_encodings)
        batch_size = combined_shape[0]
        tiled_anchor_boxes = tf.tile(tf.expand_dims(self.anchors.get(), 0),
                                     [batch_size, 1, 1])
        tiled_anchors_boxlist = box_list.BoxList(
            tf.reshape(tiled_anchor_boxes, [-1, self._box_coder.code_size]))

        decoded_oriented_boxes = self._box_coder.decode_oriented(
            tf.reshape(oriented_box_encodings,
                       [-1, self._box_coder.code_size_oriented]),
            tiled_anchors_boxlist)
        return tf.reshape(
            decoded_oriented_boxes.get_oriented(),
            tf.stack([combined_shape[0], combined_shape[1], 4, 2]))
Example #3
    def calibration_fn(class_predictions_with_background):
      """Calibrate predictions via 1-d linear interpolation.

      Prediction scores are linearly interpolated based on class-agnostic
      function approximations. Note that the 0-indexed background class may
      also be transformed.

      Args:
        class_predictions_with_background: tf.float32 tensor of shape
          [batch_size, num_anchors, num_classes + 1] containing scores on the
          interval [0,1]. This is usually produced by a sigmoid or softmax layer
          and the result of calling the `predict` method of a detection model.

      Returns:
        tf.float32 tensor of shape [batch_size, num_anchors, num_classes] if
        background class is not present (else shape is
        [batch_size, num_anchors, num_classes + 1]) on the interval [0, 1].
      """
      # Flattening Tensors and then reshaping at the end.
      flat_class_predictions_with_background = tf.reshape(
          class_predictions_with_background, shape=[-1])
      fn_x, fn_y = _function_approximation_proto_to_tf_tensors(
          calibration_config.function_approximation.x_y_pairs)
      updated_scores = _tf_linear_interp1d(
          flat_class_predictions_with_background, fn_x, fn_y)

      # Un-flatten the scores
      original_detections_shape = shape_utils.combined_static_and_dynamic_shape(
          class_predictions_with_background)
      calibrated_class_predictions_with_background = tf.reshape(
          updated_scores,
          shape=original_detections_shape,
          name='calibrate_scores')
      return calibrated_class_predictions_with_background
Example #4
def select_random_box(boxlist, default_box=None, seed=None, scope=None):
    """Selects a random bounding box from a `BoxList`.

  Args:
    boxlist: A BoxList.
    default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`,
      this default box will be returned. If None, will use a default box of
      [[-1., -1., -1., -1.]].
    seed: Random seed.
    scope: Name scope.

  Returns:
    bbox: A [1, 4] tensor with a random bounding box.
    valid: A bool tensor indicating whether a valid bounding box is returned
      (True) or whether the default box is returned (False).
  """
    with tf.name_scope(scope, 'SelectRandomBox'):
        bboxes = boxlist.get()
        combined_shape = shape_utils.combined_static_and_dynamic_shape(bboxes)
        number_of_boxes = combined_shape[0]
        default_box = default_box or tf.constant([[-1., -1., -1., -1.]])

        def select_box():
            random_index = tf.random_uniform([],
                                             maxval=number_of_boxes,
                                             dtype=tf.int32,
                                             seed=seed)
            return tf.expand_dims(bboxes[random_index],
                                  axis=0), tf.constant(True)

        return tf.cond(tf.greater_equal(number_of_boxes, 1),
                       true_fn=select_box,
                       false_fn=lambda: (default_box, tf.constant(False)))
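A minimal usage sketch for select_random_box, assuming TF 1.x and that the TF Object Detection API's box_list module is importable; the box values are illustrative.

from object_detection.core import box_list

boxes = tf.constant([[0.1, 0.1, 0.4, 0.5],
                     [0.2, 0.3, 0.9, 0.8]], dtype=tf.float32)
bbox, valid = select_random_box(box_list.BoxList(boxes), seed=0)
# bbox has shape [1, 4]; valid is True because the BoxList is non-empty.
# An empty BoxList would instead yield the default box and valid=False.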
Example #5
def nearest_neighbor_upsampling(input_tensor, scale):
    """Nearest neighbor upsampling implementation.

  Nearest neighbor upsampling function that maps an input tensor with shape
  [batch_size, height, width, channels] to
  [batch_size, height * scale, width * scale, channels]. This implementation
  only uses reshape and broadcasting to make it TPU compatible.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data.
  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].
  """
    with tf.name_scope('nearest_neighbor_upsampling'):
        (batch_size, height, width, channels
         ) = shape_utils.combined_static_and_dynamic_shape(input_tensor)
        output_tensor = tf.reshape(input_tensor, [
            batch_size, height, 1, width, 1, channels
        ]) * tf.ones([1, 1, scale, 1, scale, 1], dtype=input_tensor.dtype)
        return tf.reshape(
            output_tensor,
            [batch_size, height * scale, width * scale, channels])
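A small illustrative check of the reshape-and-broadcast trick above, assuming TF 1.x sessions and the shape_utils import used throughout this page: each input pixel is replicated into a scale x scale block.

x = tf.reshape(tf.range(4, dtype=tf.float32), [1, 2, 2, 1])  # [[0, 1], [2, 3]]
up = nearest_neighbor_upsampling(x, scale=2)
with tf.Session() as sess:
    print(sess.run(up)[0, :, :, 0])
    # [[0. 0. 1. 1.]
    #  [0. 0. 1. 1.]
    #  [2. 2. 3. 3.]
    #  [2. 2. 3. 3.]]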
Example #6
    def _compute_loss(logits, labels, num_classes):
        # shape_labels = combined_static_and_dynamic_shape(labels)
        # cls_logits = tf.image.resize_bilinear(cls_logits, shape_labels[1:3], align_corners=True)
        shape_logits = combined_static_and_dynamic_shape(logits)
        labels = tf.image.resize_nearest_neighbor(labels,
                                                  shape_logits[1:3],
                                                  align_corners=True)

        logits = tf.reshape(logits, [-1, shape_logits[-1]])
        labels = tf.reshape(labels, [-1])

        idx = tf.logical_and(tf.greater_equal(labels, 0),
                             tf.less(labels, num_classes))
        idx = tf.where(idx)[:, 0]

        valid_logits = tf.gather(logits, idx)
        valid_labels = tf.gather(labels, idx)

        # cls_loss = focal_loss(labels=valid_labels, logits=valid_logits)
        cls_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=valid_labels, logits=valid_logits)

        correct = tf.equal(
            valid_labels,
            tf.argmax(valid_logits, axis=-1, output_type=valid_labels.dtype))
        acc = tf.cast(correct, tf.float32)
        return tf.reduce_mean(cls_loss), tf.reduce_mean(acc)
Example #7
 def call(self, input):
     cnn_fmaps_lastscale = self.cnn_fmap[-1]
     batch_size, batch_len, depth = shape_utils.combined_static_and_dynamic_shape(
         input)
     _, feature_h, feature_w, feature_c = cnn_fmaps_lastscale.get_shape(
     ).as_list()
     ld_output = tf.reshape(input, (batch_size * batch_len, depth))
     ld_output_reshape = tf.reshape(ld_output,
                                    [batch_size * batch_len, 1, 1, depth])
     ld_output_conv = conv2d(ld_output_reshape,
                             self.embedding_dim,
                             1,
                             activation_fn=None,
                             normalizer_fn=None,
                             scope='compute2d_attention_layer/hs_conv',
                             reuse=tf.AUTO_REUSE)
     ld_output_conv_tile = tf.tile(ld_output_conv,
                                   [1, feature_h, feature_w, 1])
     cnn_fmap_conv = conv2d(cnn_fmaps_lastscale,
                            self.embedding_dim,
                            3,
                            activation_fn=None,
                            normalizer_fn=None,
                            scope='compute2d_attention_layer/fmap_conv',
                            reuse=tf.AUTO_REUSE)
     cnn_fmap_tile = tf.expand_dims(cnn_fmap_conv, 1)
     cnn_fmap_tile = tf.tile(cnn_fmap_tile, [1, batch_len, 1, 1, 1])
     cnn_fmap_tile = tf.reshape(
         cnn_fmap_tile,
         [batch_size * batch_len, feature_h, feature_w, feature_c])
     g = tf.nn.tanh(tf.add(cnn_fmap_tile, ld_output_conv_tile))
     g_conv = conv2d(g,
                     1,
                     1,
                     scope='compute2d_attention_layer/g_conv',
                     activation_fn=None,
                     normalizer_fn=None,
                     reuse=tf.AUTO_REUSE)
     g_conv_reshape = tf.reshape(
         g_conv, [batch_size * batch_len, feature_w * feature_h])
     g_conv_reshape_softmax = tf.nn.softmax(g_conv_reshape)
     mask = tf.reshape(g_conv_reshape_softmax,
                       [batch_size * batch_len, feature_h, feature_w, 1])
     g_tmp = tf.tile(
         tf.reshape(g_conv_reshape_softmax,
                    [batch_size * batch_len, feature_h, feature_w, 1]),
         [1, 1, 1, feature_c])
     glimpse = tf.reduce_sum(tf.multiply(cnn_fmap_tile, g_tmp), [1, 2])
     glimpse = tf.reshape(glimpse, [batch_size, batch_len, depth])
     c_h_concat = tf.concat(
         [glimpse,
          tf.reshape(ld_output, [batch_size, batch_len, depth])],
         axis=-1)
     rnn_output = tf.layers.dense(c_h_concat,
                                  self.output_dim,
                                  name='compute2d_attention_layer/output_w',
                                  reuse=tf.AUTO_REUSE)
     output = tf.reshape(rnn_output, [1, -1, self.output_dim])
     return output
Example #8
    def GenerationLoss(self, predictions_dict, scope=None):
        assert 'logits' in predictions_dict or 'glyphs' in predictions_dict
        with tf.variable_scope(scope, 'Loss', list(predictions_dict.values())):
            glyphs = predictions_dict['glyphs']
            ref_glyphs = tf.constant(np.load('data/glyphs-325-fonts.npy'),
                                     dtype=tf.float32)  # 96 , 325, 32*32
            #ref_glyphs_reshape = tf.reshape(ref_glyphs, [96*325, 32*32])
            labels = self._groundtruth_dict['decoder_targets']
            lengths = self._groundtruth_dict['decoder_lengths']
            batch_size, batch_len = shape_utils.combined_static_and_dynamic_shape(
                labels)
            labels_indexes = tf.reshape(
                labels, [batch_size * batch_len
                         ]) + 96 * predictions_dict['embedding_ids']

            targets = tf.gather(
                ref_glyphs, labels_indexes)  # batch_size * batch_len, 8, 32*32
            targets_for_visual = tf.reshape(
                (targets + 1.0) * 127.5, [batch_size * batch_len, 32, 32, 1])
            tf.summary.image('target_glyph1', (targets_for_visual[:20]),
                             max_outputs=20)
            targets = tf.reshape(targets, [batch_size, batch_len, 32 * 32])

            with tf.name_scope(scope, 'WeightedL1Loss'):
                raw_losses = tf.reduce_mean(tf.abs(glyphs - targets), axis=[2])
                batch_size, max_time = shape_utils.combined_static_and_dynamic_shape(
                    labels)
                mask = tf.less(tf.tile([tf.range(max_time)], [batch_size, 1]),
                               tf.expand_dims(lengths, 1),
                               name='mask')
                masked_losses = tf.multiply(raw_losses,
                                            tf.cast(mask, tf.float32),
                                            name='masked_losses')
                row_losses = tf.reduce_sum(masked_losses, 1, name='row_losses')

                losses_tmp = tf.truediv(row_losses,
                                        tf.cast(lengths, tf.float32))
                loss_for_compare = tf.reduce_mean(losses_tmp)
                tf.summary.scalar('averaged_L1_loss', loss_for_compare)

                loss = tf.reduce_sum(row_losses)
                loss = tf.truediv(
                    loss, tf.cast(tf.maximum(batch_size, 1), tf.float32))
                l1_loss_tensor = loss * 0.5
        return l1_loss_tensor
Example #9
def aspp_inference(logits, image):
    '''
    logits: (N, H', W', nc), classification logits
    image: (N, H, W, _), image or label, used only for shape inference
    '''
    shape_image = combined_static_and_dynamic_shape(image)
    logits = tf.image.resize_bilinear(logits, shape_image[1:3], align_corners=True)
    output = tf.argmax(logits, axis=-1, name='segmentation_output', output_type=tf.int32)
    return output
Example #10
        def _match_when_rows_are_empty():
            """Performs matching when the rows of similarity matrix are empty.

      When the rows are empty, all detections are false positives. So we return
      a tensor of -1's to indicate that the columns do not match to any rows.

      Returns:
        matches:  int32 tensor indicating the row each column matches to.
      """
            similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
                similarity_matrix)
            return -1 * tf.ones([similarity_matrix_shape[1]], dtype=tf.int32)
Example #11
 def _predict(self, image_features, num_predictions_per_location):
     combined_feature_shape = shape_utils.combined_static_and_dynamic_shape(image_features)
     batch_size = combined_feature_shape[0]
     num_anchors = (combined_feature_shape[1] * combined_feature_shape[2])
     code_size = 5
     zero = tf.reduce_sum(0 * image_features)
     box_encodings = zero + tf.zeros((batch_size, num_anchors, 1, code_size), dtype=tf.float32)
     class_predictions_with_background = zero + tf.zeros((batch_size,
                                                          num_anchors,
                                                          self.num_classes + 1), dtype=tf.float32)
     return {box_predictor.BOX_ENCODINGS: box_encodings,
             box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background}
Example #12
def aspp_features(hlist, num_classes=19, alpha=1.0):
    '''
    Args:
        hlist: list of three features, [1/8, 1/16, 1/32].
    '''
    h0, h1, h2 = hlist
    shape_h0 = combined_static_and_dynamic_shape(h0)
    shape_h1 = combined_static_and_dynamic_shape(h1)

    with ssdnet_argscope():
        # merge h1 and h2, create 1/16 feature
        h2 = tf.depth_to_space(h2, 2)
        h12 = tf.concat([h1, h2], axis=-1)  # 128
        h12 = Conv2D('h12', h12, 256, 1, activation=BNReLU)

        with tf.variable_scope('top'):
            feat = Conv2D('conv1', h12, 256, 1, activation=BNReLU)
        with tf.variable_scope('se'):
            s = AvgPooling('avgpool',
                           h12,
                           49,
                           strides=(16, 20),
                           padding='same')
            s = Conv2D('conv1', s, 256, 1, activation=None, use_bias=True)
            s = tf.sigmoid(s, name='sigmoid')
            s = tf.image.resize_bilinear(s, shape_h1[1:3], align_corners=True)
        feat = tf.multiply(feat, s)

        feat = tf.image.resize_bilinear(feat,
                                        shape_h0[1:3],
                                        align_corners=True)
        feat = DWConv('convd', feat, 5)
        feat_l = Conv2D('conv_h0', h0, 128, 1, activation=BNReLU)

    with argscope([Conv2D], use_bias=True):
        feat = Conv2D('logit_up', feat, num_classes, 1)
        feat_l = Conv2D('logit_h0', feat_l, num_classes, 1)

    out = tf.add(feat, alpha * feat_l, name='cls_logit')
    return out
Example #13
    def _match_when_rows_are_non_empty():
      """Performs matching when the rows of similarity matrix are non empty.

      Returns:
        matches:  int32 tensor indicating the row each column matches to.
      """
      # Matches for each column
      matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)

      # Deal with matched and unmatched threshold
      if self._matched_threshold is not None:
        # Get logical indices of ignored and unmatched columns as tf.int64
        matched_vals = tf.reduce_max(similarity_matrix, 0)
        below_unmatched_threshold = tf.greater(self._unmatched_threshold,
                                               matched_vals)
        between_thresholds = tf.logical_and(
            tf.greater_equal(matched_vals, self._unmatched_threshold),
            tf.greater(self._matched_threshold, matched_vals))

        if self._negatives_lower_than_unmatched:
          matches = self._set_values_using_indicator(matches,
                                                     below_unmatched_threshold,
                                                     -1)
          matches = self._set_values_using_indicator(matches,
                                                     between_thresholds,
                                                     -2)
        else:
          matches = self._set_values_using_indicator(matches,
                                                     below_unmatched_threshold,
                                                     -2)
          matches = self._set_values_using_indicator(matches,
                                                     between_thresholds,
                                                     -1)

      if self._force_match_for_each_row:
        similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
            similarity_matrix)
        force_match_column_ids = tf.argmax(similarity_matrix, 1,
                                           output_type=tf.int32)
        force_match_column_indicators = (
            tf.one_hot(
                force_match_column_ids, depth=similarity_matrix_shape[1]) *
            tf.cast(tf.expand_dims(valid_rows, axis=-1), dtype=tf.float32))
        force_match_row_ids = tf.argmax(force_match_column_indicators, 0,
                                        output_type=tf.int32)
        force_match_column_mask = tf.cast(
            tf.reduce_max(force_match_column_indicators, 0), tf.bool)
        final_matches = tf.where(force_match_column_mask,
                                 force_match_row_ids, matches)
        return final_matches
      else:
        return matches
Example #14
    def predict(self, cnn_fmaps_mulscale, lstm_holistic_features, scope=None):
        with tf.variable_scope(scope, 'Predict'):
            predict = []
            ### a two layer LSTM
            cell0 = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=True)

            if self._is_training:
                cell0 = tf.nn.rnn_cell.DropoutWrapper(cell=cell0,
                                                      output_keep_prob=0.5)
            cell1 = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=True)
            if self._is_training:
                cell1 = tf.nn.rnn_cell.DropoutWrapper(cell=cell1,
                                                      output_keep_prob=0.5)
            lstm_cell = tf.nn.rnn_cell.MultiRNNCell([cell0, cell1],
                                                    state_is_tuple=True)
            char_embedding_array = tf.constant(
                np.identity(self.num_classes, dtype=np.float32))
            with tf.variable_scope('decoder') as scope:
                ld_output, ld_output_states = tf.nn.dynamic_rnn(
                    cell=lstm_cell,
                    inputs=tf.nn.embedding_lookup(
                        char_embedding_array,
                        self._groundtruth_dict['decoder_inputs']),
                    #sequence_length=tf.fill([batch_size], feature_w),
                    initial_state=lstm_holistic_features,
                    dtype=tf.float32,
                    time_major=False,
                    scope=scope)

            batch_size, batch_len, depth = shape_utils.combined_static_and_dynamic_shape(
                ld_output)
            rnn_output, glyphs, embeddings_ids = self.compute_att_2d(
                ld_output, cnn_fmaps_mulscale, 512)

            sample_id = tf.argmax(rnn_output, 2)
            if self._is_training:
                #assert isinstance(outputs, seq2seq.BasicDecoderOutput)
                outputs_dict = {
                    'labels': sample_id,
                    'logits': rnn_output,
                    'glyphs': glyphs,
                    'embedding_ids': embeddings_ids
                }
            else:
                outputs_dict = {
                    'labels': sample_id,
                    'scores': res_score,
                    #'lengths': prediction_lengths
                }
        return outputs_dict
Example #15
def tile_context_tensors(tensor_dict):
  """Tiles context fields to have num_frames along 0-th dimension."""

  num_frames = tf.shape(tensor_dict[fields.InputDataFields.image])[0]

  for key in tensor_dict:
    if key not in fields.SEQUENCE_FIELDS:
      original_tensor = tensor_dict[key]
      tensor_shape = shape_utils.combined_static_and_dynamic_shape(
          original_tensor)
      tensor_dict[key] = tf.tile(
          tf.expand_dims(original_tensor, 0),
          tf.stack([num_frames] + [1] * len(tensor_shape), axis=0))
  return tensor_dict
Example #16
    def _get_feature_map_spatial_dims(self, feature_maps):
        """Return list of spatial dimensions for each feature map in a list.

    Args:
      feature_maps: a list of tensors where the ith tensor has shape
          [batch, height_i, width_i, depth_i].

    Returns:
      a list of pairs (height, width) for each feature map in feature_maps
    """
        feature_map_shapes = [
            shape_utils.combined_static_and_dynamic_shape(feature_map)
            for feature_map in feature_maps
        ]
        return [(shape[1], shape[2]) for shape in feature_map_shapes]
Example #17
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
    """Matrix multiplication based implementation of tf.gather on zeroth axis.

  TODO(rathodv, jonathanhuang): enable sparse matmul option.

  Args:
    params: A float32 Tensor. The tensor from which to gather values.
      Must be at least rank 1.
    indices: A Tensor. Must be one of the following types: int32, int64.
      Must be in range [0, params.shape[0])
    scope: A name for the operation (optional).

  Returns:
    A Tensor. Has the same type as params. Values from params gathered
    from indices given by indices, with shape indices.shape + params.shape[1:].
  """
    with tf.name_scope(scope, 'MatMulGather'):
        params_shape = shape_utils.combined_static_and_dynamic_shape(params)
        indices_shape = shape_utils.combined_static_and_dynamic_shape(indices)
        params2d = tf.reshape(params, [params_shape[0], -1])
        indicator_matrix = tf.one_hot(indices, params_shape[0])
        gathered_result_flattened = tf.matmul(indicator_matrix, params2d)
        return tf.reshape(gathered_result_flattened,
                          tf.stack(indices_shape + params_shape[1:]))
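An illustrative check of the one-hot matmul gather above (assuming TF 1.x): for valid indices it should agree with tf.gather on axis 0, while staying friendly to hardware without a native gather op.

params = tf.constant([[1., 2.], [3., 4.], [5., 6.]])
indices = tf.constant([2, 0, 2], dtype=tf.int32)
gathered = matmul_gather_on_zeroth_axis(params, indices)
reference = tf.gather(params, indices)
with tf.Session() as sess:
    print(sess.run([gathered, reference]))  # both [[5. 6.], [1. 2.], [5. 6.]]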
Example #18
 def _aggregate_recognition_results(self, text_list, scores_list, scope=None):
   """Aggregate recognition results by picking up ones with highest scores.
   Args
     text_list: a list of tensors with shape [batch_size]
     scores_list: a list of tensors with shape [batch_size]
   """
   with tf.variable_scope(scope, 'AggregateRecognitionResults', (text_list + scores_list)):
     stacked_text = tf.stack(text_list, axis=1)
     stacked_scores = tf.stack(scores_list, axis=1)
     argmax_scores = tf.argmax(stacked_scores, axis=1)
     batch_size = shape_utils.combined_static_and_dynamic_shape(stacked_text)[0]
     indices = tf.stack([tf.range(batch_size, dtype=tf.int64), argmax_scores], axis=1)
     aggregated_text = tf.gather_nd(stacked_text, indices)
     aggregated_scores = tf.gather_nd(stacked_scores, indices)
     recognition_dict = {'text': aggregated_text, 'scores': aggregated_scores}
   return recognition_dict
Example #19
    def predict(self, cnn_fmap, lstm_holistic_features, scope=None):
        '''
    if not isinstance(feature_maps, (list, tuple)):
      raise ValueError('`feature_maps` must be list of tuple')
    '''
        with tf.variable_scope(scope, 'Predict'):
            cnn_fmaps_lastscale = cnn_fmap[-1]
            batch_size = shape_utils.combined_static_and_dynamic_shape(
                cnn_fmaps_lastscale)[0]
            decoder_cell = self._build_decoder_cell()
            decoder = self._build_decoder(decoder_cell, batch_size,
                                          lstm_holistic_features, cnn_fmap)
            outputs, _, output_lengths = seq2seq.dynamic_decode(
                decoder=decoder,
                output_time_major=False,
                impute_finished=False,
                maximum_iterations=self._max_num_steps)
            # apply regularizer
            filter_weights = lambda vars: [
                x for x in vars if x.op.name.endswith('kernel')
            ]
            tf.contrib.layers.apply_regularization(
                self._rnn_regularizer,
                filter_weights(decoder_cell.trainable_weights))

            outputs_dict = None
            if self._is_training:
                assert isinstance(outputs, seq2seq.BasicDecoderOutput)
                outputs_dict = {
                    'labels': outputs.sample_id,
                    'logits': outputs.rnn_output,
                }
            else:
                assert isinstance(outputs,
                                  seq2seq.FinalBeamSearchDecoderOutput)
                prediction_labels = outputs.predicted_ids[:, :, 0]
                prediction_lengths = output_lengths[:, 0]
                prediction_scores = tf.gather_nd(
                    outputs.beam_search_decoder_output.scores[:, :, 0],
                    tf.stack([tf.range(batch_size), prediction_lengths - 1],
                             axis=1))
                outputs_dict = {
                    'labels': prediction_labels,
                    'scores': prediction_scores,
                    'lengths': prediction_lengths
                }
        return outputs_dict
Example #20
def resize_image(image,
                 masks=None,
                 new_height=600,
                 new_width=1024,
                 method=tf.image.ResizeMethod.BILINEAR,
                 align_corners=False):

    with tf.name_scope(
            'ResizeImage',
            values=[image, new_height, new_width, method, align_corners]):
        new_image = tf.image.resize_images(image,
                                           tf.stack([new_height, new_width]),
                                           method=method,
                                           align_corners=align_corners)
        image_shape = shape_utils.combined_static_and_dynamic_shape(image)
        result = [new_image]
        result.append(tf.stack([new_height, new_width, image_shape[2]]))

        return result
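A minimal usage sketch for resize_image above (assuming TF 1.x); note that this particular snippet does not resize the optional masks argument.

img = tf.zeros([480, 640, 3])
resized_image, true_shape = resize_image(img)
# resized_image: float32 tensor of shape [600, 1024, 3]
# true_shape: tensor [600, 1024, 3] (new height, new width, original channels)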
Example #21
  def __call__(self, logits, labels, lengths, scope=None):
    """
    Args:
      logits: float32 tensor with shape [batch_size, max_time, num_classes]
      labels: int32 tensor with shape [batch_size, max_time]
      lengths: int32 tensor with shape [batch_size]
    """
    #print('raw_losses')
    #print(logits)
    with tf.name_scope(scope, 'SequenceCrossEntropyLoss', [logits, labels, lengths]):
      raw_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels,
        logits=logits
      )

      #print(raw_losses)
      #input()
      batch_size, max_time = shape_utils.combined_static_and_dynamic_shape(labels)
      mask = tf.less(
        tf.tile([tf.range(max_time)], [batch_size, 1]),
        tf.expand_dims(lengths, 1),
        name='mask'
      )
      masked_losses = tf.multiply(
        raw_losses,
        tf.cast(mask, tf.float32),
        name='masked_losses'
      ) # => [batch_size, max_time]
      row_losses = tf.reduce_sum(masked_losses, 1, name='row_losses')
      # Normalize each row's loss by its sequence length before summing,
      # if requested; otherwise the normalization would be overwritten below.
      if self._sequence_normalize:
        row_losses = tf.truediv(
          row_losses,
          tf.cast(tf.maximum(lengths, 1), tf.float32),
          name='seq_normed_losses')
      loss = tf.reduce_sum(row_losses)
      if self._sample_normalize:
        loss = tf.truediv(
          loss,
          tf.cast(tf.maximum(batch_size, 1), tf.float32))
      if self._weight:
        loss = loss * self._weight
    return loss
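The masking trick above is worth isolating: positions past each sequence's length contribute zero loss. A tiny illustrative sketch (TF 1.x); tf.sequence_mask(lengths, max_time) would build the same boolean mask.

lengths = tf.constant([2, 3], dtype=tf.int32)
batch_size, max_time = 2, 4
mask = tf.less(tf.tile([tf.range(max_time)], [batch_size, 1]),
               tf.expand_dims(lengths, 1))
with tf.Session() as sess:
    print(sess.run(tf.cast(mask, tf.int32)))
    # [[1 1 0 0]
    #  [1 1 1 0]]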
Example #22
def nearest_neighbor_upsampling(input_tensor,
                                scale=None,
                                height_scale=None,
                                width_scale=None):
    """Nearest neighbor upsampling implementation.

  Nearest neighbor upsampling function that maps an input tensor with shape
  [batch_size, height, width, channels] to
  [batch_size, height * scale, width * scale, channels]. This implementation
  only uses reshape and broadcasting to make it TPU compatible.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data in both height
      and width dimensions.
    height_scale: An integer multiple to scale the height of input image. This
      option when provided overrides `scale` option.
    width_scale: An integer multiple to scale the width of input image. This
      option when provided overrides `scale` option.
  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].

  Raises:
    ValueError: If both scale and height_scale or if both scale and width_scale
      are None.
  """
    if not scale and (height_scale is None or width_scale is None):
        raise ValueError('Provide either `scale` or `height_scale` and'
                         ' `width_scale`.')
    with tf.name_scope('nearest_neighbor_upsampling'):
        h_scale = scale if height_scale is None else height_scale
        w_scale = scale if width_scale is None else width_scale
        (batch_size, height, width, channels
         ) = shape_utils.combined_static_and_dynamic_shape(input_tensor)
        output_tensor = tf.reshape(input_tensor, [
            batch_size, height, 1, width, 1, channels
        ]) * tf.ones([1, 1, h_scale, 1, w_scale, 1], dtype=input_tensor.dtype)
        return tf.reshape(
            output_tensor,
            [batch_size, height * h_scale, width * w_scale, channels])
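An illustrative call to the variant above with independent height and width factors (assuming TF 1.x and the shape_utils import used throughout this page):

feat = tf.zeros([8, 10, 20, 64])
feat_up = nearest_neighbor_upsampling(feat, height_scale=2, width_scale=4)
# feat_up has static shape [8, 20, 80, 64].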
Example #23
 def _predict(self, image_features, **kwargs):
     image_feature = image_features[0]
     combined_feature_shape = shape_utils.combined_static_and_dynamic_shape(
         image_feature)
     batch_size = combined_feature_shape[0]
     num_anchors = (combined_feature_shape[1] * combined_feature_shape[2])
     code_size = 4
     zero = tf.reduce_sum(0 * image_feature)
     num_class_slots = self.num_classes
     if self._add_background_class:
         num_class_slots = num_class_slots + 1
     box_encodings = zero + tf.zeros(
         (batch_size, num_anchors, 1, code_size), dtype=tf.float32)
     class_predictions_with_background = zero + tf.zeros(
         (batch_size, num_anchors, num_class_slots), dtype=tf.float32)
     predictions_dict = {
         box_predictor.BOX_ENCODINGS:
         box_encodings,
         box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND:
         class_predictions_with_background
     }
     return predictions_dict
Example #24
def tile_activation_maps_rows_cols(maps, num_rows, num_cols):
    """
  Args:
    maps: [batch_size, map_height, map_width, map_depth]
  Return:
    tiled_map: [batch_size, tiled_height, tiled_width]
  """
    batch_size, map_height, map_width, map_depth = \
      shape_utils.combined_static_and_dynamic_shape(maps)

    # padding
    num_maps = num_rows * num_cols
    padded_map = tf.cond(tf.greater(num_maps, map_depth),
                         true_fn=lambda: tf.pad(
                             maps, [[0, 0], [0, 0], [0, 0],
                                    [0, tf.maximum(num_maps - map_depth, 0)]]),
                         false_fn=lambda: maps[:, :, :, :num_maps])

    # reshape to [batch_size, map_height, map_width, num_rows, num_cols]
    reshaped_map = tf.reshape(
        padded_map, [batch_size, map_height, map_width, num_rows, num_cols])

    # unstack and concat along widths
    width_concated_maps = tf.concat(
        tf.unstack(
            reshaped_map, axis=4
        ),  # => list of [batch_size, map_height, map_width, num_rows]
        axis=2)  # => [batch_size, map_height, map_width * num_cols, num_rows]

    tiled_map = tf.concat(
        tf.unstack(
            width_concated_maps, axis=3
        ),  # => list of [batch_size, map_height, map_width * num_cols]
        axis=1)  # => [batch_size, map_height * num_rows, map_width * num_cols]

    tiled_map = tf.expand_dims(tiled_map, axis=3)

    return tiled_map
Example #25
def nearest_neighbor_upsampling(input_tensor, scale):
  """Nearest neighbor upsampling implementation.

  Nearest neighbor upsampling function that maps an input tensor with shape
  [batch_size, height, width, channels] to
  [batch_size, height * scale, width * scale, channels]. This implementation
  only uses reshape and tile to make it compatible with certain hardware.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data.
  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].
  """
  shape = shape_utils.combined_static_and_dynamic_shape(input_tensor)
  shape_before_tile = [shape[0], shape[1], 1, shape[2], 1, shape[3]]
  shape_after_tile = [shape[0], shape[1] * scale, shape[2] * scale, shape[3]]
  data_reshaped = tf.reshape(input_tensor, shape_before_tile)
  resized_tensor = tf.tile(data_reshaped, [1, 1, scale, 1, scale, 1])
  resized_tensor = tf.reshape(resized_tensor, shape_after_tile)
  return resized_tensor
Example #26
  def _create_regression_targets(self, anchors, groundtruth_boxes, match):
    """Returns a regression target for each anchor.

    Args:
      anchors: a BoxList representing N anchors
      groundtruth_boxes: a BoxList representing M groundtruth_boxes
      match: a matcher.Match object

    Returns:
      reg_targets: a float32 tensor with shape [N, box_code_dimension]
    """
    matched_gt_boxes = match.gather_based_on_match(
        groundtruth_boxes.get(),
        unmatched_value=tf.zeros(4),
        ignored_value=tf.zeros(4))
    matched_gt_boxlist = box_list.BoxList(matched_gt_boxes)
    if groundtruth_boxes.has_field(fields.BoxListFields.keypoints):
      groundtruth_keypoints = groundtruth_boxes.get_field(
          fields.BoxListFields.keypoints)
      matched_keypoints = match.gather_based_on_match(
          groundtruth_keypoints,
          unmatched_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]),
          ignored_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]))
      matched_gt_boxlist.add_field(fields.BoxListFields.keypoints,
                                   matched_keypoints)
    matched_reg_targets = self._box_coder.encode(matched_gt_boxlist, anchors)
    match_results_shape = shape_utils.combined_static_and_dynamic_shape(
        match.match_results)

    # Zero out the unmatched and ignored regression targets.
    unmatched_ignored_reg_targets = tf.tile(
        self._default_regression_target(), [match_results_shape[0], 1])
    matched_anchors_mask = match.matched_column_indicator()
    reg_targets = tf.where(matched_anchors_mask,
                           matched_reg_targets,
                           unmatched_ignored_reg_targets)
    return reg_targets
Example #27
    def _compute_clip_window(self, preprocessed_images, true_image_shapes):
        """Computes clip window to use during post_processing.

    Computes a new clip window to use during post-processing based on
    `resized_image_shapes` and `true_image_shapes` only if `preprocess` method
    has been called. Otherwise returns a default clip window of [0, 0, 1, 1].

    Args:
      preprocessed_images: the [batch, height, width, channels] image
          tensor.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros. Or None if the clip window should cover the full image.

    Returns:
      a 2-D float32 tensor of the form [batch_size, 4] containing the clip
      window for each image in the batch in normalized coordinates (relative to
      the resized dimensions) where each clip window is of the form [ymin, xmin,
      ymax, xmax] or a default clip window of [0, 0, 1, 1].

    """
        if true_image_shapes is None:
            return tf.constant([0, 0, 1, 1], dtype=tf.float32)

        resized_inputs_shape = shape_utils.combined_static_and_dynamic_shape(
            preprocessed_images)
        true_heights, true_widths, _ = tf.unstack(
            tf.to_float(true_image_shapes), axis=1)
        padded_height = tf.to_float(resized_inputs_shape[1])
        padded_width = tf.to_float(resized_inputs_shape[2])
        return tf.stack([
            tf.zeros_like(true_heights),
            tf.zeros_like(true_widths), true_heights / padded_height,
            true_widths / padded_width
        ],
                        axis=1)
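Worked numbers for the clip-window formula above (illustrative values, not from the source): a 200x300 true image padded to 300x300 clips to [ymin, xmin, ymax, xmax] = [0, 0, 2/3, 1] in normalized coordinates.

true_heights = tf.constant([200.])
true_widths = tf.constant([300.])
padded_height, padded_width = 300., 300.
clip_window = tf.stack([tf.zeros_like(true_heights),
                        tf.zeros_like(true_widths),
                        true_heights / padded_height,
                        true_widths / padded_width], axis=1)
# clip_window evaluates to [[0., 0., 0.6667, 1.]]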
Example #28
    def _batch_decode(self, box_encodings):
        """Decodes a batch of box encodings with respect to the anchors.

    Args:
      box_encodings: A float32 tensor of shape
        [batch_size, num_anchors, box_code_size] containing box encodings.

    Returns:
      decoded_boxes: A float32 tensor of shape
        [batch_size, num_anchors, 4] containing the decoded boxes.
      decoded_keypoints: A float32 tensor of shape
        [batch_size, num_anchors, num_keypoints, 2] containing the decoded
        keypoints if present in the input `box_encodings`, None otherwise.
    """
        combined_shape = shape_utils.combined_static_and_dynamic_shape(
            box_encodings)
        batch_size = combined_shape[0]
        tiled_anchor_boxes = tf.tile(tf.expand_dims(self.anchors.get(), 0),
                                     [batch_size, 1, 1])
        tiled_anchors_boxlist = box_list.BoxList(
            tf.reshape(tiled_anchor_boxes, [-1, 4]))
        decoded_boxes = self._box_coder.decode(
            tf.reshape(box_encodings, [-1, self._box_coder.code_size]),
            tiled_anchors_boxlist)
        decoded_keypoints = None
        if decoded_boxes.has_field(fields.BoxListFields.keypoints):
            decoded_keypoints = decoded_boxes.get_field(
                fields.BoxListFields.keypoints)
            num_keypoints = decoded_keypoints.get_shape()[1]
            decoded_keypoints = tf.reshape(
                decoded_keypoints,
                tf.stack(
                    [combined_shape[0], combined_shape[1], num_keypoints, 2]))
        decoded_boxes = tf.reshape(
            decoded_boxes.get(),
            tf.stack([combined_shape[0], combined_shape[1], 4]))
        return decoded_boxes, decoded_keypoints
Example #29
def aspp_losses(cls_logits, labels, num_classes):
    '''
    Args:
        labels: (N, H, W, 1) label image
        cls_logits: (N, H', W', nc) classification logits

        For now, H' and W' are H/8 and W/8, respectively.
    '''
    # shape_labels = combined_static_and_dynamic_shape(labels)
    # cls_logits = tf.image.resize_bilinear(cls_logits, shape_labels[1:3], align_corners=True)
    shape_logits = combined_static_and_dynamic_shape(cls_logits)
    labels = tf.image.resize_nearest_neighbor(labels,
                                              shape_logits[1:3],
                                              align_corners=True)

    logits = tf.reshape(cls_logits, [-1, shape_logits[-1]])
    labels = tf.reshape(labels, [-1])

    idx = tf.logical_and(tf.greater_equal(labels, 0),
                         tf.less(labels, num_classes))
    idx = tf.where(idx)[:, 0]

    valid_logits = tf.gather(logits, idx)
    valid_labels = tf.gather(labels, idx)

    # cls_loss = focal_loss(labels=valid_labels, logits=valid_logits)
    cls_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=valid_labels, logits=valid_logits)
    cls_loss = tf.reduce_mean(cls_loss, name='cls_loss')

    correct = tf.equal(
        valid_labels,
        tf.argmax(valid_logits, axis=-1, output_type=valid_labels.dtype))
    acc = tf.reduce_mean(tf.cast(correct, tf.float32), name='accuracy')
    add_moving_summary(cls_loss, acc)
    # return the loss
    return cls_loss
Example #30
    def compute_att_2d(self, ld_output, cnn_fmaps_mulscale, d):
        with tf.variable_scope('decoder/compute2d_attention_layer'):
            cnn_fmaps_s1 = cnn_fmaps_mulscale[0]  # 24 * 80
            cnn_fmaps_s2 = cnn_fmaps_mulscale[1]  # 12 * 40
            cnn_fmaps_s3 = cnn_fmaps_mulscale[2]  # 6 * 40
            cnn_fmaps_lastscale = cnn_fmaps_mulscale[-1]  # 6 * 40
            batch_size, batch_len, depth = shape_utils.combined_static_and_dynamic_shape(
                ld_output)
            _, feature_h, feature_w, feature_c = cnn_fmaps_lastscale.get_shape(
            ).as_list()
            ## for lstm outputs
            ld_output = tf.reshape(ld_output, (batch_size * batch_len, depth))
            ld_output_reshape = tf.reshape(
                ld_output, [batch_size * batch_len, 1, 1, depth])
            ld_output_conv = conv2d(ld_output_reshape,
                                    d,
                                    1,
                                    activation_fn=None,
                                    normalizer_fn=None,
                                    scope='hs_conv')
            ld_output_conv_tile = tf.tile(ld_output_conv,
                                          [1, feature_h, feature_w, 1])

            cnn_fmap_conv = conv2d(cnn_fmaps_lastscale,
                                   d,
                                   3,
                                   activation_fn=None,
                                   normalizer_fn=None,
                                   scope='fmap_conv')
            cnn_fmap_tile = tf.expand_dims(cnn_fmap_conv, 1)
            cnn_fmap_tile = tf.tile(cnn_fmap_tile, [1, batch_len, 1, 1, 1])
            cnn_fmap_tile = tf.reshape(
                cnn_fmap_tile,
                [batch_size * batch_len, feature_h, feature_w, feature_c])

            g = tf.nn.tanh(tf.add(cnn_fmap_tile, ld_output_conv_tile))
            g = tf.nn.dropout(g, 0.5)
            g_conv = conv2d(g,
                            1,
                            1,
                            scope='g_conv',
                            activation_fn=None,
                            normalizer_fn=None)
            g_conv_reshape = tf.reshape(
                g_conv, [batch_size * batch_len, feature_w * feature_h])
            g_conv_reshape_softmax = tf.nn.softmax(g_conv_reshape)
            mask = tf.reshape(
                g_conv_reshape_softmax,
                [batch_size * batch_len, feature_h, feature_w, 1])
            tf.summary.image('Mask1', (mask[:20]), max_outputs=20)

            g_tmp = tf.tile(
                tf.reshape(g_conv_reshape_softmax,
                           [batch_size * batch_len, feature_h, feature_w, 1]),
                [1, 1, 1, feature_c])
            glimpse = tf.reduce_sum(tf.multiply(cnn_fmap_tile, g_tmp), [1, 2])

            _, cnn_fmap_s1_h, cnn_fmap_s1_w, cnn_fmap_s1_c = cnn_fmaps_s1.get_shape(
            ).as_list()
            _, cnn_fmap_s2_h, cnn_fmap_s2_w, cnn_fmap_s2_c = cnn_fmaps_s2.get_shape(
            ).as_list()
            _, cnn_fmap_s3_h, cnn_fmap_s3_w, cnn_fmap_s3_c = cnn_fmaps_s3.get_shape(
            ).as_list()

            mask_s3 = tf.tile(mask, [1, 1, 1, cnn_fmap_s3_c])  #bs_bl, 6 ,40 ,1
            mask_s2 = tf.tile(
                tf.image.resize_bilinear(mask, [cnn_fmap_s2_h, cnn_fmap_s2_w]),
                [1, 1, 1, cnn_fmap_s2_c])
            mask_s1 = tf.tile(
                tf.image.resize_bilinear(mask, [cnn_fmap_s1_h, cnn_fmap_s1_w]),
                [1, 1, 1, cnn_fmap_s1_c])
            # cnn_fmaps_s1 ( bs * 24 * 80 * c )

            cnn_fmap_s1_tile = tf.expand_dims(cnn_fmaps_s1, 1)
            cnn_fmap_s1_tile = tf.tile(cnn_fmap_s1_tile,
                                       [1, batch_len, 1, 1, 1])
            cnn_fmap_s1_tile = tf.reshape(cnn_fmap_s1_tile, [
                batch_size * batch_len, cnn_fmap_s1_h, cnn_fmap_s1_w,
                cnn_fmap_s1_c
            ])
            glimpse_s1 = tf.multiply(cnn_fmap_s1_tile, mask_s1)

            cnn_fmap_s2_tile = tf.expand_dims(cnn_fmaps_s2, 1)
            cnn_fmap_s2_tile = tf.tile(cnn_fmap_s2_tile,
                                       [1, batch_len, 1, 1, 1])
            cnn_fmap_s2_tile = tf.reshape(cnn_fmap_s2_tile, [
                batch_size * batch_len, cnn_fmap_s2_h, cnn_fmap_s2_w,
                cnn_fmap_s2_c
            ])
            glimpse_s2 = tf.multiply(cnn_fmap_s2_tile, mask_s2)

            cnn_fmap_s3_tile = tf.expand_dims(cnn_fmaps_s3, 1)
            cnn_fmap_s3_tile = tf.tile(cnn_fmap_s3_tile,
                                       [1, batch_len, 1, 1, 1])
            cnn_fmap_s3_tile = tf.reshape(cnn_fmap_s3_tile, [
                batch_size * batch_len, cnn_fmap_s3_h, cnn_fmap_s3_w,
                cnn_fmap_s3_c
            ])
            glimpse_s3 = tf.multiply(cnn_fmap_s3_tile, mask_s3)

            glimpse_s1_reshape = tf.reshape(glimpse_s1, [
                batch_size * batch_len, cnn_fmap_s1_h * cnn_fmap_s1_w,
                cnn_fmap_s1_c
            ])
            glimpse_s1_reshape = tf.reshape(glimpse_s1_reshape, [
                batch_size * batch_len, cnn_fmap_s1_c,
                cnn_fmap_s1_h * cnn_fmap_s1_w
            ])

            glimpse_s2_reshape = tf.reshape(glimpse_s2, [
                batch_size * batch_len, cnn_fmap_s2_h * cnn_fmap_s2_w,
                cnn_fmap_s2_c
            ])
            glimpse_s2_reshape = tf.reshape(glimpse_s2_reshape, [
                batch_size * batch_len, cnn_fmap_s2_c,
                cnn_fmap_s2_h * cnn_fmap_s2_w
            ])

            glimpse_s3_reshape = tf.reshape(glimpse_s3, [
                batch_size * batch_len, cnn_fmap_s3_h * cnn_fmap_s3_w,
                cnn_fmap_s3_c
            ])
            glimpse_s3_reshape = tf.reshape(glimpse_s3_reshape, [
                batch_size * batch_len, cnn_fmap_s3_c,
                cnn_fmap_s3_h * cnn_fmap_s3_w
            ])

            glimpse_s1_resize_ = fully_connected(glimpse_s1_reshape, 16 * 16)
            glimpse_s2_resize_ = fully_connected(glimpse_s2_reshape, 8 * 8)
            glimpse_s3_resize_ = fully_connected(glimpse_s3_reshape, 4 * 4)

            glimpse_s1_resize = tf.reshape(
                glimpse_s1_resize_,
                [batch_size * batch_len, 16 * 16, cnn_fmap_s1_c])
            glimpse_s1_resize = tf.reshape(
                glimpse_s1_resize,
                [batch_size * batch_len, 16, 16, cnn_fmap_s1_c])

            glimpse_s2_resize = tf.reshape(
                glimpse_s2_resize_,
                [batch_size * batch_len, 8 * 8, cnn_fmap_s2_c])
            glimpse_s2_resize = tf.reshape(
                glimpse_s2_resize,
                [batch_size * batch_len, 8, 8, cnn_fmap_s2_c])

            glimpse_s3_resize = tf.reshape(
                glimpse_s3_resize_,
                [batch_size * batch_len, 4 * 4, cnn_fmap_s3_c])
            glimpse_s3_resize = tf.reshape(
                glimpse_s3_resize,
                [batch_size * batch_len, 4, 4, cnn_fmap_s3_c])

            embeddings_ids = tf.random_uniform([batch_size * batch_len],
                                               minval=0,
                                               maxval=325,
                                               dtype=tf.int64)
            embeddings_fordeconv = tf.gather(self._embeddings, embeddings_ids)

            glimpse_fordeconv = tf.reshape(
                glimpse, [batch_size * batch_len, 1, 1, depth])
            concat_feat = tf.concat([glimpse_fordeconv, embeddings_fordeconv],
                                    axis=-1)
            d1 = conv2d_transpose(concat_feat,
                                  128, [2, 2], [2, 2],
                                  normalizer_fn=batch_norm,
                                  scope='gly_deconv_1')
            d2 = conv2d_transpose(d1,
                                  64, [3, 3], [2, 2],
                                  normalizer_fn=batch_norm,
                                  scope='gly_deconv_2')
            d3 = conv2d_transpose(tf.concat([d2, glimpse_s3_resize], axis=-1),
                                  32, [3, 3], [2, 2],
                                  normalizer_fn=batch_norm,
                                  scope='gly_deconv_3')
            d4 = conv2d_transpose(tf.concat([d3, glimpse_s2_resize], axis=-1),
                                  16, [3, 3], [2, 2],
                                  normalizer_fn=batch_norm,
                                  scope='gly_deconv_4')
            d5 = conv2d_transpose(tf.concat([d4, glimpse_s1_resize], axis=-1),
                                  1, [3, 3], [2, 2],
                                  activation_fn=tf.nn.tanh,
                                  scope='gly_deconv_5')

            glyph = d5  # batch_size * batchlen , 32, 32, 1
            glyph = tf.reshape(glyph, [batch_size * batch_len, 32 * 32
                                       ])  # batch_size * batchlen , 32 * 32
            glyph_for_visual = tf.reshape((glyph + 1.0) * 127.5,
                                          [batch_size * batch_len, 32, 32, 1])
            tf.summary.image('glyph1', (glyph_for_visual[:20]), max_outputs=20)

            glyph_output = tf.reshape(glyph,
                                      [batch_size, batch_len, 32 * 32
                                       ])  # batch_size , batchlen , 32 * 32

            glimpse = tf.reshape(glimpse, [batch_size, batch_len, depth])
            c_h_concat = tf.concat([
                glimpse,
                tf.reshape(ld_output, [batch_size, batch_len, depth])
            ],
                                   axis=-1)
            rnn_output = tf.layers.dense(c_h_concat,
                                         self.num_classes,
                                         name='output_w')

        return rnn_output, glyph_output, embeddings_ids