Example #1
        def _cumsum_gradient(score_map, box):
            b, n, m, c = utils.get_tensor_shape(score_map)
            _, p, _ = utils.get_tensor_shape(box)

            expanded_box = _get_expanded_box(box,
                                             img_h=n,
                                             img_w=m,
                                             border_ratio=border_ratio)

            (box_h, box_w) = _get_box_shape(box)
            (expanded_box_h, expanded_box_w) = _get_box_shape(expanded_box)

            cumsum = imgproc.calc_cumsum_2d(
                score_map, tf.concat([box, expanded_box], axis=1))

            area = tf.expand_dims(tf.cast(box_h * box_w, tf.float32), axis=-1)
            area_border = tf.expand_dims(tf.cast(
                expanded_box_h * expanded_box_w - box_h * box_w, tf.float32),
                                         axis=-1)

            avg_val = tf.div(cumsum[:, :p, :], tf.maximum(_SMALL_NUMBER, area))
            avg_val_in_border = tf.div(cumsum[:, p:, :] - cumsum[:, :p, :],
                                       tf.maximum(_SMALL_NUMBER, area_border))

            return avg_val - avg_val_in_border
Example #2
def calc_cumsum_2d(image, box):
    """Computes the cumulative sum give pre-defiend boxes.

  i_a (ymin, xmin), ..., i_b (ymin, xmax)
  i_c (ymax, xmin), ..., i_d (ymax, xmax)

  Args:
    image: 4-D float `Tensor` of size [b, n, m, c], representing `b` images with
      height `n`, width `m`, and channels `c`.
    box: 3-D int64 `Tensor` of size [b, p, 4], representing `b` examples each 
      with `p` proposals in the format of [ymin, xmin, ymax, xmax].

  Returns:
    cumsum: 3-D float `Tensor` of size [b, p, c], channel-wise cumulative sum.
  """
    b, n, m, c = utils.get_tensor_shape(image)
    _, p, _ = utils.get_tensor_shape(box)

    cumsum = calc_integral_image(image)
    ymin, xmin, ymax, xmax = tf.unstack(box, axis=-1)

    i = tf.range(tf.cast(b, tf.int64), dtype=tf.int64)
    i = tf.tile(tf.expand_dims(i, axis=-1), [1, p])

    i_a = tf.gather_nd(cumsum, tf.stack([i, ymin, xmin], axis=-1))
    i_b = tf.gather_nd(cumsum, tf.stack([i, ymin, xmax], axis=-1))
    i_c = tf.gather_nd(cumsum, tf.stack([i, ymax, xmin], axis=-1))
    i_d = tf.gather_nd(cumsum, tf.stack([i, ymax, xmax], axis=-1))

    return i_d + i_a - i_b - i_c
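
The four-corner lookup implements the standard integral-image identity: with cumsum[i, j] holding the sum of image[:i, :j], the sum over the window [ymin:ymax, xmin:xmax] is i_d + i_a - i_b - i_c. Below is a minimal NumPy sketch of the same identity; the helper name is hypothetical, and it assumes calc_integral_image (not shown here) zero-pads the top-left, as the index arithmetic above requires:

import numpy as np

def box_sum_via_integral_image(image, ymin, xmin, ymax, xmax):
    # Zero-padded integral image: cumsum[i, j] == image[:i, :j].sum().
    cumsum = np.pad(image, ((1, 0), (1, 0)), mode='constant')
    cumsum = cumsum.cumsum(axis=0).cumsum(axis=1)
    # Four-corner identity, matching `i_d + i_a - i_b - i_c` above.
    return (cumsum[ymax, xmax] + cumsum[ymin, xmin]
            - cumsum[ymin, xmax] - cumsum[ymax, xmin])

image = np.arange(16.0).reshape(4, 4)
assert box_sum_via_integral_image(image, 1, 1, 3, 3) == image[1:3, 1:3].sum()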
Example #3
  def extract_labels(self, examples):
    """Extracts the pseudo labels.

    Args:
      examples: A dictionary involving image-level annotations.

    Returns:
      labels: A [batch, num_classes] tensor denoting the presence of classes.
    """
    with tf.name_scope('extend_match_extractor'):
      items = self._name2id.items()
      keys = [k for k, v in items]
      values = [v for k, v in items]
      table = tf.contrib.lookup.HashTable(
          initializer=tf.contrib.lookup.KeyValueTensorInitializer(keys, values),
          default_value=self.num_classes)  # Class ID for Out-of-Vocabulary words.
      ids = table.lookup(examples[InputDataFields.concat_caption_string])
      labels = tf.one_hot(
          indices=ids, depth=1 + self.num_classes, dtype=tf.float32)

      batch, num_tokens = utils.get_tensor_shape(
          examples[InputDataFields.concat_caption_string])
      labels = tf.cond(
          num_tokens > 0,
          true_fn=lambda: tf.reduce_max(labels, axis=1)[:, :-1],
          false_fn=lambda: tf.zeros(shape=[batch, self.num_classes]))
      return labels
Example #4
def _match_labels(class_texts, vocabulary_list):
  """Matches labels from texts.

  Args:
    class_texts: A [batch, num_tokens] string tensor.
    vocabulary_list: A list of class-name strings of length `num_classes`.

  Returns:
    A [batch, num_classes] float tensor.
  """
  keys = [class_name for class_id, class_name in enumerate(vocabulary_list)]
  values = [class_id for class_id, class_name in enumerate(vocabulary_list)]
  table = tf.contrib.lookup.HashTable(
      initializer=tf.contrib.lookup.KeyValueTensorInitializer(keys, values),
      default_value=len(
          vocabulary_list))  # Class ID for Out-of-Vocabulary words.
  ids = table.lookup(class_texts)
  labels = tf.one_hot(
      indices=ids, depth=1 + len(vocabulary_list), dtype=tf.float32)

  batch, num_tokens = utils.get_tensor_shape(class_texts)
  labels = tf.cond(
      num_tokens > 0,
      true_fn=lambda: tf.reduce_max(labels, axis=1)[:, :-1],
      false_fn=lambda: tf.zeros(shape=[batch, len(vocabulary_list)]))
  return labels
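
The one_hot/reduce_max combination above collapses per-token class ids into a multi-hot label vector; out-of-vocabulary tokens land in the extra depth slot, which the [:, :-1] slice then drops. A minimal sketch of just that step (TF 1.x assumed):

import tensorflow as tf

# Per-token class ids for 2 examples, 3 tokens each; id 3 is the OOV
# bucket of a 3-class vocabulary.
ids = tf.constant([[0, 2, 3], [3, 3, 3]], dtype=tf.int64)
labels = tf.one_hot(ids, depth=4, dtype=tf.float32)  # [2, 3, 4]
labels = tf.reduce_max(labels, axis=1)[:, :-1]       # [2, 3], OOV dropped
with tf.Session() as sess:
    print(sess.run(labels))  # [[1. 0. 1.], [0. 0. 0.]]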
Example #5
def resize_image_to_size(image,
                         new_height=600,
                         new_width=1024,
                         method=tf.image.ResizeMethod.BILINEAR,
                         align_corners=False):
    """Resizes images to the given height and width.

  Args:
    image: A 3D tensor of shape [height, width, channels]
    new_height: (optional) (scalar) desired height of the image.
    new_width: (optional) (scalar) desired width of the image.
    method: (optional) interpolation method used in resizing. Defaults to 
      BILINEAR.
    align_corners: bool. If true, exactly align all 4 corners of the input
      and output. Defaults to False.

  Returns:
    resized_image: A tensor of size [new_height, new_width, channels].
    resized_image_shape: A 1D tensor of shape [3] containing the shape of the
      resized image.
  """
    with tf.name_scope("resize_image_to_size"):
        new_image = tf.image.resize_images(image,
                                           tf.stack([new_height, new_width]),
                                           method=method,
                                           align_corners=align_corners)
        image_shape = utils.get_tensor_shape(image)
        return new_image, tf.stack([new_height, new_width, image_shape[2]])
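
A usage sketch (TF 1.x), assuming the surrounding module's utils dependency is importable; the toy input shape is made up for illustration:

import numpy as np
import tensorflow as tf

image = tf.constant(np.random.rand(300, 500, 3), dtype=tf.float32)
resized, resized_shape = resize_image_to_size(image, new_height=600,
                                              new_width=1024)
with tf.Session() as sess:
    out, shape = sess.run([resized, resized_shape])
print(out.shape, shape)  # (600, 1024, 3) [ 600 1024    3]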
Example #6
    def _extract_class_label(self, class_texts, vocabulary_list):
        """Extracts class labels.

    Args:
      class_texts: a [batch, max_num_objects] string tensor.
      vocabulary_list: a list of words of length `num_classes`.

    Returns:
      labels: a [batch, num_classes] float tensor.
    """
        with tf.name_scope('extract_class_label'):
            batch, _ = utils.get_tensor_shape(class_texts)

            categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
                key='name_to_id',
                vocabulary_list=vocabulary_list,
                num_oov_buckets=1)
            indicator_col = tf.feature_column.indicator_column(categorical_col)
            indicator = tf.feature_column.input_layer(
                {'name_to_id': class_texts}, feature_columns=[indicator_col])
            labels = tf.cast(indicator[:, :-1] > 0, tf.float32)

            # if isinstance(batch, int):
            #   labels.set_shape([batch, len(vocabulary_list)])
            # else:
            #   labels.set_shape([None, len(vocabulary_list)])

            labels.set_shape([batch, len(vocabulary_list)])

        return labels
Example #7
  def _batch_scale_box_fn(examples):
    (image, image_shape, object_boxes,
     proposal_boxes) = (examples[InputDataFields.image],
                        examples[InputDataFields.image_shape],
                        examples[InputDataFields.object_boxes],
                        examples[InputDataFields.proposals])

    _, pad_h, pad_w, _ = utils.get_tensor_shape(image)
    img_h, img_w, _ = tf.unstack(image_shape, axis=-1)

    def _scale_box(box):
      ymin, xmin, ymax, xmax = tf.unstack(box, axis=-1)
      ymin = ymin * tf.to_float(tf.expand_dims(img_h,
                                               axis=-1)) / tf.to_float(pad_h)
      xmin = xmin * tf.to_float(tf.expand_dims(img_w,
                                               axis=-1)) / tf.to_float(pad_w)
      ymax = ymax * tf.to_float(tf.expand_dims(img_h,
                                               axis=-1)) / tf.to_float(pad_h)
      xmax = xmax * tf.to_float(tf.expand_dims(img_w,
                                               axis=-1)) / tf.to_float(pad_w)
      return tf.stack([ymin, xmin, ymax, xmax], axis=-1)

    examples[InputDataFields.object_boxes] = _scale_box(object_boxes)
    examples[InputDataFields.proposals] = _scale_box(proposal_boxes)

    return examples
Example #8
    def _extract_class_label(self, num_captions, caption_strings,
                             caption_lengths, vocabulary_list):
        """Encodes labels.

    Args:
      num_captions: a [batch] int tensor, should always be ONE.
      caption_strings: a [batch, num_captions, max_caption_len] string tensor.
      caption_lengths: a [batch, num_captions] int tensor.
      vocabulary_list: a list of words of length `num_classes`.

    Returns:
      class_label: a [batch, num_classes] float tensor.
    """
        with tf.name_scope('extract_class_label'):
            batch, num_captions, max_caption_len = utils.get_tensor_shape(
                caption_strings)

            caption_string = caption_strings[:, 0, :]
            caption_length = caption_lengths[:, 0]

            categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
                key='name_to_class_id',
                vocabulary_list=vocabulary_list,
                num_oov_buckets=1)
            indicator_col = tf.feature_column.indicator_column(categorical_col)
            indicator = tf.feature_column.input_layer(
                {'name_to_class_id': caption_strings},
                feature_columns=[indicator_col])
            class_label = tf.cast(indicator[:, :-1] > 0, tf.float32)
            class_label.set_shape([batch, len(vocabulary_list)])

        return class_label
Example #9
    def _calc_anchor_scores(self,
                            class_activation_map,
                            anchors,
                            resize_height=224,
                            resize_width=224,
                            num_boxes_per_class=100):
        """Calculates class activation box based on the class activation map.

    Args:
      class_activation_map: A [batch, height, width, num_classes] float tensor.
      anchors: A [batch, number_of_anchors, 4] float tensor.

    Returns:
      anchor_scores: A [batch, number_of_anchors, num_classes] tensor.
    """
        with tf.name_scope('calc_anchor_scores'):
            class_activation_map = tf.image.resize_images(
                class_activation_map, [resize_height, resize_width])
            batch, height, width, num_classes = utils.get_tensor_shape(
                class_activation_map)
            ymin, xmin, ymax, xmax = tf.unstack(anchors, axis=-1)
            anchors_absolute = tf.stack([
                tf.to_int64(tf.round(ymin * tf.to_float(height))),
                tf.to_int64(tf.round(xmin * tf.to_float(width))),
                tf.to_int64(tf.round(ymax * tf.to_float(height))),
                tf.to_int64(tf.round(xmax * tf.to_float(width)))
            ],
                                        axis=-1)

            fn = model_utils.build_proposal_saliency_fn(func_name='wei',
                                                        border_ratio=0.2,
                                                        purity_weight=1.0)
            anchor_scores = fn(class_activation_map, anchors_absolute)
        return anchor_scores
Example #10
    def _encode_captions(self,
                         caption_strings,
                         vocabulary_list,
                         common_dimensions=300,
                         scope="coco_word_embedding",
                         is_training=False):
        """Builds caption model.

    Args:
      caption_strings: captions in the batch, a [num_captions_in_batch,
        max_caption_length] string tensor.
      vocabulary_list: words in the vocabulary, a list of python strings.
      common_dimensions: dimensions of the word embedding.
      is_training: if True, training graph is built.

    Returns:
      text_feature: embedding of each word, a [num_captions_in_batch, 
        max_caption_length, common_dimensions] tensor.
    """
        (num_captions_in_batch,
         max_caption_length) = utils.get_tensor_shape(caption_strings)

        caption_strings_flattened = tf.reshape(caption_strings, [-1])

        text_feature_flattened = self._encode_words(caption_strings_flattened,
                                                    common_dimensions,
                                                    vocabulary_list)

        text_feature = tf.reshape(
            text_feature_flattened,
            [num_captions_in_batch, max_caption_length, common_dimensions])

        return text_feature
Example #11
    def build_loss(self, predictions, examples, **kwargs):
        """Build tf graph to compute loss.

    Args:
      predictions: dict of prediction results keyed by name.
      examples: dict of inputs keyed by name.

    Returns:
      loss_dict: dict of loss tensors keyed by name.
    """
        loss_dict = {}

        with tf.name_scope('losses'):

            # Extract image-level labels.

            labels = self._extract_class_label(
                class_texts=examples[InputDataFields.caption_strings],
                vocabulary_list=self._vocabulary_list)

            # Loss of the multi-instance detection network.

            midn_logits = predictions[OICRPredictions.midn_logits]
            losses = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=labels, logits=midn_logits)
            loss_dict['midn_cross_entropy_loss'] = tf.reduce_mean(losses)

            # Losses of the online instance classifier refinement network.

            options = self._model_proto

            (num_proposals, proposals, proposal_scores_0) = (
                predictions[DetectionResultFields.num_proposals],
                predictions[DetectionResultFields.proposal_boxes],
                predictions[OICRPredictions.midn_proba_r_given_c])

            batch, max_num_proposals, _ = utils.get_tensor_shape(
                proposal_scores_0)
            proposal_scores_0 = tf.concat([
                tf.fill([batch, max_num_proposals, 1], 0.0), proposal_scores_0
            ],
                                          axis=-1)

            for i in range(options.oicr_iterations):
                proposal_scores_1 = predictions[
                    OICRPredictions.oicr_proposal_scores +
                    '_at_{}'.format(i + 1)]
                loss_dict['oicr_cross_entropy_loss_at_{}'.format(
                    i + 1)] = self._calc_oicr_loss(
                        labels,
                        num_proposals,
                        proposals,
                        proposal_scores_0,
                        proposal_scores_1,
                        scope='oicr_{}'.format(i + 1),
                        iou_threshold=options.oicr_iou_threshold)

                proposal_scores_0 = proposal_scores_1

        return loss_dict
Example #12
def gaussian_filter(inputs, ksize=3):
  """Applies Gaussian filter to the inputs.

  Args:
    inputs: input images, a [batch, height, width, channels] float tensor.
    ksize: aperture size of the Gaussian kernel.

  Returns:
    outputs: output images, a [batch, height, width, channels] float tensor.
  """
  batch, height, width, channels = utils.get_tensor_shape(inputs)

  kernel = gaussian_kernel(ksize)
  kernel = tf.reshape(tf.constant(kernel), [ksize, ksize, 1, 1])

  outputs = []
  channel_images = tf.split(inputs, num_or_size_splits=channels, axis=-1)

  for channel_image in channel_images:
    outputs.append(
        tf.nn.conv2d(
            channel_image,
            kernel, [1, 1, 1, 1],
            padding='SAME',
            data_format="NHWC",
            name="gaussian_filter"))

  return tf.concat(outputs, axis=-1)
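
The per-channel split-and-concat above applies one shared kernel to every channel; the same effect can be had in a single op with tf.nn.depthwise_conv2d by tiling the kernel across channels. A sketch under the assumption that the 2-D kernel is already a normalized [ksize, ksize] array (gaussian_kernel itself is not shown here):

import numpy as np
import tensorflow as tf

def gaussian_filter_depthwise(inputs, kernel_2d):
    # inputs: [batch, height, width, channels] with a static channel count.
    # kernel_2d: a normalized [ksize, ksize] numpy array.
    channels = inputs.get_shape().as_list()[-1]
    ksize = kernel_2d.shape[0]
    kernel = tf.reshape(tf.constant(kernel_2d, dtype=tf.float32),
                        [ksize, ksize, 1, 1])
    kernel = tf.tile(kernel, [1, 1, channels, 1])  # one copy per channel
    return tf.nn.depthwise_conv2d(inputs, kernel, [1, 1, 1, 1], padding='SAME')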
Example #13
  def _extract_text_feature(self,
                            text_strings,
                            text_lengths,
                            vocabulary_list,
                            initial_embedding=None,
                            embedding_dims=50,
                            trainable=True,
                            max_norm=None):
    """Extracts text feature.

    Args:
      text_strings: A [batch, max_text_length] string tensor.
      text_lengths: A [batch] int tensor.
      vocabulary_list: A list of words.

    Returns:
      token_ids: A [batch, max_text_length] int tensor.
      text_features: A [batch, max_text_length, embedding_dims] float tensor.
    """
    batch, max_text_length = utils.get_tensor_shape(text_strings)

    text_strings_flattened = tf.reshape(text_strings, [-1])
    token_ids_flattened, text_features_flattened = self._encode_tokens(
        text_strings_flattened, embedding_dims, vocabulary_list,
        initial_embedding, trainable)

    token_ids = tf.reshape(token_ids_flattened, [batch, max_text_length])
    text_features = tf.reshape(text_features_flattened,
                               [batch, max_text_length, embedding_dims])
    return token_ids, text_features
Example #14
  def _batch_resize_image_fn(examples):

    # Resize image, height and width denote the padding size.

    image = examples[InputDataFields.image]
    _, height, width, channels = utils.get_tensor_shape(image)

    index = tf.random_uniform([],
                              minval=0,
                              maxval=len(options.batch_resize_scale_value),
                              dtype=tf.int32)
    scale_h = scale_w = tf.gather([x for x in options.batch_resize_scale_value],
                                  index)
    new_height = tf.to_int32(tf.round(scale_h * tf.to_float(height)))
    new_width = tf.to_int32(tf.round(scale_w * tf.to_float(width)))

    new_image = tf.image.resize_images(image, tf.stack([new_height, new_width]))
    examples[InputDataFields.image] = new_image

    # Modify the image_shape, height and width denote the image size.

    image_shape = examples[InputDataFields.image_shape]
    height, width, channels = tf.unstack(image_shape, axis=-1)
    new_height = tf.to_int32(tf.round(scale_h * tf.to_float(height)))
    new_width = tf.to_int32(tf.round(scale_w * tf.to_float(width)))

    new_image_shape = tf.stack([new_height, new_width, channels], axis=-1)
    examples[InputDataFields.image_shape] = new_image_shape

    return examples
Example #15
def sample_negatives_randomly(num_captions, caption_strings, caption_lengths):
    """Samples negative examples randomly.

  Args:
    num_captions: number of captions of each example, a [batch] int tensor.
    caption_strings: caption data, a 
      [batch, max_num_captions, max_caption_length] string tensor.
    caption_lengths: length of each caption, a [batch, max_num_captions] int
      tensor.

  Returns:
    num_neg_captions: number of captions of each example, a [batch] int tensor.
    neg_caption_strings: caption data, a 
      [batch, max_num_captions, max_caption_length] string tensor.
    neg_caption_lengths: length of each caption, a [batch, max_num_captions] int
      tensor.
  """
    batch = utils.get_tensor_shape(num_captions)[0]
    # Draw offset from [1, batch - 1] so an example never pairs with itself.
    offset = tf.random_uniform([batch], minval=1, maxval=batch, dtype=tf.int32)
    index = tf.range(batch, dtype=tf.int32)
    sampled_index = tf.mod(index + offset, batch)

    return (tf.gather(num_captions, sampled_index),
            tf.gather(caption_strings,
                      sampled_index), tf.gather(caption_lengths,
                                                sampled_index))
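
The add-an-offset-then-mod trick pairs each example with another example in the same batch; with the offset drawn from [1, batch - 1], sampled_index can never equal index. A toy check (TF 1.x assumed):

import tensorflow as tf

batch = 4
offset = tf.random_uniform([batch], minval=1, maxval=batch, dtype=tf.int32)
index = tf.range(batch, dtype=tf.int32)
sampled_index = tf.mod(index + offset, batch)
with tf.Session() as sess:
    idx, sampled = sess.run([index, sampled_index])
    assert all(i != s for i, s in zip(idx, sampled))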
Example #16
  def _predict_image_score_map(self, examples):
    """Builds tf graph for .

    Args:
      examples: dict of input tensors keyed by name.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
    options = self._model_proto
    is_training = self._is_training

    image = examples[InputDataFields.image]

    # Keep image size for resizing saliency and activation map later.
    batch, height, width, channels = utils.get_tensor_shape(image)

    class_act_map_predictions = self._calc_class_act_map(examples)
    class_act_map = class_act_map_predictions[VOCPredictions.class_act_map]

    def _resize_fn(image, ksize=32):
      resized_image = tf.image.resize_images(image, [height, width])
      if ksize:
        smoothed_image = imgproc.gaussian_filter(resized_image, ksize=ksize)
      else:
        smoothed_image = resized_image

      return smoothed_image

    predictions = {
        VOCPredictionTasks.image_saliency: tf.zeros([batch, height, width, 1]),
        VOCPredictionTasks.image_score_map: _resize_fn(class_act_map),
    }

    return predictions
Example #17
  def _build_midn_network(self,
                          num_proposals,
                          proposal_features,
                          num_classes=20):
    """Builds the Multiple Instance Detection Network.

    MIDN: An attention network.

    Args:
      num_proposals: A [batch] int tensor.
      proposal_features: A [batch, max_num_proposals, features_dims] 
        float tensor.
      num_classes: Number of classes.

    Returns:
      logits: A [batch, num_classes] float tensor.
      proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
    """
    with tf.name_scope('multi_instance_detection'):

      batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
      mask = tf.sequence_mask(
          num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
      mask = tf.expand_dims(mask, axis=-1)

      # Calculates the attention score: proposal `r` given class `c`.
      #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

      logits_r_given_c = slim.fully_connected(
          proposal_features,
          num_outputs=num_classes,
          activation_fn=None,
          scope='midn/proba_r_given_c')
      logits_r_given_c = tf.multiply(mask, logits_r_given_c)
      proba_r_given_c = utils.masked_softmax(
          data=logits_r_given_c, mask=mask, dim=1)
      proba_r_given_c = tf.multiply(mask, proba_r_given_c)
      tf.summary.histogram('midn/logits_r_given_c', logits_r_given_c)

      # Calculates the weighted logits:
      #   logits_c_given_r shape = [batch, max_num_proposals, num_classes].
      #   logits shape = [batch, num_classes].

      logits_c_given_r = slim.fully_connected(
          proposal_features,
          num_outputs=num_classes,
          activation_fn=None,
          scope='midn/proba_c_given_r')
      proba_c_given_r = tf.nn.softmax(logits_c_given_r)
      proba_c_given_r = tf.multiply(mask, proba_c_given_r)
      tf.summary.histogram('midn/logits_c_given_r', logits_c_given_r)

      # Aggregates the logits.

      logits = tf.multiply(logits_c_given_r, proba_r_given_c)
      logits = tf.reduce_sum(logits, axis=1)
      tf.summary.histogram('midn/logits', logits)

    return logits, proba_r_given_c
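
The aggregation at the end is the two-branch weighted sum common in weakly supervised detection: per-class image logits are the per-proposal detection logits weighted by the attention distribution over proposals. Stripped of the proposal masking, the core computation looks like this (a sketch with made-up shapes, TF 1.x assumed):

import tensorflow as tf

proposal_features = tf.random_normal([2, 5, 8])  # [batch, proposals, dims]
logits_r_given_c = tf.layers.dense(proposal_features, 20)  # attention branch
logits_c_given_r = tf.layers.dense(proposal_features, 20)  # detection branch
proba_r_given_c = tf.nn.softmax(logits_r_given_c, axis=1)  # over proposals
image_logits = tf.reduce_sum(proba_r_given_c * logits_c_given_r, axis=1)
print(image_logits.get_shape().as_list())  # [2, 20]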
Example #18
        def _cumsum_gradient(score_map, box):
            b, n, m, c = utils.get_tensor_shape(score_map)
            _, p, _ = utils.get_tensor_shape(box)

            # Leave a border for the image border.
            ymin, xmin, ymax, xmax = tf.unstack(box, axis=-1)
            ymin, xmin = tf.maximum(ymin, 2), tf.maximum(xmin, 2)
            ymax, xmax = tf.minimum(ymax, tf.to_int64(n - 2)), tf.minimum(
                xmax, tf.to_int64(m - 2))

            box = tf.stack([ymin, xmin, ymax, xmax], axis=-1)
            box_exp = _get_expanded_box(box,
                                        img_h=n,
                                        img_w=m,
                                        border_ratio=border_ratio)

            box_list = [box, box_exp]

            area_list = [
                tf.cast(_get_box_area(b), tf.float32) for b in box_list
            ]
            cumsum = imgproc.calc_cumsum_2d(score_map,
                                            tf.concat(box_list, axis=1))
            cumsum_list = [
                cumsum[:, i * p:(i + 1) * p, :] for i in range(len(box_list))
            ]

            # The main box has to be valid, including the four shrunk boxes.
            assert_op = tf.Assert(
                tf.reduce_all(tf.greater(area_list[0], 0)),
                ["Check area of the main box failed:", area_list[0]])

            with tf.control_dependencies([assert_op]):
                border_area = area_list[1] - area_list[0]
                border_cumsum = cumsum_list[1] - cumsum_list[0]

                border_avg = tf.div(
                    border_cumsum,
                    tf.maximum(_SMALL_NUMBER,
                               tf.expand_dims(border_area, axis=-1)))
                box_avg = tf.div(
                    cumsum_list[0],
                    tf.maximum(_SMALL_NUMBER,
                               tf.expand_dims(area_list[0], axis=-1)))

                return purity_weight * box_avg - border_avg
Example #19
    def _predict_image_saliency(self, examples):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        if not options.use_saliency_score:
            raise ValueError("The flag of `use_saliency_score` should be set.")

        image = examples[InputDataFields.image]

        # Extract image feature, shape =
        #   [batch, feature_height * feature_width, common_dimensions].

        image_feature = self._encode_images(
            image,
            cnn_name=options.cnn_name,
            cnn_trainable=options.cnn_trainable,
            cnn_weight_decay=options.cnn_weight_decay,
            cnn_feature_map=options.cnn_feature_map,
            cnn_dropout_keep_prob=options.cnn_dropout_keep_prob,
            cnn_checkpoint=options.cnn_checkpoint,
            cnn_scope=GAPVariableScopes.cnn,
            is_training=is_training)

        image_feature = self._project_images(
            image_feature,
            common_dimensions=options.common_dimensions,
            scope=GAPVariableScopes.image_proj,
            hyperparams=options.image_proj_hyperparams,
            is_training=is_training)

        (batch, feature_height, feature_width,
         common_dimensions) = utils.get_tensor_shape(image_feature)
        image_feature = tf.reshape(image_feature,
                                   [batch, -1, common_dimensions])

        # Predict saliency score.
        #   image_saliency shape = [batch, num_regions].
        #   caption_saliency shape = [num_captions_in_batch, max_caption_length].

        image_saliency = self._calc_saliency_score(
            image_feature,
            scope=GAPVariableScopes.image_saliency,
            hyperparams=options.image_saliency_hyperparams,
            is_training=is_training)
        return {
            GAPPredictions.image_saliency:
            tf.reshape(image_saliency, [-1, feature_height, feature_width]),
        }
Example #20
  def encode(self, feature, length, scope=None):
    """Encodes sequence features into representation.

    Args:
      feature: A [batch, max_sequence_len, dims] float tensor.
      length: A [batch] int tensor.

    Returns:
      A [batch, dims] float tensor.
    """
    options = self._model_proto
    is_training = self._is_training

    def lstm_cell():
      cell = tf.nn.rnn_cell.BasicLSTMCell(
          num_units=options.hidden_units, forget_bias=1.0)
      if is_training:
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell,
            input_keep_prob=options.input_keep_prob,
            output_keep_prob=options.output_keep_prob,
            state_keep_prob=options.state_keep_prob)
      return cell

    rnn_cell = tf.contrib.rnn.MultiRNNCell(
        [lstm_cell() for _ in range(options.number_of_layers)])

    with tf.variable_scope(scope):
      outputs, state = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=rnn_cell,
          cell_bw=rnn_cell,
          inputs=feature,
          sequence_length=length,
          parallel_iterations=options.parallel_iterations,
          dtype=tf.float32)

      mask = tf.sequence_mask(
          length, maxlen=utils.get_tensor_shape(feature)[1], dtype=tf.float32)

      # outputs = tf.multiply(0.5, outputs[0] + outputs[1])
      # feature = utils.masked_avg_nd(data=outputs, mask=mask, dim=1)
      # return tf.squeeze(feature, axis=1)

      state_list = []
      for state_per_direction in state:
        for state_per_layer in state_per_direction:
          state_list.extend([state_per_layer.c, state_per_layer.h])

      state_final = tf.contrib.layers.fully_connected(
          inputs=tf.concat(state_list, axis=-1),
          num_outputs=options.output_units,
          activation_fn=None,
          scope='bilstm_output')

    return state_final
Example #21
    def _calc_spp_feature(self, inputs, spp_bins=[1, 2, 3, 6], max_pool=True):
        """Apply SPP layer to get the multi-resolutional feature.

    LIMITATION: the inputs has to have static shape.

    Args:
      inputs: A [batch, feature_height, feature_width, feature_dims] 
        float tensor.
      spp_bins: A python list representing the number of bins at each SPP 
        level. 

    Returns:
      spp_pool: A [batch, spp_feature_dims] fixed-length feature tensor.

    Raises:
      ValueError: If any of the parameters are invalid.
    """
        batch, height, width, _ = utils.get_tensor_shape(inputs)
        if not type(height) == type(width) == int:
            raise ValueError('The inputs should have static shape.')

        pool_fn = tf.nn.avg_pool
        if max_pool:
            pool_fn = tf.nn.max_pool

        with tf.name_scope('calc_spp_feature'):
            pool_outputs = []
            for bins in spp_bins:
                if height % bins or width % bins:
                    raise ValueError('Remainder should be ZERO.')

                pool_h, pool_w = height // bins, width // bins
                stride_h, stride_w = height // bins, width // bins
                pool = pool_fn(inputs,
                               ksize=[1, pool_h, pool_w, 1],
                               strides=[1, stride_h, stride_w, 1],
                               padding='SAME')
                tf.summary.histogram('oicr/spp_bins_{}'.format(bins), pool)
                tf.summary.scalar('oicr/spp_bins_min_{}'.format(bins),
                                  tf.reduce_min(pool))
                tf.summary.scalar('oicr/spp_bins_max_{}'.format(bins),
                                  tf.reduce_max(pool))
                tf.summary.scalar('oicr/spp_bins_avg_{}'.format(bins),
                                  tf.reduce_mean(pool))
                pool_outputs.append(tf.reshape(pool, [batch, -1]))
                tf.logging.info(
                    'SPP bins=%i, bin_size=(%i,%i), strides=(%i, %i), output=%s',
                    bins, pool_h, pool_w, stride_h, stride_w,
                    pool.get_shape().as_list())
            spp_pool = tf.concat(pool_outputs, axis=-1)
            tf.logging.info('Final SPP shape=%s',
                            spp_pool.get_shape().as_list())

        return spp_pool
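
For a concrete shape check: a 6 x 6 feature map with bins [1, 2, 3, 6] produces 1 + 4 + 9 + 36 = 50 spatial cells per channel, so spp_feature_dims is 50 * feature_dims. A minimal reproduction of the pooling arithmetic (TF 1.x assumed, toy input):

import numpy as np
import tensorflow as tf

inputs = tf.constant(np.random.rand(2, 6, 6, 16), dtype=tf.float32)
pools = []
for bins in [1, 2, 3, 6]:
    k = 6 // bins  # kernel and stride so that each bin covers k x k cells
    pool = tf.nn.max_pool(inputs, [1, k, k, 1], [1, k, k, 1], padding='SAME')
    pools.append(tf.reshape(pool, [2, -1]))
spp_pool = tf.concat(pools, axis=-1)
print(spp_pool.get_shape().as_list())  # [2, 800] == [2, 50 * 16]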
Example #22
def gather_in_batch_captions(image_id, num_captions, caption_strings,
                             caption_lengths):
    """Gathers all of the in-batch captions into a caption batch.

  Args:
    image_id: image_id, a [batch] int64 tensor.
    num_captions: number of captions of each example, a [batch] int tensor.
    caption_strings: caption data, a [batch, max_num_captions, 
      max_caption_length] string tensor.
    caption_lengths: length of each caption, a [batch, max_num_captions] int
      tensor.

  Returns:
    image_ids_gathered: associated image_id of each caption in the new batch, a
      [num_captions_in_batch] int tensor.
    caption_strings_gathered: caption data, a [num_captions_in_batch,
      max_caption_length] string tensor.
    caption_lengths_gathered: length of each caption, a [num_captions_in_batch]
      int tensor.
  """
    if not image_id.dtype in [tf.int32, tf.int64]:
        raise ValueError('The image_id has to be int32 or int64')

    (batch, max_num_captions,
     max_caption_length) = utils.get_tensor_shape(caption_strings)

    # caption_mask denotes the validity of each caption in the flattened batch.
    # caption_mask shape = [batch * max_num_captions],

    caption_mask = tf.sequence_mask(num_captions,
                                    maxlen=max_num_captions,
                                    dtype=tf.bool)
    caption_mask = tf.reshape(caption_mask, [-1])

    # image_id shape = [batch, max_num_captions].

    image_id = tf.tile(tf.expand_dims(image_id, axis=1), [1, max_num_captions])

    # Reshape the tensors so that their first dimension is batch * max_num_captions.

    image_id_reshaped = tf.reshape(image_id, [-1])
    caption_strings_reshaped = tf.reshape(caption_strings,
                                          [-1, max_caption_length])
    caption_lengths_reshaped = tf.reshape(caption_lengths, [-1])

    # Apply the caption_mask.

    image_ids_gathered = tf.boolean_mask(image_id_reshaped, caption_mask)
    caption_strings_gathered = tf.boolean_mask(caption_strings_reshaped,
                                               caption_mask)
    caption_lengths_gathered = tf.boolean_mask(caption_lengths_reshaped,
                                               caption_mask)

    return image_ids_gathered, caption_strings_gathered, caption_lengths_gathered
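
The sequence_mask / flatten / boolean_mask pattern above is a generic way to drop padded rows from a ragged batch. A toy version of the same pattern (TF 1.x assumed):

import tensorflow as tf

num_valid = tf.constant([2, 1])                 # captions per image
data = tf.constant([[1, 2, 0], [3, 0, 0]])      # padded [batch, max_num]
mask = tf.reshape(tf.sequence_mask(num_valid, maxlen=3), [-1])
flat = tf.reshape(data, [-1])
with tf.Session() as sess:
    print(sess.run(tf.boolean_mask(flat, mask)))  # [1 2 3]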
Example #23
def draw_rectangles(image,
                    boxes,
                    scores=None,
                    labels=None,
                    color=GREEN,
                    thickness=1,
                    fontscale=1.0):
  """Draws rectangle to the image.

  Args:
    image: a [batch, height, width, 3] uint8 tensor.
    boxes: a [batch, num_boxes, 4] float tensor representing normalized boxes,
      i.e.: [ymin, xmin, ymax, xmax], values are ranging from 0.0 to 1.0.
    scores: a [batch, num_boxes] float tensor representing the scores to be
      drawn on the image.
    labels: a [batch, num_boxes] string or float tensor representing the labels
      to be drawn on the image.
    color: color to be used.
    thickness: the line thickness.
    fontscale: size of the font.

  Returns:
    canvas: a [batch, height, width, 3] uint8 tensor with information drawn.
  """

  def _draw_fn(inputs):
    """Draws the box on the image.

    Args:
      image: a [height, width, 3] float tensor.
      box: a [num_boxes, 4] float tensor representing [ymin, xmin, ymax, xmax].
      score: a [num_boxes] float tensor representing box scores.
      label: a [num_boxes] string tensor denoting the text to be drawn.

    Returns:
      canvas: a [height, width, 3] float tensor with box drawn.
    """
    image, boxes, scores, labels = inputs
    canvas = tf.py_func(
        func=lambda x, y, z, w: _py_draw_rectangles(x, y, z, w, color=color, thickness=thickness, fontscale=fontscale),
        inp=[image, boxes, scores, labels], Tout=tf.uint8)
    canvas.set_shape(tf.TensorShape([None, None, 3]))
    return canvas

  batch, num_boxes, _ = utils.get_tensor_shape(boxes)
  if scores is None:
    scores = tf.constant(-9999.0, shape=[batch, num_boxes], dtype=tf.float32)
  if labels is None:
    labels = tf.constant("", shape=[batch, num_boxes], dtype=tf.string)

  return tf.map_fn(
      _draw_fn, elems=[image, boxes, scores, labels], dtype=tf.uint8)
Example #24
  def encode(self, feature, length, scope=None):
    """Encodes sequence features into representation.

    Args:
      feature: A [batch, max_sequence_len, dims] float tensor.
      length: A [batch] int tensor.

    Returns:
      A [batch, dims] float tensor.
    """
    with tf.name_scope('avg_pooling_encoder'):
      mask = tf.sequence_mask(
          length, maxlen=utils.get_tensor_shape(feature)[-2], dtype=tf.float32)
      feature = utils.masked_avg_nd(data=feature, mask=mask, dim=1)
      return tf.squeeze(feature, axis=1)
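
utils.masked_avg_nd is not shown here; a plausible equivalent averages only over valid timesteps by zeroing padded positions and dividing by the mask sum. A sketch under that assumption (the helper is hypothetical):

import tensorflow as tf

def masked_avg(data, mask):
    """data: [batch, max_len, dims] floats; mask: [batch, max_len] in {0, 1}."""
    mask = tf.expand_dims(mask, axis=-1)
    total = tf.reduce_sum(data * mask, axis=1)             # [batch, dims]
    count = tf.maximum(tf.reduce_sum(mask, axis=1), 1e-8)  # avoid div by zero
    return total / count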
Example #25
    def _calc_vgg_proposal_feature(self, image_feature_cropped):
        """Calculates proposal feature using vgg fc layers.

    Args:
      image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
        float tensor.

    Returns:
      proposal_feature: A [batch, proposal_feature_dims] float tensor.
    """
        options = self._model_proto
        is_training = self._is_training

        # SPP.
        bins = 7
        batch, height, width, _ = utils.get_tensor_shape(image_feature_cropped)
        if height % bins or width % bins:
            raise ValueError('Remainder should be ZERO.')

        pool_h, pool_w = height // bins, width // bins
        stride_h, stride_w = height // bins, width // bins
        net = tf.nn.max_pool(image_feature_cropped,
                             ksize=[1, pool_h, pool_w, 1],
                             strides=[1, stride_h, stride_w, 1],
                             padding='SAME')

        with tf.variable_scope(options.cnn.scope, reuse=True):
            with tf.variable_scope(options.cnn.name, reuse=True):

                net = slim.conv2d(net,
                                  4096, [7, 7],
                                  padding='VALID',
                                  scope='fc6')
                net = slim.dropout(net,
                                   options.cnn.dropout_keep_prob,
                                   is_training=is_training
                                   and options.cnn.trainable,
                                   scope='dropout6')
                net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
                net = slim.dropout(net,
                                   options.hidden_dropout_keep_prob,
                                   is_training=is_training,
                                   scope='dropout7')
                net = tf.squeeze(net, [1, 2], name='fc8/squeezed')

        return net
Example #26
  def _encode_labels(self,
                     num_captions,
                     caption_strings,
                     caption_lengths,
                     vocabulary_list,
                     is_training=False):
    """Encodes labels.

    Args:
      num_captions: a [batch] int tensor.
      caption_strings: a [batch, num_captions, max_caption_len] string tensor.
      caption_lengths: a [batch, num_captions] int tensor.
      vocabulary_list: a list of words of length `num_classes`.
      is_training: if True, training graph is built.

    Returns:
      classes: a [batch, num_classes] int tensor.
    """
    with tf.name_scope('encode_labels'):
      batch, num_captions, max_caption_len = utils.get_tensor_shape(
          caption_strings)

      caption_string = caption_strings[:, 0, :]
      caption_length = caption_lengths[:, 0]

      categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
          key='name_to_class_id',
          vocabulary_list=vocabulary_list,
          num_oov_buckets=1)
      indicator_col = tf.feature_column.indicator_column(categorical_col)
      indicator = tf.feature_column.input_layer(
          {
              'name_to_class_id': caption_strings
          },
          feature_columns=[indicator_col])
      classes = tf.cast(indicator[:, :-1] > 0, tf.int64)
      tf.summary.histogram('num_gt_boxes_per_image', caption_length)
      tf.summary.histogram('num_gt_labels_per_image',
                           tf.reduce_sum(classes, axis=-1))
      classes.set_shape([batch, len(vocabulary_list)])

    return classes
Example #27
def _average_encoding(sequence_feature, sequence_length):
  """Encodes sequence using Average pooling.

  Args:
    sequence_feature: a [batch_sequence, max_sequence_length, feature_dimensions]
      float tensor.
    sequence_length: a [batch_sequence] int tensor.

  Returns:
    sequence_emb: A [batch_sequence, common_dimensions] float tensor, 
      representing the embedding vectors.
  """
  (_, max_sequence_length, _) = utils.get_tensor_shape(sequence_feature)

  mask = tf.sequence_mask(
      sequence_length, maxlen=max_sequence_length, dtype=tf.float32)

  sequence_emb = utils.masked_avg_nd(sequence_feature, mask, dim=1)
  sequence_emb = tf.squeeze(sequence_emb, axis=1)
  return sequence_emb
Example #28
    def _midn_loss_mine_hardest_negative(self, labels, losses):
        """Hardest negative mining of the MIDN loss.

    Args:
      labels: A [batch, num_classes] float tensor, where `1` denotes the 
        presence of a class.
      losses: A [batch, num_classes] float tensor, the losses predicted by  
        the model.

    Returns:
      mask: A [batch, num_classes] float tensor where `1` denotes the 
        selected entry.
    """
        batch, num_classes = utils.get_tensor_shape(labels)
        indices_0 = tf.range(batch, dtype=tf.int64)
        indices_1 = utils.masked_argmax(data=losses, mask=1.0 - labels, dim=1)
        indices = tf.stack([indices_0, indices_1], axis=-1)
        negative_masks = tf.sparse_to_dense(indices, [batch, num_classes],
                                            sparse_values=1.0)
        return tf.add(labels, negative_masks)
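
The returned mask keeps every positive class plus, per row, the single negative class with the largest loss. The same selection can be written with tf.scatter_nd instead of tf.sparse_to_dense (which requires sorted indices); a sketch with a concrete batch, assuming utils.masked_argmax behaves like an argmax restricted to entries where mask == 1:

import tensorflow as tf

losses = tf.constant([[0.2, 0.9, 0.1], [0.5, 0.4, 0.8]])
labels = tf.constant([[1., 0., 0.], [0., 0., 1.]])
# Hardest negative per row: argmax of loss over classes with label == 0.
neg_losses = losses * (1.0 - labels) - 1e9 * labels
hardest = tf.argmax(neg_losses, axis=1, output_type=tf.int32)  # -> [1, 0]
indices = tf.stack([tf.range(2), hardest], axis=-1)
mask = labels + tf.scatter_nd(indices, tf.ones([2]), [2, 3])
# `mask` selects all positives plus one hardest negative per row.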
Example #29
def _calc_graph_node_scores(node,
                            hidden_layers=None,
                            hidden_units=None,
                            dropout_keep_prob=1.0,
                            is_training=False,
                            scope='calc_graph_node_scores'):
    """Calculates the node scores [from node to node].

  Args:
    node: A [batch, max_num_node, dims] float tensor.
    hidden_layers: An integer denoting number of MLP layers.
    hidden_units: An integer denoting MLP hidden units.
    dropout_keep_prob: Keep probability of the dropout layers.
    is_training: If true, build the training graph.
    scope: Variable scope name.

  Returns:
    A [batch, max_num_node] float tensor.
  """
    with tf.variable_scope(scope):
        batch = utils.get_tensor_shape(node)[0]

        # Concatenate the node features, inputs = [node].

        hiddens = node
        for layer_i in range(hidden_layers):
            hiddens = tf.contrib.layers.fully_connected(
                inputs=hiddens,
                num_outputs=hidden_units,
                activation_fn=tf.nn.relu,
                scope='hidden_{}'.format(layer_i))
            hiddens = slim.dropout(hiddens,
                                   dropout_keep_prob,
                                   is_training=is_training)

        outputs = tf.contrib.layers.fully_connected(inputs=hiddens,
                                                    num_outputs=1,
                                                    activation_fn=None,
                                                    scope='output')
        outputs = tf.squeeze(outputs, axis=-1)
    return outputs
Example #30
    def visualize(self,
                  image,
                  saliency,
                  interpolation=tf.image.ResizeMethod.NEAREST_NEIGHBOR):
        """Visualizes images to tensorboard.

    Args:
      image: a [batch, height, width, channels] float tensor, in [0, 255].
      saliency: a [batch, feature_height, feature_width] float tensor.
    """
        (batch, height, width, channels) = utils.get_tensor_shape(image)

        image = image / 255.0
        heatmap = plotlib.convert_to_heatmap(saliency, normalize=True)
        heatmap = tf.image.resize_images(heatmap, [height, width],
                                         interpolation)

        heatmap = plotlib.gaussian_filter(heatmap, ksize=32)

        image = tf.maximum(0.0, tf.concat([image, heatmap], axis=2))
        tf.summary.image("images", image, max_outputs=10)