Example 1

# Shared imports for all four examples. The exact module paths below are
# assumptions based on the open-source DELF/DELG codebase and the TF
# object_detection library; adjust them to match your checkout.
import tensorflow as tf

from delf import feature_extractor
from delf.python.datasets.google_landmarks_dataset import googlelandmarks as gld
from object_detection.core import box_list
from object_detection.core import box_list_ops

def ExtractLocalFeatures(image, image_scales, max_feature_num, abs_thres, iou,
                         attention_model_fn, stride_factor):
    """Extract local features for input image.

  Args:
    image: image tensor of type tf.uint8 with shape [h, w, channels].
    image_scales: 1D float tensor which contains float scales used for image
      pyramid construction.
    max_feature_num: int tensor denotes the maximum selected feature points.
    abs_thres: float tensor denotes the score threshold for feature selection.
    iou: float scalar denotes the iou threshold for NMS.
    attention_model_fn: model function. Follows the signature:
      * Args:
        * `images`: Image tensor which is re-scaled.
      * Returns:
        * `attention_prob`: attention map after the non-linearity.
        * `feature_map`: feature map after ResNet convolution.
    stride_factor: integer accounting for striding after block3.

  Returns:
    boxes: [N, 4] float tensor which denotes the selected receptive box. N is
      the number of final feature points which pass through keypoint selection
      and NMS steps.
    features: [N, depth] float tensor.
    feature_scales: [N] float tensor. It is the inverse of the input image
      scales such that larger image scales correspond to larger image regions,
      which is compatible with keypoints detected with other techniques, for
      example Congas.
    scores: [N, 1] float tensor denotes the attention score.

  """
    original_image_shape_float = tf.gather(
        tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1])

    image_tensor = gld.NormalizeImages(image,
                                       pixel_value_offset=128.0,
                                       pixel_value_scale=128.0)
    image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')

    # Hard code the feature depth and receptive field parameters for now.
    rf, stride, padding = [291.0, 16.0 * stride_factor, 145.0]
    feature_depth = 1024

    def _ProcessSingleScale(scale_index, boxes, features, scales, scores):
        """Resizes the image and run feature extraction and keypoint selection.

       This function will be passed into tf.while_loop() and be called
       repeatedly. The input boxes are collected from the previous iteration
       [0: scale_index -1]. We get the current scale by
       image_scales[scale_index], and run resize image, feature extraction and
       keypoint selection. Then we will get a new set of selected_boxes for
       current scale. In the end, we concat the previous boxes with current
       selected_boxes as the output.
    Args:
      scale_index: A valid index in the image_scales.
      boxes: Box tensor with the shape of [N, 4].
      features: Feature tensor with the shape of [N, depth].
      scales: Scale tensor with the shape of [N].
      scores: Attention score tensor with the shape of [N].

    Returns:
      scale_index: The next scale index for processing.
      boxes: Concatenated box tensor with the shape of [K, 4]. K >= N.
      features: Concatenated feature tensor with the shape of [K, depth].
      scales: Concatenated scale tensor with the shape of [K].
      scores: Concatenated score tensor with the shape of [K].
    """
        scale = tf.gather(image_scales, scale_index)
        new_image_size = tf.dtypes.cast(
            tf.round(original_image_shape_float * scale), tf.int32)
        resized_image = tf.image.resize(image_tensor, new_image_size)

        attention_prob, feature_map = attention_model_fn(resized_image)
        attention_prob = tf.squeeze(attention_prob, axis=[0])
        feature_map = tf.squeeze(feature_map, axis=[0])

        rf_boxes = feature_extractor.CalculateReceptiveBoxes(
            tf.shape(feature_map)[0],
            tf.shape(feature_map)[1], rf, stride, padding)

        # Re-project back to the original image space.
        rf_boxes = tf.divide(rf_boxes, scale)
        attention_prob = tf.reshape(attention_prob, [-1])
        feature_map = tf.reshape(feature_map, [-1, feature_depth])

        # Use attention score to select feature vectors.
        indices = tf.reshape(tf.where(attention_prob >= abs_thres), [-1])
        selected_boxes = tf.gather(rf_boxes, indices)
        selected_features = tf.gather(feature_map, indices)
        selected_scores = tf.gather(attention_prob, indices)
        selected_scales = tf.ones_like(selected_scores, tf.float32) / scale

        # Concat with the previous result from different scales.
        boxes = tf.concat([boxes, selected_boxes], 0)
        features = tf.concat([features, selected_features], 0)
        scales = tf.concat([scales, selected_scales], 0)
        scores = tf.concat([scores, selected_scores], 0)

        return scale_index + 1, boxes, features, scales, scores

    output_boxes = tf.zeros([0, 4], dtype=tf.float32)
    output_features = tf.zeros([0, feature_depth], dtype=tf.float32)
    output_scales = tf.zeros([0], dtype=tf.float32)
    output_scores = tf.zeros([0], dtype=tf.float32)

    # Process the first scale separately; the following scales will reuse
    # the graph variables.
    (_, output_boxes, output_features, output_scales,
     output_scores) = _ProcessSingleScale(0, output_boxes, output_features,
                                          output_scales, output_scores)

    i = tf.constant(1, dtype=tf.int32)
    num_scales = tf.shape(image_scales)[0]
    keep_going = lambda j, b, f, scales, scores: tf.less(j, num_scales)

    (_, output_boxes, output_features, output_scales,
     output_scores) = tf.nest.map_structure(
         tf.stop_gradient,
         tf.while_loop(cond=keep_going,
                       body=_ProcessSingleScale,
                       loop_vars=[
                           i, output_boxes, output_features, output_scales,
                           output_scores
                       ],
                       shape_invariants=[
                           i.get_shape(),
                           tf.TensorShape([None, 4]),
                           tf.TensorShape([None, feature_depth]),
                           tf.TensorShape([None]),
                           tf.TensorShape([None])
                       ]))

    feature_boxes = box_list.BoxList(output_boxes)
    feature_boxes.add_field('features', output_features)
    feature_boxes.add_field('scales', output_scales)
    feature_boxes.add_field('scores', output_scores)

    nms_max_boxes = tf.minimum(max_feature_num, feature_boxes.num_boxes())
    final_boxes = box_list_ops.non_max_suppression(feature_boxes, iou,
                                                   nms_max_boxes)

    return final_boxes.get(), final_boxes.get_field(
        'features'), final_boxes.get_field('scales'), tf.expand_dims(
            final_boxes.get_field('scores'), 1)
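
A minimal usage sketch follows. It is not part of the original example: `attention_model` is a hypothetical stand-in that returns a fixed-size attention map and feature map, just to satisfy the documented signature.

def attention_model(images):
    # Toy stand-in for a trained DELF attention network: emits a constant
    # 10x10 attention map and a matching 1024-deep feature map.
    batch = tf.shape(images)[0]
    attention_prob = tf.ones([batch, 10, 10, 1], dtype=tf.float32)
    feature_map = tf.zeros([batch, 10, 10, 1024], dtype=tf.float32)
    return attention_prob, feature_map

image = tf.zeros([480, 640, 3], dtype=tf.uint8)
boxes, features, feature_scales, scores = ExtractLocalFeatures(
    image,
    image_scales=tf.constant([0.7071, 1.0, 1.4142]),
    max_feature_num=1000,
    abs_thres=0.1,
    iou=0.9,
    attention_model_fn=attention_model,
    stride_factor=1)
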
Example 2
def ExtractLocalAndGlobalFeatures(image, image_scales, max_feature_num,
                                  abs_thres, global_scales_ind, iou, model_fn,
                                  stride_factor):
    """Extract local+global features for input image.

  Args:
    image: image tensor of type tf.uint8 with shape [h, w, channels].
    image_scales: 1D float tensor which contains float scales used for image
      pyramid construction.
    max_feature_num: int tensor denoting the maximum selected feature points.
    abs_thres: float tensor denoting the score threshold for feature selection.
    global_scales_ind: Global feature extraction happens only for a subset of
      `image_scales`, those with corresponding indices from this tensor.
    iou: float scalar denoting the iou threshold for NMS.
    model_fn: model function. Follows the signature:
      * Args:
        * `images`: Batched image tensor.
      * Returns:
        * `global_descriptors`: Global descriptors for input images.
        * `attention_prob`: Attention map after the non-linearity.
        * `feature_map`: Feature map after ResNet convolution.
    stride_factor: integer accounting for striding after block3.

  Returns:
    boxes: [N, 4] float tensor which denotes the selected receptive boxes. N is
      the number of final feature points which pass through keypoint selection
      and NMS steps.
    local_descriptors: [N, depth] float tensor.
    feature_scales: [N] float tensor. It is the inverse of the input image
      scales such that larger image scales correspond to larger image regions,
      which is compatible with keypoints detected with other techniques, for
      example Congas.
    scores: [N, 1] float tensor denoting the attention score.
    global_descriptors: [S, D] float tensor, with the global descriptors for
      each scale; S is the number of scales, and D the global descriptor
      dimensionality.
  """
    original_image_shape_float = tf.gather(
        tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1])
    # Normalize assuming uint8 input in [0, 255], matching the other examples.
    # (Using tf.math.reduce_max(image) here would produce a uint8 constant and
    # fail the float normalization.)
    image_tensor = gld.NormalizeImages(image,
                                       pixel_value_offset=128.0,
                                       pixel_value_scale=128.0)
    image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')

    # Hard code the receptive field parameters for now.
    # We need to revisit this once we change the architecture and selected
    # convolutional blocks to use as local features.
    rf, stride, padding = [291.0, 16.0 * stride_factor, 145.0]

    def _ResizeAndExtract(scale_index):
        """Helper function to resize image then extract features.

    Args:
      scale_index: A valid index in image_scales.

    Returns:
      global_descriptor: [1,D] tensor denoting the extracted global descriptor.
      boxes: Box tensor with the shape of [K, 4].
      local_descriptors: Local descriptor tensor with the shape of [K, depth].
      scales: Scale tensor with the shape of [K].
      scores: Score tensor with the shape of [K].
    """
        scale = tf.gather(image_scales, scale_index)
        new_image_size = tf.dtypes.cast(
            tf.round(original_image_shape_float * scale), tf.int32)
        resized_image = tf.image.resize(image_tensor, new_image_size)
        global_descriptor, attention_prob, feature_map = model_fn(
            resized_image)

        attention_prob = tf.squeeze(attention_prob, axis=[0])
        feature_map = tf.squeeze(feature_map, axis=[0])

        # Compute RF boxes and re-project them to the original image space.
        rf_boxes = feature_extractor.CalculateReceptiveBoxes(
            tf.shape(feature_map)[0],
            tf.shape(feature_map)[1], rf, stride, padding)
        rf_boxes = tf.divide(rf_boxes, scale)

        attention_prob = tf.reshape(attention_prob, [-1])
        feature_map = tf.reshape(feature_map, [-1, tf.shape(feature_map)[2]])

        # Use attention score to select local features.
        indices = tf.reshape(tf.where(attention_prob >= abs_thres), [-1])
        boxes = tf.gather(rf_boxes, indices)
        local_descriptors = tf.gather(feature_map, indices)
        scores = tf.gather(attention_prob, indices)
        scales = tf.ones_like(scores, tf.float32) / scale

        return global_descriptor, boxes, local_descriptors, scales, scores

    # TODO(andrearaujo): Currently, a global feature is extracted even for
    # scales which do not use it. The obtained result is correct, but feature
    # extraction is slower than expected. We should try to fix this in the
    # future.

    # Run first scale.
    (output_global_descriptors, output_boxes, output_local_descriptors,
     output_scales, output_scores) = _ResizeAndExtract(0)
    if not tf.reduce_any(tf.equal(global_scales_ind, 0)):
        # If the global descriptor does not use the first scale, clear it out.
        output_global_descriptors = tf.zeros(
            [0, tf.shape(output_global_descriptors)[1]])

    # Loop over subsequent scales.
    num_scales = tf.shape(image_scales)[0]
    for scale_index in tf.range(1, num_scales):
        # Allow an undefined number of global feature scales to be extracted.
        tf.autograph.experimental.set_loop_options(
            shape_invariants=[(output_global_descriptors,
                               tf.TensorShape([None, None]))])

        (global_descriptor, boxes, local_descriptors, scales,
         scores) = _ResizeAndExtract(scale_index)
        output_boxes = tf.concat([output_boxes, boxes], 0)
        output_local_descriptors = tf.concat(
            [output_local_descriptors, local_descriptors], 0)
        output_scales = tf.concat([output_scales, scales], 0)
        output_scores = tf.concat([output_scores, scores], 0)
        if tf.reduce_any(tf.equal(global_scales_ind, scale_index)):
            output_global_descriptors = tf.concat(
                [output_global_descriptors, global_descriptor], 0)

    feature_boxes = box_list.BoxList(output_boxes)
    feature_boxes.add_field('local_descriptors', output_local_descriptors)
    feature_boxes.add_field('scales', output_scales)
    feature_boxes.add_field('scores', output_scores)

    nms_max_boxes = tf.minimum(max_feature_num, feature_boxes.num_boxes())
    final_boxes = box_list_ops.non_max_suppression(feature_boxes, iou,
                                                   nms_max_boxes)

    return (final_boxes.get(), final_boxes.get_field('local_descriptors'),
            final_boxes.get_field('scales'),
            tf.expand_dims(final_boxes.get_field('scores'),
                           1), output_global_descriptors)
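
A minimal usage sketch follows, again with a hypothetical stand-in model. The call is wrapped in tf.function because the tf.range loop with tf.autograph.experimental.set_loop_options only works inside an AutoGraph-converted function.

def toy_delg_model(images):
    # Toy stand-in for a trained DELG model: constant global descriptor,
    # attention map, and feature map of fixed spatial size.
    batch = tf.shape(images)[0]
    global_descriptors = tf.zeros([batch, 128], dtype=tf.float32)
    attention_prob = tf.ones([batch, 10, 10, 1], dtype=tf.float32)
    feature_map = tf.zeros([batch, 10, 10, 1024], dtype=tf.float32)
    return global_descriptors, attention_prob, feature_map

@tf.function
def extract_local_and_global(image):
    return ExtractLocalAndGlobalFeatures(
        image,
        image_scales=tf.constant([0.5, 1.0, 2.0]),
        max_feature_num=500,
        abs_thres=0.1,
        global_scales_ind=tf.constant([1, 2]),  # globals only at 1.0x, 2.0x
        iou=0.9,
        model_fn=toy_delg_model,
        stride_factor=1)

outputs = extract_local_and_global(tf.zeros([480, 640, 3], dtype=tf.uint8))
boxes, local_descriptors, feature_scales, scores, global_descriptors = outputs
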
Example 3
def ExtractGlobalFeatures(image,
                          image_scales,
                          model_fn,
                          multi_scale_pool_type='None',
                          normalize_global_descriptor=False):
    """Extract global features for input image.

  Args:
    image: image tensor of type tf.uint8 with shape [h, w, channels].
    image_scales: 1D float tensor which contains float scales used for image
      pyramid construction.
    model_fn: model function. Follows the signature:
      * Args:
        * `images`: Image tensor which is re-scaled.
      * Returns:
        * `global_descriptors`: Global descriptors for input images.
    multi_scale_pool_type: If set, the global descriptor of each scale is pooled
      and a 1D global descriptor is returned.
    normalize_global_descriptor: If True, output global descriptors are
      L2-normalized.

  Returns:
    global_descriptors: If `multi_scale_pool_type` is 'None', returns a [S, D]
      float tensor. S is the number of scales, and D the global descriptor
      dimensionality. Each D-dimensional entry is a global descriptor, which may
      be L2-normalized depending on `normalize_global_descriptor`. If
      `multi_scale_pool_type` is not 'None', returns a [D] float tensor with the
      pooled global descriptor.

  """
    original_image_shape_float = tf.gather(
        tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1])

    image_tensor = gld.NormalizeImages(image,
                                       pixel_value_offset=128.0,
                                       pixel_value_scale=128.0)
    image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')

    def _ProcessSingleScale(scale_index, global_descriptors=None):
        """Resizes the image and runs feature extraction.

       This function will be passed into tf.while_loop() and be called
       repeatedly. We get the current scale by image_scales[scale_index], and
       run image resizing / feature extraction. In the end, we concat the
       previous global descriptors with current descriptor as the output.

    Args:
      scale_index: A valid index in image_scales.
      global_descriptors: Global descriptor tensor with the shape of [S, D]. If
        None, no previous global descriptors are used, and the output will be of
        shape [1, D].

    Returns:
      scale_index: The next scale index for processing.
      global_descriptors: A concatenated global descriptor tensor with the shape
        of [S+1, D].
    """
        scale = tf.gather(image_scales, scale_index)
        new_image_size = tf.dtypes.cast(
            tf.round(original_image_shape_float * scale), tf.int32)
        resized_image = tf.image.resize(image_tensor, new_image_size)

        global_descriptor = model_fn(resized_image)
        if global_descriptors is None:
            global_descriptors = global_descriptor
        else:
            global_descriptors = tf.concat(
                [global_descriptors, global_descriptor], 0)

        return scale_index + 1, global_descriptors

    # Process the first scale separately; the following scales will reuse
    # the graph variables.
    (_, output_global) = _ProcessSingleScale(0)

    i = tf.constant(1, dtype=tf.int32)
    num_scales = tf.shape(image_scales)[0]
    keep_going = lambda j, g: tf.less(j, num_scales)

    (_, output_global) = tf.nest.map_structure(
        tf.stop_gradient,
        tf.while_loop(
            cond=keep_going,
            body=_ProcessSingleScale,
            loop_vars=[i, output_global],
            shape_invariants=[i.get_shape(),
                              tf.TensorShape([None, None])]))

    normalization_axis = 1
    if multi_scale_pool_type == 'average':
        output_global = tf.reduce_mean(output_global,
                                       axis=0,
                                       keepdims=False,
                                       name='multi_scale_average_pooling')
        normalization_axis = 0
    elif multi_scale_pool_type == 'sum':
        output_global = tf.reduce_sum(output_global,
                                      axis=0,
                                      keepdims=False,
                                      name='multi_scale_sum_pooling')
        normalization_axis = 0

    if normalize_global_descriptor:
        output_global = tf.nn.l2_normalize(output_global,
                                           axis=normalization_axis,
                                           name='l2_normalization')

    return output_global
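
A minimal usage sketch follows; `toy_global_model` is a hypothetical stand-in producing fixed 128-D descriptors. This variant runs eagerly, since the scale loop is an explicit tf.while_loop.

def toy_global_model(images):
    # Toy stand-in for a trained global-descriptor model.
    return tf.zeros([tf.shape(images)[0], 128], dtype=tf.float32)

descriptor = ExtractGlobalFeatures(
    tf.zeros([480, 640, 3], dtype=tf.uint8),
    image_scales=tf.constant([0.5, 1.0, 2.0]),
    model_fn=toy_global_model,
    multi_scale_pool_type='average',
    normalize_global_descriptor=True)
# descriptor has shape [128]: the scale-averaged, L2-normalized descriptor.
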
Example 4
def ExtractGlobalFeatures(image,
                          image_scales,
                          global_scales_ind,
                          model_fn,
                          multi_scale_pool_type='None',
                          normalize_global_descriptor=False):
    """Extract global features for input image.

  Args:
    image: image tensor of type tf.uint8 with shape [h, w, channels].
    image_scales: 1D float tensor which contains float scales used for image
      pyramid construction.
    global_scales_ind: Feature extraction happens only for a subset of
      `image_scales`, those with corresponding indices from this tensor.
    model_fn: model function. Follows the signature:
      * Args:
        * `images`: Batched image tensor.
      * Returns:
        * `global_descriptors`: Global descriptors for input images.
    multi_scale_pool_type: If set, the global descriptor of each scale is pooled
      and a 1D global descriptor is returned.
    normalize_global_descriptor: If True, output global descriptors are
      L2-normalized.

  Returns:
    global_descriptors: If `multi_scale_pool_type` is 'None', returns a [S, D]
      float tensor. S is the number of scales, and D the global descriptor
      dimensionality. Each D-dimensional entry is a global descriptor, which may
      be L2-normalized depending on `normalize_global_descriptor`. If
      `multi_scale_pool_type` is not 'None', returns a [D] float tensor with the
      pooled global descriptor.

  """
    original_image_shape_float = tf.gather(
        tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1])
    # Normalize assuming uint8 input in [0, 255], matching the other examples.
    # (Using tf.math.reduce_max(image) here would produce a uint8 constant and
    # fail the float normalization.)
    image_tensor = gld.NormalizeImages(image,
                                       pixel_value_offset=128.0,
                                       pixel_value_scale=128.0)
    image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')

    def _ResizeAndExtract(scale_index):
        """Helper function to resize image then extract global feature.

    Args:
      scale_index: A valid index in image_scales.

    Returns:
      global_descriptor: [1,D] tensor denoting the extracted global descriptor.
    """
        scale = tf.gather(image_scales, scale_index)
        new_image_size = tf.dtypes.cast(
            tf.round(original_image_shape_float * scale), tf.int32)
        resized_image = tf.image.resize(image_tensor, new_image_size)
        global_descriptor = model_fn(resized_image)
        return global_descriptor

    # First loop: find the initial scale to be used.
    num_scales = tf.shape(image_scales)[0]
    initial_scale_index = tf.constant(-1, dtype=tf.int32)
    for scale_index in tf.range(num_scales):
        if tf.reduce_any(tf.equal(global_scales_ind, scale_index)):
            initial_scale_index = scale_index
            break

    output_global = _ResizeAndExtract(initial_scale_index)

    # Loop over subsequent scales.
    for scale_index in tf.range(initial_scale_index + 1, num_scales):
        # Allow an undefined number of global feature scales to be extracted.
        tf.autograph.experimental.set_loop_options(
            shape_invariants=[(output_global, tf.TensorShape([None, None]))])

        if tf.reduce_any(tf.equal(global_scales_ind, scale_index)):
            global_descriptor = _ResizeAndExtract(scale_index)
            output_global = tf.concat([output_global, global_descriptor], 0)

    normalization_axis = 1
    if multi_scale_pool_type == 'average':
        output_global = tf.reduce_mean(output_global,
                                       axis=0,
                                       keepdims=False,
                                       name='multi_scale_average_pooling')
        normalization_axis = 0
    elif multi_scale_pool_type == 'sum':
        output_global = tf.reduce_sum(output_global,
                                      axis=0,
                                      keepdims=False,
                                      name='multi_scale_sum_pooling')
        normalization_axis = 0

    if normalize_global_descriptor:
        output_global = tf.nn.l2_normalize(output_global,
                                           axis=normalization_axis,
                                           name='l2_normalization')

    return output_global
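
A minimal usage sketch follows, again with the hypothetical `toy_global_model`. Unlike Example 3, the scale loops here use tf.range with set_loop_options, so the call must run under tf.function for AutoGraph to convert them.

def toy_global_model(images):
    # Toy stand-in for a trained global-descriptor model.
    return tf.zeros([tf.shape(images)[0], 128], dtype=tf.float32)

@tf.function
def extract_global(image):
    return ExtractGlobalFeatures(
        image,
        image_scales=tf.constant([0.25, 0.5, 1.0, 2.0]),
        global_scales_ind=tf.constant([1, 2]),  # use only the 0.5x, 1.0x scales
        model_fn=toy_global_model,
        multi_scale_pool_type='None',
        normalize_global_descriptor=True)

descriptors = extract_global(tf.zeros([480, 640, 3], dtype=tf.uint8))
# descriptors has shape [2, 128]: one L2-normalized row per selected scale.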