Example #1
    def test_SpatialGumbelSoftmax(self):
        """The Gumbel-softmax variant should match the output shapes of the
        regular spatial softmax."""

        features = tf.convert_to_tensor(
            np.random.normal(size=(32, 16, 16, 64)).astype(np.float32))
        with tf.variable_scope('mean_pool'):
            expected_feature_points, softmax = spatial_softmax.BuildSpatialSoftmax(
                features, spatial_gumbel_softmax=False)
        with tf.variable_scope('gumbel_pool'):
            gumbel_feature_points, gumbel_softmax = (
                spatial_softmax.BuildSpatialSoftmax(
                    features, spatial_gumbel_softmax=True))
        self.assertEqual(expected_feature_points.shape,
                         gumbel_feature_points.shape)
        self.assertEqual(softmax.shape, gumbel_softmax.shape)
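
For reference, here is a minimal standalone sketch of the spatial soft-argmax that BuildSpatialSoftmax computes. The real signature and the spatial_gumbel_softmax option are project-specific; this version, written against plain TF1, only illustrates the expected-coordinate math for a [batch, height, width, channels] tensor:

import tensorflow as tf  # TF1-style graph mode, as in the snippets on this page


def spatial_soft_argmax(features):
  """Maps [batch, H, W, C] features to [batch, 2 * C] expected (x, y) coords."""
  shape = features.get_shape().as_list()
  height, width, channels = shape[1], shape[2], shape[3]
  # Per-channel softmax over all H * W spatial locations.
  flat = tf.reshape(tf.transpose(features, [0, 3, 1, 2]), [-1, height * width])
  softmax = tf.nn.softmax(flat)
  # Coordinate grids in [-1, 1], flattened in the same row-major order.
  grid_x, grid_y = tf.meshgrid(tf.linspace(-1.0, 1.0, width),
                               tf.linspace(-1.0, 1.0, height))
  grid_x = tf.reshape(grid_x, [height * width])
  grid_y = tf.reshape(grid_y, [height * width])
  # Expected coordinates under each channel's softmax distribution.
  expected_x = tf.reshape(tf.reduce_sum(softmax * grid_x, axis=1),
                          [-1, channels])
  expected_y = tf.reshape(tf.reduce_sum(softmax * grid_y, axis=1),
                          [-1, channels])
  # [x1 ... xN, y1 ... yN], the layout described in the docstrings below.
  return tf.concat([expected_x, expected_y], axis=1)
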
Example #2
def BuildImagesToFeaturesModel(images,
                               filter_size=3,
                               num_blocks=5,
                               num_output_maps=32,
                               is_training=False,
                               normalizer_fn=slim.layer_norm,
                               normalizer_params=None,
                               weight_regularization=0.00001,
                               film_output_params=None):
  """Builds the pose regression model.

  Args:
    images: A float32 Tensor with shape [batch_size, height, width, channels]
      representing the camera image. Its values range from 0.0 to 1.0.
    filter_size: The width and height of the conv filters.
    num_blocks: The number of pool-conv-conv_1x1 blocks to repeat.
    num_output_maps: Number of output feature maps.
    is_training: True if training.
    normalizer_fn: Function to use for normalization. Defaults to layer norm.
    normalizer_params: Dictionary of parameters to pass to normalizer_fn; a
      batch_norm default is filled in below when none is given.
    weight_regularization: Weight regularization factor.
    film_output_params: If given, parse gamma and beta from the given Tensor
      and scale feature maps as done in FiLM
      (https://arxiv.org/abs/1709.07871). As recommended by the paper, instead
      of computing gamma * x + beta, this computes (1 + gamma) * x + beta, to
      better handle the initially zero-centered gamma.

  Returns:
    expected_feature_points: A tensor of size
      [batch_size, num_output_maps * 2]. These are the expected feature
      locations, i.e., the spatial softmax of the feature maps. The inner
      dimension is arranged as [x1, x2, x3 ... xN, y1, y2, y3, ... yN].
    end_points: A dict mapping 'softmax' to the spatial softmax distribution
      itself.
  """

  # Parameters for batch normalization. Fill in the scaling variant even when
  # the caller supplies custom normalizer_params, so the final 1x1 conv always
  # gets scaled batch norm when batch norm is in use.
  batch_norm_params_with_scaling = None
  if normalizer_fn == slim.batch_norm:
    if normalizer_params is None:
      normalizer_params = {
          'is_training': is_training,
          'decay': 0.99,
          'scale': False,
          'epsilon': 0.0001,
      }
    batch_norm_params_with_scaling = {
        'is_training': is_training,
        'decay': 0.99,
        'scale': True,
        'epsilon': 0.0001,
    }

  # Number of channels for each layer in intermediate conv layers.
  # Not configurable at the moment.
  num_channels_per_block = 32

  if film_output_params is not None:
    # Retrieves the gammas and betas for FiLM.
    # Given an input z we wish to condition on, FiLM learns an
    #   f_i(z) = gamma_i, beta_i
    # for each layer i we want to condition, then applies
    #   FiLM(h_i) = gamma_i * h_i + beta_i
    # where h_i is the pre-activation of the network (right before the ReLU).
    # This assumes we are given a Tensor that is the concat of all gammas and
    # betas, and that we want to condition each conv layer of the network.
    expected_size = 2 * num_blocks * num_channels_per_block
    # Validate the shape of the FiLM parameter tensor.
    film_shape = film_output_params.get_shape().as_list()
    if len(film_shape) != 2:
      raise ValueError('FILM shape is %s but is expected to be 2-D' %
                       str(film_shape))
    if film_shape[-1] != expected_size:
      raise ValueError('FILM shape is %s but final dimension should be %d' %
                       (str(film_shape), expected_size))

    # [batch, film_size] -> [batch, 1, 1, film_size] for broadcasting
    film_output_params = tf.expand_dims(film_output_params, axis=-2)
    film_output_params = tf.expand_dims(film_output_params, axis=-2)
    gammas_and_betas = tf.split(
        film_output_params, num_or_size_splits=2 * num_blocks, axis=-1)
    gammas, betas = gammas_and_betas[:num_blocks], gammas_and_betas[num_blocks:]
    for i in range(num_blocks):
      gammas[i] = 1.0 + gammas[i]

  net = images

  with slim.arg_scope([slim.conv2d], padding='VALID'):
    with slim.arg_scope(
        [slim.conv2d],
        weights_initializer=slim.xavier_initializer(),
        weights_regularizer=slim.l2_regularizer(weight_regularization),
        biases_initializer=tf.constant_initializer(0.01),
        normalizer_fn=normalizer_fn,
        normalizer_params=normalizer_params):
      for i in range(num_blocks):
        # Downsample in the first two blocks only.
        stride = 2 if i < 2 else 1

        # Conv -> normalizer -> FiLM -> ReLU.
        net = slim.conv2d(
            net,
            num_outputs=num_channels_per_block,
            activation_fn=None,
            kernel_size=[filter_size, filter_size],
            stride=stride,
            scope='conv{:d}'.format(i + 2))

        if film_output_params is not None:
          net = gammas[i] * net + betas[i]
        net = tf.nn.relu(net)

      net = slim.conv2d(
          net,
          num_output_maps, [1, 1],
          scope='final_conv_1x1',
          normalizer_params=batch_norm_params_with_scaling)
      net, softmax = spatial_softmax.BuildSpatialSoftmax(net)
      return net, {'softmax': softmax}
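
A usage sketch for the FiLM path above. The module name pose_model and the conditioning network are assumptions for illustration; the one hard requirement in the snippet is that film_output_params be a 2-D tensor whose last dimension is 2 * num_blocks * 32:

images = tf.random_uniform([8, 64, 64, 3])  # values in [0, 1]
num_blocks = 5
film_width = 2 * num_blocks * 32  # 2 * num_blocks * num_channels_per_block
conditioning = tf.random_normal([8, 16])  # hypothetical conditioning input z
film_output_params = slim.fully_connected(
    conditioning, film_width, activation_fn=None, scope='film_generator')
feature_points, end_points = pose_model.BuildImagesToFeaturesModel(
    images, num_blocks=num_blocks, film_output_params=film_output_params)
# feature_points: [8, num_output_maps * 2]; end_points['softmax'] is the
# per-channel spatial softmax distribution.
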
Example #3
def BuildImagesToFeaturesModelHighRes(images,
                                      filter_size=3,
                                      num_blocks=5,
                                      num_output_maps=32,
                                      is_training=False,
                                      normalizer_fn=slim.batch_norm,
                                      normalizer_params=None,
                                      weight_regularization=0.00001):
  """Builds the pose regression model.

  Note: this is a variant of the above, used in the Path Integral Guided
  Policy Search (PI-GPS) paper (Chebotar et al., 2016). We call it "HighRes"
  because it adds up features from multiple layers at different resolutions by
  scaling everything up, and the spatial softmax is computed at the highest of
  those resolutions. See https://arxiv.org/pdf/1610.00529.pdf for an
  architecture diagram.

  Args:
    images: A float32 Tensor with shape [batch_size, height, width, channels]
      representing the camera image. Its values range from 0.0 to 1.0.
    filter_size: The width and height of the conv filters.
    num_blocks: The number of pool-conv-conv_1x1 blocks to repeat.
    num_output_maps: Number of output feature maps.
    is_training: True if training.
    normalizer_fn: Function to use for normalization. Defaults to batch norm.
    normalizer_params: Dictionary of normalizer_fn parameters.
    weight_regularization: Weight regularization factor.

  Returns:
    expected_feature_points: A tensor of size
      [batch_size, num_output_maps * 2]. These are the expected feature
      locations, i.e., the spatial softmax of the feature maps. The inner
      dimension is arranged as [x1, x2, x3 ... xN, y1, y2, y3, ... yN].
    end_points: A dict mapping 'softmax' to the spatial softmax distribution
      itself.
  """
  # Parameters for batch normalization.
  batch_norm_params_with_scaling = None
  if normalizer_fn == slim.batch_norm:
    if normalizer_params is None:
      normalizer_params = {
          'is_training': is_training,
          'decay': 0.99,
          'scale': False,
          'epsilon': 0.0001,
      }
    batch_norm_params_with_scaling = {
        'is_training': is_training,
        'decay': 0.99,
        'scale': True,
        'epsilon': 0.0001,
    }

  with slim.arg_scope([slim.conv2d, slim.avg_pool2d], padding='VALID'):
    with slim.arg_scope(
        [slim.conv2d],
        weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
        weights_regularizer=slim.l2_regularizer(weight_regularization),
        normalizer_fn=normalizer_fn,
        normalizer_params=normalizer_params):
      block_outs = []
      net = slim.avg_pool2d(images, [2, 2], stride=2, scope='pool1')
      net = slim.conv2d(
          net, 16, [filter_size, filter_size], stride=2, scope='conv1')
      net = slim.conv2d(
          net, 32, [filter_size, filter_size], stride=1, scope='conv2')
      block_outs.append(slim.conv2d(net, 32, [1, 1], scope='conv2_1x1'))
      for i in range(1, num_blocks):
        net = slim.max_pool2d(
            net, [2, 2], stride=2, scope='pool{:d}'.format(i + 1))
        net = slim.conv2d(
            net,
            32, [filter_size, filter_size],
            stride=1,
            scope='conv{:d}'.format(i + 2))
        block_outs.append(
            slim.conv2d(net, 32, [1, 1], scope='conv{:d}_1x1'.format(i + 2)))
      final_image_shape = block_outs[0].get_shape().as_list()[1:3]

      def ResizeLayerToImage(layer):
        return tf.image.resize_images(
            layer, [final_image_shape[0], final_image_shape[1]],
            tf.image.ResizeMethod.NEAREST_NEIGHBOR)

      net = tf.add_n([ResizeLayerToImage(layer) for layer in block_outs])
      net = slim.conv2d(
          net,
          num_output_maps, [1, 1],
          scope='final_conv_1x1',
          normalizer_params=batch_norm_params_with_scaling)
      net, softmax = spatial_softmax.BuildSpatialSoftmax(net)
      return net, {'softmax': softmax}
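
The HighRes variant is called the same way. A minimal sketch (the module name pose_model is assumed); note that with VALID padding and five pool/conv stages, the input has to be fairly large for the deepest block to keep a nonzero spatial extent:

images = tf.random_uniform([4, 320, 320, 3])
feature_points, end_points = pose_model.BuildImagesToFeaturesModelHighRes(
    images, num_blocks=5, num_output_maps=32, is_training=True)
# With a 320x320 input, pool1/conv1 (both stride 2) plus conv2 leave
# block_outs[0] at 77x77, while the deepest block shrinks to 1x1. All blocks
# are resized back to 77x77 with nearest-neighbor interpolation and summed,
# so the spatial softmax runs at the highest block resolution.
# feature_points: [4, 64], arranged as [x1 ... x32, y1 ... y32].
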
Example #4
def BuildImagesToFeaturesModel(images,
                               filter_size=3,
                               num_blocks=5,
                               num_output_maps=32,
                               is_training=False,
                               normalizer_fn=slim.layer_norm,
                               normalizer_params=None,
                               weight_regularization=0.00001):
  """Builds the pose regression model.

  Args:
    images: A float32 Tensor with shape [batch_size, height, width, channels]
      representing the camera image. Its values range from 0.0 to 1.0.
    filter_size: The width and height of the conv filters.
    num_blocks: The number of pool-conv-conv_1x1 blocks to repeat.
    num_output_maps: Number of output feature maps.
    is_training: True if training.
    normalizer_fn: Function to use for normalization. Defaults to layer norm.
    normalizer_params: Dictionary of parameters to pass to normalizer_fn; a
      batch_norm default is filled in below when none is given.
    weight_regularization: Weight regularization factor.

  Returns:
    expected_feature_points: A tensor of size
      [batch_size, num_output_maps * 2]. These are the expected feature
      locations, i.e., the spatial softmax of the feature maps. The inner
      dimension is arranged as [x1, x2, x3 ... xN, y1, y2, y3, ... yN].
    end_points: A dict mapping 'softmax' to the spatial softmax distribution
      itself.
  """

  # Parameters for batch normalization. Fill in the scaling variant even when
  # the caller supplies custom normalizer_params, so the final 1x1 conv always
  # gets scaled batch norm when batch norm is in use.
  batch_norm_params_with_scaling = None
  if normalizer_fn == slim.batch_norm:
    if normalizer_params is None:
      normalizer_params = {
          'is_training': is_training,
          'decay': 0.99,
          'scale': False,
          'epsilon': 0.0001,
      }
    batch_norm_params_with_scaling = {
        'is_training': is_training,
        'decay': 0.99,
        'scale': True,
        'epsilon': 0.0001,
    }

  net = images

  with slim.arg_scope([slim.conv2d], padding='VALID'):
    with slim.arg_scope(
        [slim.conv2d],
        weights_initializer=slim.xavier_initializer(),
        weights_regularizer=slim.l2_regularizer(weight_regularization),
        biases_initializer=tf.constant_initializer(0.01),
        normalizer_fn=normalizer_fn,
        normalizer_params=normalizer_params):
      for i in range(num_blocks):
        # Downsample in the first two blocks only.
        stride = 2 if i < 2 else 1
        net = slim.conv2d(
            net,
            32, [filter_size, filter_size],
            stride=stride,
            scope='conv{:d}'.format(i + 2))

      net = slim.conv2d(
          net,
          num_output_maps, [1, 1],
          scope='final_conv_1x1',
          normalizer_params=batch_norm_params_with_scaling)
      net, softmax = spatial_softmax.BuildSpatialSoftmax(net)
      return net, {'softmax': softmax}