Beispiel #1
0
def _randomly_negate_tensor(tensor):
    """Return either `tensor` or `-tensor`, each with probability 0.5."""
    # floor(u + 0.5) is 0 when u < 0.5 and 1 otherwise, i.e. a fair coin for
    # a uniform sample u drawn from [0, 1).
    coin = tf.floor(tf.random_uniform([]) + 0.5)
    keep_sign = tf.cast(coin, tf.bool)
    return tf.cond(keep_sign, lambda: tensor, lambda: -tensor)
def _gen_mask(shape, drop_prob):
    """Generate a dropout mask.

    Each entry of the returned float32 tensor of shape `shape` is 0 when the
    entry is dropped and `1 / (1 - drop_prob)` when it is kept, so that
    multiplying by the mask rescales the kept entries.
    """
    keep_prob = 1. - drop_prob
    noise = tf.random_uniform(shape, minval=0., maxval=1., dtype=tf.float32)
    # floor(noise + keep_prob) is 1 exactly when noise >= drop_prob.
    kept = tf.floor(noise + keep_prob)
    return kept / keep_prob
Beispiel #3
0
def compute_grid_positions(boxes, boundaries, output_size, sample_offset):
  """Compute the grid position w.r.t. the corresponding feature map.

  For every box, `output_size` sample points are placed along each axis. For
  each sample point the two neighboring integer grid coordinates (clipped to
  `[0, boundary]`) and the matching linear interpolation weights are
  returned.

  Args:
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
      information of each box w.r.t. the corresponding feature map.
      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
      in terms of the number of pixels of the corresponding feature map size.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
      the boundary (in (y, x)) of the corresponding feature map for each box.
      Any resampled grid points that go beyond the boundary will be clipped.
    output_size: a scalar indicating the output crop size.
    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
      from grid point.

  Returns:
    kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
    kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
    box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
    box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
  """
  with tf.name_scope('compute_grid_positions'):
    static_batch, num_boxes, _ = boxes.get_shape().as_list()
    # Fall back to the dynamic batch dimension when it is not known statically.
    batch_size = tf.shape(boxes)[0] if static_batch is None else static_batch

    top, left = boxes[:, :, 0], boxes[:, :, 1]
    box_height, box_width = boxes[:, :, 2], boxes[:, :, 3]

    # Sample `output_size` points along each side of every box.
    grid_xs = []
    grid_ys = []
    for step in range(output_size):
      grid_xs.append(left + (step + sample_offset) * box_width / output_size)
      grid_ys.append(top + (step + sample_offset) * box_height / output_size)
    box_grid_x = tf.stack(grid_xs, axis=2)
    box_grid_y = tf.stack(grid_ys, axis=2)

    # Lower neighbor of each sample point, never below zero.
    floored_y = tf.floor(box_grid_y)
    floored_x = tf.floor(box_grid_x)
    nonneg_x = tf.maximum(0., floored_x)
    nonneg_y = tf.maximum(0., floored_y)

    # Clip both neighbors to the per-box feature map boundary.
    x_limit = tf.expand_dims(boundaries[:, :, 1], -1)
    y_limit = tf.expand_dims(boundaries[:, :, 0], -1)
    x_low = tf.minimum(nonneg_x, x_limit)
    x_high = tf.minimum(x_low + 1, x_limit)
    y_low = tf.minimum(nonneg_y, y_limit)
    y_high = tf.minimum(y_low + 1, y_limit)

    box_gridx0x1 = tf.stack([x_low, x_high], axis=-1)
    box_gridy0y1 = tf.stack([y_low, y_high], axis=-1)

    # The RoIAlign feature f can be computed by bilinear interpolation of four
    # neighboring feature points f0, f1, f2, and f3.
    # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
    #                       [f10, f11]]
    # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
    # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
    ly = box_grid_y - y_low
    lx = box_grid_x - x_low
    hy = 1.0 - ly
    hx = 1.0 - lx

    kernel_shape = [batch_size, num_boxes, output_size, 2, 1]
    kernel_y = tf.reshape(tf.stack([hy, ly], axis=3), kernel_shape)
    kernel_x = tf.reshape(tf.stack([hx, lx], axis=3), kernel_shape)
  return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
def bilinear_sampler(img, x, y):
    """
    Performs bilinear sampling of the input images at the normalized
    coordinates `x`, `y`. The same sampling locations and weights are
    used for every channel of the input.

    Input
    -----
    - img: batch of images in (B, H, W, C) layout.
    - x, y: sampling coordinates. They are rescaled with
      `0.5 * (coord + 1) * size`, so -1 maps to 0 and +1 maps to W (for
      `x`) or H (for `y`). The interpolation weights are expanded on
      axis 3, so both are expected to be rank 3 -- presumably
      (B, H_out, W_out); confirm against `get_pixel_value`.

    Returns
    -------
    - the weighted sum of the four neighbouring pixel values for each
      sampling location.
    """
    # only the spatial extent of the image is needed here
    img_shape = tf.shape(img)
    H = img_shape[1]
    W = img_shape[2]

    max_y = tf.cast(H - 1, 'int32')
    max_x = tf.cast(W - 1, 'int32')
    zero = tf.zeros([], dtype='int32')

    # cast coordinates as float32 (for rescaling)
    x = tf.cast(x, 'float32')
    y = tf.cast(y, 'float32')

    # rescale x from [-1, 1] to [0, W] and y from [-1, 1] to [0, H]
    x = 0.5 * ((x + 1.0) * tf.cast(W, 'float32'))
    y = 0.5 * ((y + 1.0) * tf.cast(H, 'float32'))

    # grab 4 nearest corner points for each (x_i, y_i)
    # i.e. we need a rectangle around the point of interest
    x0 = tf.cast(tf.floor(x), 'int32')
    x1 = x0 + 1
    y0 = tf.cast(tf.floor(y), 'int32')
    y1 = y0 + 1

    # clip to [0, W - 1] / [0, H - 1] to not violate img boundaries
    x0 = tf.clip_by_value(x0, zero, max_x)
    x1 = tf.clip_by_value(x1, zero, max_x)
    y0 = tf.clip_by_value(y0, zero, max_y)
    y1 = tf.clip_by_value(y1, zero, max_y)

    # get pixel value at corner coords
    Ia = get_pixel_value(img, x0, y0)
    Ib = get_pixel_value(img, x0, y1)
    Ic = get_pixel_value(img, x1, y0)
    Id = get_pixel_value(img, x1, y1)

    # recast as float for delta calculation
    x0 = tf.cast(x0, 'float32')
    x1 = tf.cast(x1, 'float32')
    y0 = tf.cast(y0, 'float32')
    y1 = tf.cast(y1, 'float32')

    # calculate deltas
    # NOTE(review): the weights use the *clipped* corner coordinates. Where
    # x0 and x1 (or y0 and y1) clip to the same index, the two weights along
    # that axis sum to zero instead of one.
    wa = (x1 - x) * (y1 - y)
    wb = (x1 - x) * (y - y0)
    wc = (x - x0) * (y1 - y)
    wd = (x - x0) * (y - y0)

    # add a trailing dimension so the weights broadcast over channels
    wa = tf.expand_dims(wa, axis=3)
    wb = tf.expand_dims(wb, axis=3)
    wc = tf.expand_dims(wc, axis=3)
    wd = tf.expand_dims(wd, axis=3)

    # compute output
    out = tf.add_n([wa * Ia, wb * Ib, wc * Ic, wd * Id])

    return out
Beispiel #5
0
def op(
    name,
    labels,
    predictions,
    num_thresholds=None,
    weights=None,
    display_name=None,
    description=None,
    collections=None,
):
    """Create a PR curve summary op for a single binary classifier.

    Computes true/false positive/negative values for the given `predictions`
    against the ground truth `labels`, against a list of evenly distributed
    threshold values in `[0, 1]` of length `num_thresholds`.

    Each number in `predictions`, a float in `[0, 1]`, is compared with its
    corresponding boolean label in `labels`, and counts as a single tp/fp/tn/fn
    value at each threshold. This is then multiplied with `weights` which can be
    used to reweight certain values, or more commonly used for masking values.

    Args:
      name: A tag attached to the summary. Used by TensorBoard for organization.
      labels: The ground truth values. A Tensor of `bool` values with arbitrary
          shape.
      predictions: A float32 `Tensor` whose values are in the range `[0, 1]`.
          Values outside that range are clamped to it. Dimensions must match
          those of `labels`.
      num_thresholds: Number of thresholds, evenly distributed in `[0, 1]`, to
          compute PR metrics for. Should be `>= 2`. This value should be a
          constant integer value, not a Tensor that stores an integer.
          Defaults to `_DEFAULT_NUM_THRESHOLDS` when `None`.
      weights: Optional float32 `Tensor`. Individual counts are multiplied by this
          value. This tensor must be either the same shape as or broadcastable to
          the `labels` tensor. Defaults to `1.0` when `None`.
      display_name: Optional name for this summary in TensorBoard, as a
          constant `str`. Defaults to `name`.
      description: Optional long-form description for this summary, as a
          constant `str`. Markdown is supported. Defaults to empty.
      collections: Optional list of graph collections keys. The new
          summary op is added to these collections. Defaults to
          `[GraphKeys.SUMMARIES]`.

    Returns:
      A summary operation for use in a TensorFlow graph. The float32 tensor
      produced by the summary operation is of dimension (6, num_thresholds). The
      first dimension (of length 6) is of the order: true positives,
      false positives, true negatives, false negatives, precision, recall.
    """
    # TODO(nickfelt): remove on-demand imports once dep situation is fixed.
    import tensorflow.compat.v1 as tf

    if num_thresholds is None:
        num_thresholds = _DEFAULT_NUM_THRESHOLDS
    if weights is None:
        weights = 1.0

    dtype = predictions.dtype

    with tf.name_scope(name, values=[labels, predictions, weights]):
        tf.assert_type(labels, tf.bool)
        # Boolean labels become exactly 0.0 or 1.0 in the predictions' dtype.
        label_values = tf.cast(labels, dtype)
        # Clamp predictions into [0.0, 1.0].
        clamped = tf.minimum(1.0, tf.maximum(0.0, predictions))
        # Per-example weight contributed to the positive / negative counts.
        positive_weights = label_values * weights
        negative_weights = (1.0 - label_values) * weights

        # Flatten predictions; make the weights column vectors so that they
        # broadcast against the [num_examples, num_thresholds] one-hot matrix.
        flat_predictions = tf.reshape(clamped, [-1])
        positive_weights = tf.reshape(positive_weights, [-1, 1])
        negative_weights = tf.reshape(negative_weights, [-1, 1])

        # At every threshold t we evaluate the binary classifier
        #   C(t) = (predictions >= t)
        # and want
        #   TP(t) = sum( C(t) * positive_weights )
        #   FP(t) = sum( C(t) * negative_weights )
        #
        # Rather than evaluating C(t) once per threshold, note that with
        #   thresholds = [t_0, ..., t_{n-1}];  t_0 < ... < t_{n-1}
        # (n = num_thresholds) and a per-bucket total
        #   B(i) = sum over predictions p with t_i <= p < t_{i+1}
        # we have
        #   C(t_i) = sum( B(j), j >= i )
        # which is a reversed cumulative sum (tf.cumsum with reverse=True).
        #
        # Because the thresholds are evenly spaced,
        #   width = 1.0 / (num_thresholds - 1)
        #   thresholds = [0.0, 1*width, 2*width, 3*width, ..., 1.0]
        # the bucket of a prediction p is simply
        #   bucket_index(p) = floor( p * (num_thresholds - 1) )
        # and the per-bucket totals are obtained by one-hot encoding the
        # bucket index and summing the weighted rows.
        last_bucket = num_thresholds - 1
        bucket_indices = tf.cast(
            tf.floor(flat_predictions * last_bucket), tf.int32)

        def _sum_into_buckets(example_weights):
            # Total weight falling into each of the num_thresholds buckets.
            return tf.reduce_sum(
                input_tensor=tf.one_hot(bucket_indices, depth=num_thresholds) *
                example_weights,
                axis=0,
            )

        tp_buckets = _sum_into_buckets(positive_weights)
        fp_buckets = _sum_into_buckets(negative_weights)

        # Reverse cumulative sums turn bucket totals into counts at or above
        # each threshold.
        tp = tf.cumsum(tp_buckets, reverse=True, name="tp")
        fp = tf.cumsum(fp_buckets, reverse=True, name="fp")
        # tp[0] is the total positive weight and fp[0] the total negative
        # weight, so the complements follow directly:
        #   fn = sum(positive_weights) - tp = tp[0] - tp
        #   tn = sum(negative_weights) - fp = fp[0] - fp
        tn = fp[0] - fp
        fn = tp[0] - tp

        # _MINIMUM_COUNT keeps the denominators away from zero.
        precision = tp / tf.maximum(_MINIMUM_COUNT, tp + fp)
        recall = tp / tf.maximum(_MINIMUM_COUNT, tp + fn)

        return _create_tensor_summary(
            name,
            tp,
            fp,
            tn,
            fn,
            precision,
            recall,
            num_thresholds,
            display_name,
            description,
            collections,
        )
Beispiel #6
0
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              sample_offset=0.5):
  """Crop and resize boxes on a set of feature maps.

  Given multiple features maps indexed by different levels, and a set of boxes
  where each box is mapped to a certain level, it selectively crops and resizes
  boxes from the corresponding feature maps to generate the box features.

  We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
  figure 3 for reference). Specifically, for each feature map, we select an
  (output_size, output_size) set of pixels corresponding to the box location,
  and then use bilinear interpolation to select the feature value for each
  pixel.

  For performance, the gather and interpolation are performed on all levels as
  a single operation: the multi-level features are flattened and gathered into
  [2*output_size, 2*output_size] feature points per box, and bilinear
  interpolation is then performed on the gathered feature points to generate
  the [output_size, output_size] RoIAlign feature map.

  Here is the step-by-step algorithm:
    1. The multi-level features are gathered into a
       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
       Tensor. The Tensor contains four neighboring feature points for each
       vertex in the output grid.
    2. Compute the interpolation kernel of shape
       [batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axes
       can be seen as stacking 2x2 interpolation kernels for all vertices in the
       output grid.
    3. Element-wise multiply the gathered features and interpolation kernel.
       Then apply 2x2 average pooling to reduce spatial dimension to
       output_size.

  Args:
    features: a 5-D tensor of shape
      [batch_size, num_levels, max_height, max_width, num_filters] where
      cropping and resizing are based. All five dimensions are read from the
      static shape.
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
      information of each box w.r.t. the corresponding feature map.
      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
      in terms of the number of pixels of the corresponding feature map size.
    box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
      the 0-based corresponding feature level index of each box.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
      the boundary (in (y, x)) of the corresponding feature map for each box.
      Any resampled grid points that go beyond the boundary will be clipped.
    output_size: a scalar indicating the output crop size.
    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
      from grid point.

  Returns:
    features_per_box: a 5-D tensor of shape
      [batch_size, num_boxes, output_size, output_size, num_filters]
      representing the cropped features.
  """
  (batch_size, num_levels, max_feature_height, max_feature_width,
   num_filters) = features.get_shape().as_list()
  _, num_boxes, _ = boxes.get_shape().as_list()
  # Every output vertex needs two neighbors along each axis.
  gathered_size = output_size * 2

  # Compute the grid position w.r.t. the corresponding feature map.
  top, left = boxes[:, :, 0], boxes[:, :, 1]
  box_height, box_width = boxes[:, :, 2], boxes[:, :, 3]
  grid_xs = []
  grid_ys = []
  for step in range(output_size):
    grid_xs.append(left + (step + sample_offset) * box_width / output_size)
    grid_ys.append(top + (step + sample_offset) * box_height / output_size)
  box_grid_x = tf.stack(grid_xs, axis=2)
  box_grid_y = tf.stack(grid_ys, axis=2)

  # Compute indices for gather operation: the lower neighbor (never below
  # zero) and the one after it, both clipped to the per-box boundary.
  floored_y = tf.floor(box_grid_y)
  floored_x = tf.floor(box_grid_x)
  box_grid_x0 = tf.maximum(0., floored_x)
  box_grid_y0 = tf.maximum(0., floored_y)
  x_limit = tf.expand_dims(boundaries[:, :, 1], -1)
  y_limit = tf.expand_dims(boundaries[:, :, 0], -1)
  box_gridx0x1 = tf.stack(
      [tf.minimum(box_grid_x0, x_limit),
       tf.minimum(box_grid_x0 + 1, x_limit)],
      axis=3)
  box_gridy0y1 = tf.stack(
      [tf.minimum(box_grid_y0, y_limit),
       tf.minimum(box_grid_y0 + 1, y_limit)],
      axis=3)

  flat_neighbor_shape = [batch_size, num_boxes, gathered_size]
  x_indices = tf.cast(
      tf.reshape(box_gridx0x1, flat_neighbor_shape), dtype=tf.int32)
  y_indices = tf.cast(
      tf.reshape(box_gridy0y1, flat_neighbor_shape), dtype=tf.int32)

  # Strides of the flattened [batch, level, height, width] index space.
  height_dim_offset = max_feature_width
  level_dim_offset = max_feature_height * height_dim_offset
  batch_dim_offset = num_levels * level_dim_offset

  # Each term is tiled to [batch_size, num_boxes, gathered_size,
  # gathered_size]; their sum is the flat row index into `features`.
  batch_term = tf.tile(
      tf.reshape(tf.range(batch_size) * batch_dim_offset,
                 [batch_size, 1, 1, 1]),
      [1, num_boxes, gathered_size, gathered_size])
  level_term = tf.tile(
      tf.reshape(box_levels * level_dim_offset,
                 [batch_size, num_boxes, 1, 1]),
      [1, 1, gathered_size, gathered_size])
  row_term = tf.tile(
      tf.reshape(y_indices * height_dim_offset,
                 [batch_size, num_boxes, gathered_size, 1]),
      [1, 1, 1, gathered_size])
  column_term = tf.tile(
      tf.reshape(x_indices, [batch_size, num_boxes, 1, gathered_size]),
      [1, 1, gathered_size, 1])
  indices = tf.reshape(
      batch_term + level_term + row_term + column_term, [-1])

  flat_features = tf.reshape(features, [-1, num_filters])
  features_per_box = tf.reshape(
      tf.gather(flat_features, indices),
      [batch_size, num_boxes, gathered_size, gathered_size, num_filters])

  # The RoIAlign feature f can be computed by bilinear interpolation of four
  # neighboring feature points f0, f1, f2, and f3.
  # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
  #                       [f10, f11]]
  # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
  # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
  ly = box_grid_y - box_grid_y0
  lx = box_grid_x - box_grid_x0
  hy = 1.0 - ly
  hx = 1.0 - lx
  kernel_x = tf.reshape(
      tf.stack([hx, lx], axis=3), [batch_size, num_boxes, 1, gathered_size])
  kernel_y = tf.reshape(
      tf.stack([hy, ly], axis=3), [batch_size, num_boxes, gathered_size, 1])
  # Uses implicit broadcast to generate the interpolation kernel. The
  # multiplier `4` cancels the division done by the 2x2 average pooling below.
  interpolation_kernel = kernel_y * kernel_x * 4

  # Interpolates the gathered features with computed interpolation kernels.
  weights = tf.cast(
      tf.expand_dims(interpolation_kernel, axis=4),
      dtype=features_per_box.dtype)
  features_per_box = features_per_box * weights
  features_per_box = tf.reshape(
      features_per_box,
      [batch_size * num_boxes, gathered_size, gathered_size, num_filters])
  features_per_box = tf.nn.avg_pool(
      features_per_box, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
  features_per_box = tf.reshape(
      features_per_box,
      [batch_size, num_boxes, output_size, output_size, num_filters])

  return features_per_box
Beispiel #7
0
def tf_interpolate(voxel, x, y, z, out_size):
    """
    Trilinear interpolation for a batch of voxel grids.

    :param voxel: 5-D voxel tensor; dimensions 1, 2, 3 are read as
        height, width, depth and dimension 4 as channels.
    :param x,y,z: sampling coordinates in voxel index units. They are added
        element-wise to a flat per-batch base index, so they are presumably
        flat 1-D tensors of length
        batch * out_height * out_width * out_depth -- confirm with caller.
    :param out_size: output size of the voxel grid; entries 1, 2 and 3 are
        used as out_height, out_width and out_depth.
    :return: the interpolated values, one row of channels per sampling
        location (presumably [num_points, n_channels] -- confirm).
    """
    batch_size = tf.shape(voxel)[0]
    height = tf.shape(voxel)[1]
    width = tf.shape(voxel)[2]
    depth = tf.shape(voxel)[3]
    n_channels = tf.shape(voxel)[4]

    x = tf.cast(x, 'float32')
    y = tf.cast(y, 'float32')
    z = tf.cast(z, 'float32')

    out_height = out_size[1]
    out_width = out_size[2]
    out_depth = out_size[3]

    zero = tf.zeros([], dtype='int32')
    max_y = tf.cast(height - 1, 'int32')
    max_x = tf.cast(width - 1, 'int32')
    max_z = tf.cast(depth - 1, 'int32')

    # Integer corners of the cell surrounding each sampling location.
    x0 = tf.cast(tf.floor(x), 'int32')
    x1 = x0 + 1
    y0 = tf.cast(tf.floor(y), 'int32')
    y1 = y0 + 1
    z0 = tf.cast(tf.floor(z), 'int32')
    z1 = z0 + 1

    # Clip the corners so they stay inside the voxel grid.
    x0 = tf.clip_by_value(x0, zero, max_x)
    x1 = tf.clip_by_value(x1, zero, max_x)
    y0 = tf.clip_by_value(y0, zero, max_y)
    y1 = tf.clip_by_value(y1, zero, max_y)
    z0 = tf.clip_by_value(z0, zero, max_z)
    z1 = tf.clip_by_value(z1, zero, max_z)

    # 1-D tensor holding, for every output location, the flat index of the
    # first element of its batch entry: batch entry b starts at
    # b * width * height * depth, and that value is handed to tf_repeat
    # together with out_height * out_width * out_depth (presumably the
    # number of repetitions per batch entry -- confirm against tf_repeat).
    base = tf_repeat(
        tf.range(batch_size) * width * height * depth,
        out_height * out_width * out_depth)

    # NOTE(review): the flat index below is z * width * height + y * width + x,
    # i.e. x varies fastest and z slowest. That matches a [batch, z, y, x, C]
    # memory layout; confirm it agrees with how `voxel` is actually laid out
    # when height, width and depth differ.

    # Offset of the z slice for each index.
    base_z0 = base + z0 * width * height
    base_z1 = base + z1 * width * height
    # Offset of the y row within each z slice.
    base_z0_y0 = base_z0 + y0 * width
    base_z0_y1 = base_z0 + y1 * width
    base_z1_y0 = base_z1 + y0 * width
    base_z1_y1 = base_z1 + y1 * width

    # Add the x position for the four corners in the z0 slice.
    idx_a = base_z0_y0 + x0
    idx_b = base_z0_y1 + x0
    idx_c = base_z0_y0 + x1
    idx_d = base_z0_y1 + x1
    # Add the x position for the four corners in the z1 slice.
    idx_e = base_z1_y0 + x0
    idx_f = base_z1_y1 + x0
    idx_g = base_z1_y0 + x1
    idx_h = base_z1_y1 + x1

    # use indices to lookup voxels in the flattened grid; the channels
    # dimension is kept as the last axis
    voxel_flat = tf.reshape(voxel, [-1, n_channels])
    voxel_flat = tf.cast(voxel_flat, 'float32')
    Ia = tf.gather(voxel_flat, idx_a)
    Ib = tf.gather(voxel_flat, idx_b)
    Ic = tf.gather(voxel_flat, idx_c)
    Id = tf.gather(voxel_flat, idx_d)
    Ie = tf.gather(voxel_flat, idx_e)
    If = tf.gather(voxel_flat, idx_f)
    Ig = tf.gather(voxel_flat, idx_g)
    Ih = tf.gather(voxel_flat, idx_h)

    # and finally calculate interpolated values
    x0_f = tf.cast(x0, 'float32')
    x1_f = tf.cast(x1, 'float32')
    y0_f = tf.cast(y0, 'float32')
    y1_f = tf.cast(y1, 'float32')
    z0_f = tf.cast(z0, 'float32')
    z1_f = tf.cast(z1, 'float32')

    # Weights for the four corners of the z0 slice. Each weight is expanded on
    # axis 1 so it broadcasts across the channels of the gathered values.
    wa = tf.expand_dims(((x1_f - x) * (y1_f - y) * (z1_f - z)), 1)
    wb = tf.expand_dims(((x1_f - x) * (y - y0_f) * (z1_f - z)), 1)
    wc = tf.expand_dims(((x - x0_f) * (y1_f - y) * (z1_f - z)), 1)
    wd = tf.expand_dims(((x - x0_f) * (y - y0_f) * (z1_f - z)), 1)
    # Weights for the four corners of the z1 slice.
    we = tf.expand_dims(((x1_f - x) * (y1_f - y) * (z - z0_f)), 1)
    wf = tf.expand_dims(((x1_f - x) * (y - y0_f) * (z - z0_f)), 1)
    wg = tf.expand_dims(((x - x0_f) * (y1_f - y) * (z - z0_f)), 1)
    wh = tf.expand_dims(((x - x0_f) * (y - y0_f) * (z - z0_f)), 1)

    output = tf.add_n([
        wa * Ia, wb * Ib, wc * Ic, wd * Id, we * Ie, wf * If, wg * Ig, wh * Ih
    ])
    return output
Beispiel #8
0
def compute_floor_offsets_with_indices(y_source,
                                       x_source,
                                       y_target=None,
                                       x_target=None):
  """Computes offsets from floored source coordinates to target coordinates.

  The source coordinates are floored (as if they were snapped onto the grid)
  and the offsets from those floored positions to the target coordinates are
  returned together with the floored positions as integer indices. Note that
  the input coordinates should be the "absolute" coordinates in terms of the
  output image dimensions as opposed to the normalized coordinates (i.e.
  values in [0, 1]). If the input y and x source have the second dimension
  (representing the neighboring pixels), then the offsets are computed from
  each of the neighboring pixels to their corresponding target (first
  dimension).

  Args:
    y_source: A tensor with shape [num_points] (or [num_points, num_neighbors])
      representing the absolute y-coordinates (in the output image space) of the
      source points.
    x_source: A tensor with shape [num_points] (or [num_points, num_neighbors])
      representing the absolute x-coordinates (in the output image space) of the
      source points.
    y_target: A tensor with shape [num_points] representing the absolute
      y-coordinates (in the output image space) of the target points. If not
      provided, then y_source is used as the targets.
    x_target: A tensor with shape [num_points] representing the absolute
      x-coordinates (in the output image space) of the target points. If not
      provided, then x_source is used as the targets.

  Returns:
    A tuple of two tensors:
      offsets: A tensor with shape [num_points, 2] (or
        [num_points, num_neighbors, 2]) representing the offsets of each input
        point.
      indices: A tensor with shape [num_points, 2] (or
        [num_points, num_neighbors, 2]) representing the indices of where the
        offsets should be retrieved in the output image dimension space.

  Raises:
    ValueError: source and target shapes have unexpected values.
  """
  floored_y = tf.floor(y_source)
  floored_x = tf.floor(x_source)

  source_shape = shape_utils.combined_static_and_dynamic_shape(y_source)
  if y_target is None and x_target is None:
    # Without explicit targets, each source point is its own target.
    y_target, x_target = y_source, x_source
  else:
    target_shape = shape_utils.combined_static_and_dynamic_shape(y_target)
    per_neighbor_sources = len(source_shape) == 2 and len(target_shape) == 1
    if per_neighbor_sources:
      # Repeat every target once per neighbor so it lines up with the
      # [num_points, num_neighbors] sources.
      num_neighbors = source_shape[1]
      tile_multiples = [1, num_neighbors]
      y_target = tf.tile(
          tf.expand_dims(y_target, -1), multiples=tile_multiples)
      x_target = tf.tile(
          tf.expand_dims(x_target, -1), multiples=tile_multiples)
    elif source_shape != target_shape:
      raise ValueError('Inconsistent source and target shape.')

  delta_y = y_target - floored_y
  delta_x = x_target - floored_x

  row_indices = tf.cast(floored_y, tf.int32)
  column_indices = tf.cast(floored_x, tf.int32)

  # (y, x) ordering on the last axis for both outputs.
  indices = tf.stack([row_indices, column_indices], axis=-1)
  offsets = tf.stack([delta_y, delta_x], axis=-1)
  return offsets, indices
Beispiel #9
0
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              is_gpu_inference=False):
  """Extracts fixed-size box features from stacked multi-level feature maps.

  Every box is assigned to exactly one feature level. Following the ROIAlign
  technique (https://arxiv.org/pdf/1703.06870.pdf, figure 3), an
  (output_size, output_size) grid of sample points is placed over each box and
  the feature value at every sample point is obtained by bilinear
  interpolation.

  For performance, gathering and interpolation for all levels are done as a
  single operation:
    1. The stacked features are flattened and gathered into a
       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
       tensor holding the four neighboring feature points of every vertex of
       the output grid.
    2. An interpolation kernel of shape
       [batch_size, num_boxes, output_size*2, output_size*2] is built. Its
       last two axes can be read as the 2x2 interpolation kernels of all
       vertices laid out next to each other.
    3. The gathered features are multiplied element-wise by the kernel and
       reduced with a 2x2 average pooling to
       [output_size, output_size] per box.

  Args:
    features: a 5-D tensor of shape
      [batch_size, num_levels, max_height, max_width, num_filters] from which
      the boxes are cropped. Its static shape is read, so the dimensions used
      below must be known at graph construction time.
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding each box
      w.r.t. its feature map. boxes[:, :, 0:2] is the (y, x) grid position
      (float) of the top-left corner and boxes[:, :, 2:4] is the box size
      (h, w) (float), in pixels of the corresponding feature map.
    box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] holding the
      0-based feature level index of each box.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] holding the
      (y, x) boundary of the corresponding feature map for each box. Sample
      indices beyond the boundary are clipped to it.
    output_size: a scalar giving the output crop size.
    is_gpu_inference: whether the graph is built for GPU inference. When true
      the index arithmetic is done in float32 and cast to int32 only right
      before the gather.

  Returns:
    features_per_box: a 5-D tensor of shape
      [batch_size, num_boxes, output_size, output_size, num_filters]
      holding the cropped and resized features.
  """
  feature_dims = features.get_shape().as_list()
  (batch_size, num_levels, max_feature_height, max_feature_width,
   num_filters) = feature_dims
  _, num_boxes, _ = boxes.get_shape().as_list()
  # Each output vertex needs two neighbors along every spatial axis.
  grid_len = output_size * 2

  # Sample positions w.r.t. the corresponding feature map.
  grid_y, grid_x = compute_grid_positions(boxes, output_size)

  # Lower neighbor of each sample position, kept non-negative.
  floor_y = tf.maximum(0., tf.floor(grid_y))
  floor_x = tf.maximum(0., tf.floor(grid_x))

  def _neighbor_pairs(lower, limit):
    """Stacks the (lower, lower + 1) neighbors, both capped at `limit`."""
    return tf.stack(
        [tf.minimum(lower, limit),
         tf.minimum(lower + 1, limit)], axis=3)

  x_pairs = _neighbor_pairs(floor_x, boundaries[:, :, 1:2])
  y_pairs = _neighbor_pairs(floor_y, boundaries[:, :, 0:1])

  per_axis_shape = [batch_size, num_boxes, grid_len]
  x_indices = tf.reshape(x_pairs, per_axis_shape)
  y_indices = tf.reshape(y_pairs, per_axis_shape)

  # For GPU inference the cast to int32 is delayed until the gather, since GPU
  # inference supports floating point better.
  # TODO(laigd): revisit this when newer versions of GPU libraries is released.
  if is_gpu_inference:
    indices_dtype = tf.float32
  else:
    indices_dtype = tf.int32
    x_indices = tf.cast(x_indices, tf.int32)
    y_indices = tf.cast(y_indices, tf.int32)

  # Strides of the flattened [batch, level, height, width] feature layout.
  row_stride = max_feature_width
  level_stride = max_feature_height * row_stride
  batch_stride = num_levels * level_stride

  def _spread(values, values_shape, ones_shape):
    """Reshapes `values` and broadcasts it by multiplying with ones."""
    return (tf.reshape(values, values_shape) *
            tf.ones(ones_shape, dtype=indices_dtype))

  batch_term = _spread(
      tf.range(batch_size, dtype=indices_dtype) * batch_stride,
      [batch_size, 1, 1, 1], [1, num_boxes, grid_len, grid_len])
  level_term = _spread(box_levels * level_stride,
                       [batch_size, num_boxes, 1, 1],
                       [1, 1, grid_len, grid_len])
  row_term = _spread(y_indices * row_stride,
                     [batch_size, num_boxes, grid_len, 1],
                     [1, 1, 1, grid_len])
  col_term = _spread(x_indices, [batch_size, num_boxes, 1, grid_len],
                     [1, 1, grid_len, 1])

  # TODO(hongjunchoi): Remove the need for temporary variables as
  # temporary variables with int32 dtype are not supported for GPU's.
  indices = tf.add_n([batch_term, level_term, row_term, col_term])

  # A single-image batch keeps an explicit leading axis of size 1 to stay
  # friendly for GPU inference.
  single_image = batch_size == 1
  indices = tf.reshape(indices, [1, -1] if single_image else [-1])
  if is_gpu_inference:
    # The cast happens last since GPU has better support for floating point
    # operations.
    indices = tf.cast(indices, dtype=tf.int32)
  if single_image:
    features = tf.reshape(features, [1, -1, num_filters])
    features_per_box = tf.gather(features, indices, axis=1)
  else:
    features = tf.reshape(features, [-1, num_filters])
    features_per_box = tf.gather(features, indices)

  features_per_box = tf.reshape(
      features_per_box,
      [batch_size, num_boxes, grid_len, grid_len, num_filters])

  # The RoIAlign feature f is the bilinear interpolation of its four
  # neighboring feature points:
  #   f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
  #                         [f10, f11]]
  #           = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (ly*lx)f11
  # where ly/lx are the fractional offsets from the lower neighbor and
  # hy/hx are their complements.
  frac_y = grid_y - floor_y
  frac_x = grid_x - floor_x
  kernel_x = tf.reshape(
      tf.stack([1.0 - frac_x, frac_x], axis=3),
      [batch_size, num_boxes, 1, grid_len])
  kernel_y = tf.reshape(
      tf.stack([1.0 - frac_y, frac_y], axis=3),
      [batch_size, num_boxes, grid_len, 1])
  # Implicit broadcasting yields the full kernel. The factor `4` compensates
  # for the averaging done by the 2x2 pooling below.
  interpolation_kernel = kernel_y * kernel_x * 4

  # Weight the gathered features, then sum each 2x2 neighborhood via pooling.
  weights = tf.cast(
      tf.expand_dims(interpolation_kernel, axis=4),
      dtype=features_per_box.dtype)
  features_per_box = features_per_box * weights
  pooled = tf.nn.avg_pool(
      tf.reshape(
          features_per_box,
          [batch_size * num_boxes, grid_len, grid_len, num_filters]),
      [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')

  return tf.reshape(
      pooled, [batch_size, num_boxes, output_size, output_size, num_filters])
Beispiel #10
0
    def testLoss(self):
        """Checks the FasterRCNN losses for a perfect and a mismatched input.

        Two prediction dicts with the same targets are built: one whose
        predictions match the targets and one whose predictions are random or
        deliberately flipped. Every loss of the perfect dict must be 0 and
        every loss of the mismatched dict must reach a minimum value.
        """

        def _empty_prediction_dict():
            """Returns a fresh skeleton of the nested prediction dict."""
            return {
                'rpn_prediction': {},
                'classification_prediction': {
                    'rcnn': {
                        'cls_score': None,
                        'bbox_offsets': None
                    },
                    'target': {},
                    '_debug': {
                        'losses': {}
                    }
                }
            }

        def _seeded_uniform(shape, low, high, seed):
            """Draws float32 uniform values in [low, high) with a fixed seed."""
            return tf.random_uniform(shape,
                                     minval=low,
                                     maxval=high,
                                     dtype=tf.float32,
                                     seed=seed,
                                     name=None)

        prediction_dict_random = _empty_prediction_dict()
        prediction_dict_perf = _empty_prediction_dict()

        # Fixed seeds keep the generated tensors stable between runs.
        rand_seed = 13
        target_seed = 43
        image_size = (60, 80)
        num_anchors = 1000

        # Switch every regularization term off.
        config = EasyDict(self.config)
        config.model.rpn.l2_regularization_scale = 0.0
        config.model.rcnn.l2_regularization_scale = 0.0
        config.model.base_network.arg_scope.weight_decay = 0.0

        # ---- RPN ----

        # Random class targets for the rpn, where:
        #       {-1}:   Ignore
        #       { 0}:   Background
        #       { 1}:   Object
        rpn_cls_target = tf.floor(
            _seeded_uniform([num_anchors], -1, 2, target_seed))

        # Mismatched scores: one-hot with value 10 at index (target + 1) mod 2,
        # i.e. the other class for targets 0 and 1.
        flipped_labels = tf.cast(
            tf.mod(tf.identity(rpn_cls_target) + 1, 2), tf.int32)
        rpn_cls_score = tf.cast(
            tf.one_hot(flipped_labels, depth=2, on_value=10), tf.float32)

        # Matching scores: one-hot with value 100 at the target index.
        true_labels = tf.cast(tf.identity(rpn_cls_target), tf.int32)
        rpn_cls_perf_score = tf.cast(
            tf.one_hot(true_labels, depth=2, on_value=100), tf.float32)

        # Random target bbox deltas and random predicted bbox deltas.
        rpn_bbox_target = tf.floor(
            _seeded_uniform([num_anchors, 4], -1, 1, target_seed))
        rpn_bbox_predictions = tf.floor(
            _seeded_uniform([num_anchors, 4], -1, 1, rand_seed))

        prediction_dict_random['rpn_prediction'].update({
            'rpn_cls_score': rpn_cls_score,
            'rpn_cls_target': rpn_cls_target,
            'rpn_bbox_target': rpn_bbox_target,
            'rpn_bbox_pred': rpn_bbox_predictions,
        })
        # The perfect prediction reuses the bbox target as its prediction.
        prediction_dict_perf['rpn_prediction'].update({
            'rpn_cls_score': rpn_cls_perf_score,
            'rpn_cls_target': rpn_cls_target,
            'rpn_bbox_target': rpn_bbox_target,
            'rpn_bbox_pred': rpn_bbox_target,
        })

        # ---- RCNN ----

        num_classes = config.model.network.num_classes
        target_cls = 1
        random_cls_pred = prediction_dict_random['classification_prediction']
        perf_cls_pred = prediction_dict_perf['classification_prediction']

        # Random bbox_offsets target for the correct class.
        random_cls_pred['target'] = {
            'bbox_offsets': _seeded_uniform([1, 4], -1, 1, target_seed),
            'cls': [target_cls]
        }
        # The perfect prediction shares the same bbox_offsets and cls target.
        perf_cls_pred['target'] = random_cls_pred['target'].copy()

        # Random scores for the num_classes plus the background class.
        rcnn_cls_score = _seeded_uniform([1, num_classes + 1], -100, 100,
                                         rand_seed)

        # Perfect scores: 100 for the correct class, 0 everywhere else.
        rcnn_cls_perf_score = tf.cast(
            tf.one_hot([target_cls], depth=num_classes + 1, on_value=100),
            tf.float32)

        # Random delta prediction for each class.
        rcnn_bbox_offsets = _seeded_uniform([1, num_classes * 4], -1, 1,
                                            rand_seed)

        # Perfect deltas: start from random values and overwrite the four
        # slots beginning at `cls value * 4` with the target offsets.
        target_bbox_offsets = random_cls_pred['target']['bbox_offsets']
        first_slot = target_cls * 4
        perf_offsets_var = tf.Variable(
            tf.reshape(
                _seeded_uniform([1, num_classes * 4], -1, 1, target_seed),
                [-1]))
        rcnn_bbox_perf_offsets = tf.reshape(
            tf.scatter_update(perf_offsets_var,
                              tf.range(first_slot, first_slot + 4),
                              tf.reshape(target_bbox_offsets, [-1])),
            [1, -1])

        random_cls_pred['rcnn']['cls_score'] = rcnn_cls_score
        random_cls_pred['rcnn']['bbox_offsets'] = rcnn_bbox_offsets

        perf_cls_pred['rcnn']['cls_score'] = rcnn_cls_perf_score
        perf_cls_pred['rcnn']['bbox_offsets'] = rcnn_bbox_perf_offsets

        loss_perfect = self._get_losses(config, prediction_dict_perf,
                                        image_size)
        loss_random = self._get_losses(config, prediction_dict_random,
                                       image_size)

        # Lower bounds that the mismatched prediction has to reach.
        min_random_losses = {
            'rcnn_cls_loss': 5,
            'rcnn_reg_loss': 3,
            'rpn_cls_loss': 5,
            'rpn_reg_loss': 3,
            'no_reg_loss': 16,
            'regularization_loss': 0,
            'total_loss': 22,
        }
        for loss_name in loss_random:
            self.assertGreaterEqual(loss_random[loss_name],
                                    min_random_losses[loss_name], loss_name)
            self.assertEqual(loss_perfect[loss_name], 0, loss_name)
Beispiel #11
0
    def _model_fn(input_fea, input_lab):
        """Creates a model, add summary, modes (train or eval), and hooks.

        This function is a closure: `train_or_eval`, `FLAGS`, `captured_hooks`,
        `captured_output_dtypes_shapes` and `mesh_context` are read from the
        enclosing scope, which is not part of this block.

        Args:
            input_fea: the input features; a non-list value is wrapped into a
                one-element list (laid_out_tensors).
            input_lab: the input labels; a non-list value is wrapped into a
                one-element list (laid_out_tensors).

        Returns:
            When `train_or_eval == 'train'`: an op grouping the loss with all
            update ops. Otherwise, with `FLAGS.use_tpu` set: the outfeed
            enqueue op for the predictions, loss and global step; without it:
            the list of predictions with the loss and global step appended.
        """

        # input_fea and input_lab should be a list (laid_out_tensors).
        if not isinstance(input_fea, list):
            input_fea = [input_fea]
        if not isinstance(input_lab, list):
            input_lab = [input_lab]

        def _add_summary(lowering, train_or_eval, tf_loss, scalars,
                         global_step):
            """Add all summaries and return the loss gated on them."""
            # Export every scalar that is not already a tf.Tensor. Iterate
            # over a snapshot of the keys because values are replaced in
            # place.
            for k in list(scalars):
                if not isinstance(scalars[k], tf.Tensor):
                    scalars[k] = tf.cast(
                        lowering.export_to_tf_tensor(scalars[k]), tf.float32)

            def _host_loss_summary(global_step, tf_loss, **scalars):
                """Add summary.scalar in host side."""
                gs = tf.cast(global_step, tf.int64)
                sum_loss = contrib_summary.scalar(
                    '{}_loss'.format(train_or_eval), tf_loss, step=gs)
                sum_ops = [sum_loss.op]
                # `dict.items()` exists on both Python 2 and Python 3, whereas
                # `dict.iteritems()` was removed in Python 3.
                for description, tf_metric in scalars.items():
                    sum_metric = contrib_summary.scalar('{}_{}'.format(
                        train_or_eval, description),
                                                        tf_metric,
                                                        step=gs)
                    sum_ops.append(sum_metric)
                # Returning the loss under these control dependencies forces
                # the summaries to run whenever the loss is evaluated.
                with tf.control_dependencies(sum_ops):
                    return tf.identity(tf_loss)

            if FLAGS.use_tpu:
                # Cast the global step to tf.int32, since
                # outside_compilation does not support tf.int64.
                tf_loss = tpu.outside_compilation(
                    _host_loss_summary, tf.cast(global_step, tf.int32),
                    tf_loss, **scalars)
            else:
                tf_loss = _host_loss_summary(tf.cast(global_step, tf.int32),
                                             tf_loss, **scalars)

            return tf_loss

        global_step = tf.train.get_or_create_global_step()
        graph, mesh, mesh_impl = mesh_context.create_graph_mesh_and_mesh_impl()

        with mtf.utils.outside_all_rewrites():
            # Do not tpu_rewrite this part. Inside this unet, if you use
            # Tensorflow instead of Mesh-Tensorflow, it will cause host to tpu
            # send/rec.
            preds, loss, scalars, bn_update_ops = (
                unet.unet_with_spatial_partition(mesh, mesh_impl,
                                                 train_or_eval, input_fea,
                                                 input_lab))

        if train_or_eval == 'train':
            var_grads = mtf.gradients(
                [loss], [v.outputs[0] for v in graph.trainable_variables])

            # Step-wise decay: the rate is multiplied by `lr_drop_rate` once
            # every `lr_drop_steps` global steps.
            lr = FLAGS.lr * tf.pow(
                FLAGS.lr_drop_rate,
                tf.floor(
                    tf.cast(global_step, tf.float32) / FLAGS.lr_drop_steps))
            scalars['learning_rate'] = lr

            optimizer = mtf.optimize.AdafactorOptimizer(learning_rate=lr)
            update_ops = optimizer.apply_grads(var_grads,
                                               graph.trainable_variables)

            # This is where the actual tf graph got built.
            lowering = mtf.Lowering(graph, {mesh: mesh_impl})

            tf_update_ops = [
                lowering.lowered_operation(op) for op in update_ops
            ]
            tf_update_ops.append(tf.assign_add(global_step, 1))
            tf_update_ops.extend(
                [lowering.lowered_operation(op) for op in bn_update_ops])

        else:  # train_or_eval == 'eval':
            preds = [mtf.anonymize(pred) for pred in preds]

            # This is where the actual tf graph got built.
            lowering = mtf.Lowering(graph, {mesh: mesh_impl})

            tf_preds = [
                tf.cast(lowering.export_to_tf_tensor(pred), tf.float32)
                for pred in preds
            ]

        tf_loss = tf.cast(lowering.export_to_tf_tensor(loss), tf.float32)
        if FLAGS.write_summary:
            tf_loss = _add_summary(lowering, train_or_eval, tf_loss, scalars,
                                   global_step)
        master_to_slice_hook = mtf.MtfRestoreHook(lowering)

        if train_or_eval == 'train':
            with mtf.utils.outside_all_rewrites():
                saver = tf.train.Saver(tf.global_variables(),
                                       save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                saver_listener = mtf.MtfCheckpointSaverListener(lowering)
                slice_to_master_hook = tf.train.CheckpointSaverHook(
                    FLAGS.checkpoint_dir,
                    save_steps=FLAGS.save_checkpoints_steps,
                    saver=saver,
                    listeners=[saver_listener])
                captured_hooks.capture(
                    [master_to_slice_hook, slice_to_master_hook])
                return tf.group([tf_loss] + tf_update_ops)

        else:  # train_or_eval == 'eval':
            if FLAGS.use_tpu:
                tf_preds.extend([tf_loss, global_step])
                # Record dtypes and shapes of everything that is enqueued;
                # presumably used by the caller to set up the matching
                # dequeue -- verify against the enclosing scope.
                tf_preds_dtypes = [tf_pred.dtype for tf_pred in tf_preds]
                tf_preds_shapes = [tf_pred.shape for tf_pred in tf_preds]
                captured_hooks.capture([master_to_slice_hook, None])
                captured_output_dtypes_shapes.capture(
                    [tf_preds_dtypes, tf_preds_shapes])
                return tpu_ops.outfeed_enqueue_tuple(tf_preds)

            else:
                tf_preds.extend([tf_loss, global_step])
                captured_hooks.capture([master_to_slice_hook, None])
                return tf_preds
def bilinear_sampler(imgs, coords):
    """Builds a new image by bilinearly sampling `imgs` at `coords`.

    The four integer corners around every coordinate are clamped to the valid
    index range of the source image, gathered from the flattened image and
    blended with weights derived from the clamped corners.

    NOTE(review): the previous documentation stated that points falling
    outside the source image boundary have value 0. The active code applies
    no such masking (the masked weights were disabled), so verify the intended
    behavior for out-of-range coordinates.

    Args:
      imgs: source image to be sampled from [batch, height_s, width_s,
        channels].
      coords: coordinates of source pixels to sample from [batch, height_t,
        width_t, 2]. height_t/width_t are the dimensions of the output image
        (they don't need to equal height_s/width_s). The two channels are the
        x and y coordinates respectively.

    Returns:
      A new sampled image [batch, height_t, width_t, channels].
    """

    def _repeat(x, n_repeats):
        """Repeats each element of `x` `n_repeats` times, returned flat."""
        ones_row = tf.ones(shape=tf.stack([n_repeats]))
        ones_row = tf.transpose(tf.expand_dims(ones_row, 1), [1, 0])
        ones_row = tf.cast(ones_row, 'float32')
        # Outer product of the column vector `x` with a row of ones.
        tiled = tf.matmul(tf.reshape(x, (-1, 1)), ones_row)
        return tf.reshape(tiled, [-1])

    with tf.name_scope('image_sampling'):
        coords_x, coords_y = tf.split(coords, [1, 1], axis=3)
        img_shape = imgs.get_shape()
        coord_shape = coords.get_shape()
        # Output shape: spatial layout of `coords`, channel count of `imgs`.
        out_shape = coord_shape.as_list()
        out_shape[3] = img_shape.as_list()[3]

        coords_x = tf.cast(coords_x, 'float32')
        coords_y = tf.cast(coords_y, 'float32')

        # Integer corners surrounding every sample coordinate.
        x_lo = tf.floor(coords_x)
        x_hi = x_lo + 1
        y_lo = tf.floor(coords_y)
        y_hi = y_lo + 1

        max_y = tf.cast(tf.shape(imgs)[1] - 1, 'float32')
        max_x = tf.cast(tf.shape(imgs)[2] - 1, 'float32')
        zero = tf.zeros([1], dtype='float32')

        def _clamp(values, upper):
            """Clips `values` into the valid index range [0, upper]."""
            return tf.clip_by_value(values, zero, upper)

        x_lo_safe = _clamp(x_lo, max_x)
        y_lo_safe = _clamp(y_lo, max_y)
        x_hi_safe = _clamp(x_hi, max_x)
        y_hi_safe = _clamp(y_hi, max_y)

        # Interpolation weights measured against the clamped corners. No
        # extra mask zeroes out samples whose corners had to be clamped.
        weight_x_lo = x_hi_safe - coords_x
        weight_x_hi = coords_x - x_lo_safe
        weight_y_lo = y_hi_safe - coords_y
        weight_y_hi = coords_y - y_lo_safe

        # Offsets into the flattened [batch * height * width, channels] image.
        row_stride = tf.cast(img_shape[2], 'float32')
        image_stride = tf.cast(img_shape[2] * img_shape[1], 'float32')
        batch_offsets = tf.cast(tf.range(coord_shape[0]),
                                'float32') * image_stride
        base = tf.reshape(
            _repeat(batch_offsets, coord_shape[1] * coord_shape[2]),
            [out_shape[0], out_shape[1], out_shape[2], 1])

        base_y_lo = base + y_lo_safe * row_stride
        base_y_hi = base + y_hi_safe * row_stride
        # First digit of the suffix selects the x corner, second the y corner.
        idx00 = tf.reshape(x_lo_safe + base_y_lo, [-1])
        idx01 = x_lo_safe + base_y_hi
        idx10 = x_hi_safe + base_y_lo
        idx11 = x_hi_safe + base_y_hi

        imgs_flat = tf.reshape(imgs, tf.stack([-1, img_shape[3]]))
        imgs_flat = tf.cast(imgs_flat, 'float32')

        def _fetch(flat_idx):
            """Gathers the pixels at `flat_idx`, shaped like the output."""
            return tf.reshape(
                tf.gather(imgs_flat, tf.cast(flat_idx, 'int32')), out_shape)

        im00 = _fetch(idx00)
        im01 = _fetch(idx01)
        im10 = _fetch(idx10)
        im11 = _fetch(idx11)

        w00 = weight_x_lo * weight_y_lo
        w01 = weight_x_lo * weight_y_hi
        w10 = weight_x_hi * weight_y_lo
        w11 = weight_x_hi * weight_y_hi

        return tf.add_n([w00 * im00, w01 * im01, w10 * im10, w11 * im11])