Exemple #1
0
 def _preprocess(features):
   """Fetches the image from `features` and optionally rescales and crops it.

   Reads `tfds_arguments`, `crop_size`, `crop_size_float`, `smallest_fac` and
   `biggest_fac` from the enclosing scope.

   Args:
     features: Mapping that is indexed with `tfds_arguments.features_key`.

   Returns:
     The image unchanged when `crop_size` is falsy; otherwise a random
     `crop_size` x `crop_size` x 3 crop of the randomly rescaled image.
   """
   picture = features[tfds_arguments.features_key]
   if crop_size:
     tf.logging.info("Scaling down %s and cropping to %d x %d", picture,
                     crop_size, crop_size)
     with tf.name_scope("random_scale"):
       # The random rescale is meant to remove JPG artifacts. The factor is
       # bounded from below so that the shorter side never ends up smaller
       # than `crop_size`; images that start out smaller are upscaled.
       dims = tf.cast(tf.shape(picture), tf.float32)
       rows, cols = dims[0], dims[1]
       shorter = tf.math.minimum(rows, cols)
       # Factor that maps the shorter side exactly onto `crop_size`
       # (greater than 1 for images smaller than the crop).
       fit_factor = crop_size_float / shorter
       lower = tf.math.maximum(smallest_fac, fit_factor)
       # Keep the upper bound at least as large as the lower bound.
       upper = tf.math.maximum(lower, biggest_fac)
       factor = tf.random_uniform([],
                                  minval=lower,
                                  maxval=upper,
                                  dtype=tf.float32,
                                  seed=42,
                                  name=None)
       target_size = [tf.ceil(factor * rows), tf.ceil(factor * cols)]
       picture = tf.image.resize_images(picture, target_size)
     with tf.name_scope("random_crop"):
       picture = tf.image.random_crop(picture, [crop_size, crop_size, 3])
   return picture
Exemple #2
0
def resample_voxels(v, xs, ys, zs, method="trilinear"):
    """Samples the voxel grid `v` at the given (possibly fractional) coordinates.

    Args:
      v: Voxel data, forwarded unchanged to `get_voxel_values`.
      xs, ys, zs: Float coordinate tensors to sample at.
      method: "trilinear" blends the eight surrounding grid points;
        "nearest" reads the grid point obtained by rounding each coordinate.

    Returns:
      Whatever `get_voxel_values` returns, blended (trilinear) or looked up
      directly (nearest).

    Raises:
      NameError: if `method` is neither "trilinear" nor "nearest".
    """
    if method == "trilinear":
        # Coordinate range used for clipping (same bounds as before).
        lower_bound, upper_bound = 0, 64

        def axis_terms(coords):
            """Returns (low index, high index, low weight, high weight)."""
            clipped = tf.clip_by_value(coords, lower_bound, upper_bound)
            low = tf.floor(clipped)
            # Use floor + 1 rather than ceil so the two corners stay distinct
            # at integer coordinates; cap it so indices never leave the range.
            high = tf.minimum(low + 1, upper_bound)
            # Weights are derived from the *clipped* coordinate and always
            # sum to one. At an integer coordinate the low corner gets the
            # full weight instead of both weights collapsing to zero.
            high_weight = clipped - low
            return low, high, 1 - high_weight, high_weight

        x0, x1, wx0, wx1 = axis_terms(xs)
        y0, y1, wy0, wy1 = axis_terms(ys)
        z0, z1, wz0, wz1 = axis_terms(zs)

        final_value = (
            wx1 * wy1 * wz1 * get_voxel_values(v, x1, y1, z1) +
            wx1 * wy1 * wz0 * get_voxel_values(v, x1, y1, z0) +
            wx1 * wy0 * wz1 * get_voxel_values(v, x1, y0, z1) +
            wx1 * wy0 * wz0 * get_voxel_values(v, x1, y0, z0) +
            wx0 * wy1 * wz1 * get_voxel_values(v, x0, y1, z1) +
            wx0 * wy1 * wz0 * get_voxel_values(v, x0, y1, z0) +
            wx0 * wy0 * wz1 * get_voxel_values(v, x0, y0, z1) +
            wx0 * wy0 * wz0 * get_voxel_values(v, x0, y0, z0)
        )
        return final_value

    elif method == "nearest":
        r_xs = tf.round(xs)
        r_ys = tf.round(ys)
        r_zs = tf.round(zs)
        return get_voxel_values(v, r_xs, r_ys, r_zs)

    else:
        raise NameError(method)
Exemple #3
0
def transformCropImage(opt, image, pMtrx):
    """Warps `image` with a per-sample 3x3 transform using bilinear sampling.

    A canonical [-1, 1] x [-1, 1] grid of size `opt.H` x `opt.W` is pushed
    through `opt.refMtrx_b @ pMtrx`, and the source image is sampled at the
    resulting coordinates. Samples that fall outside the
    `opt.dataH` x `opt.dataW` source area read an all-zero pixel.

    Args:
      opt: Options object providing `batchSize`, `H`, `W`, `dataH`, `dataW`
        and `refMtrx_b`.
      image: Source image tensor; it is flattened to rows of 3 channels.
      pMtrx: Batch of transform matrices, multiplied on the right of the
        tiled reference matrix.

    Returns:
      Float tensor of shape [opt.batchSize, opt.H, opt.W, 3].
    """
    batch, out_h, out_w = opt.batchSize, opt.H, opt.W
    src_h, src_w = opt.dataH, opt.dataW
    grid_shape = [batch, out_h, out_w]
    with tf.name_scope("transformImage"):
        tiled_ref = tf.tile(tf.expand_dims(opt.refMtrx_b, axis=0),
                            [batch, 1, 1])
        warp_mtrx = tf.matmul(tiled_ref, pMtrx)

        # Canonical homogeneous grid coordinates, one column per output pixel.
        grid_x, grid_y = np.meshgrid(np.linspace(-1, 1, out_w),
                                     np.linspace(-1, 1, out_h))
        flat_x, flat_y = grid_x.flatten(), grid_y.flatten()
        hom_grid = np.stack([flat_x, flat_y, np.ones_like(flat_x)], axis=1).T
        hom_grid = np.tile(hom_grid, [batch, 1, 1]).astype(np.float32)

        # Warp and de-homogenise (epsilon guards against division by zero).
        warped = tf.matmul(warp_mtrx, hom_grid)
        hom_x, hom_y, hom_z = tf.unstack(warped, axis=1)
        x_warp = tf.reshape(hom_x / (hom_z + 1e-8), grid_shape)
        y_warp = tf.reshape(hom_y / (hom_z + 1e-8), grid_shape)

        # Integer coordinates of the four neighbouring source pixels.
        x_lo, x_hi = tf.floor(x_warp), tf.ceil(x_warp)
        y_lo, y_hi = tf.floor(y_warp), tf.ceil(y_warp)
        x_lo_i, x_hi_i = tf.to_int32(x_lo), tf.to_int32(x_hi)
        y_lo_i, y_hi_i = tf.to_int32(y_lo), tf.to_int32(y_hi)

        sample_idx = np.tile(
            np.arange(batch).reshape([batch, 1, 1]), [1, out_h, out_w])
        # Flattened pixels with one extra zero row appended at the end; that
        # row is what out-of-image lookups are redirected to.
        pixels = tf.reshape(image, [-1, 3])
        pixels = tf.concat([pixels, tf.zeros([1, 3])], axis=0)

        def flat_index(y_int, x_int):
            """Row index into `pixels` for pixel (y, x) of each sample."""
            return (sample_idx * src_h + y_int) * src_w + x_int

        idx_ul = flat_index(y_lo_i, x_lo_i)
        idx_ur = flat_index(y_lo_i, x_hi_i)
        idx_bl = flat_index(y_hi_i, x_lo_i)
        idx_br = flat_index(y_hi_i, x_hi_i)
        zero_row = tf.fill(grid_shape, batch * src_h * src_w)

        def inside(x_int, y_int):
            """True where the integer coordinate lies within the source."""
            return ((x_int >= 0) & (x_int < src_w) & (y_int >= 0) &
                    (y_int < src_h))

        idx_ul = tf.where(inside(x_lo_i, y_lo_i), idx_ul, zero_row)
        idx_ur = tf.where(inside(x_hi_i, y_lo_i), idx_ur, zero_row)
        idx_bl = tf.where(inside(x_lo_i, y_hi_i), idx_bl, zero_row)
        idx_br = tf.where(inside(x_hi_i, y_hi_i), idx_br, zero_row)

        # Bilinear blend of the four neighbours.
        frac_x = tf.reshape(x_warp - x_lo, grid_shape + [1])
        frac_y = tf.reshape(y_warp - y_lo, grid_shape + [1])

        def corner(idx, weight_x, weight_y):
            """Gathers one neighbour and applies its two axis weights."""
            return tf.to_float(tf.gather(pixels, idx)) * weight_x * weight_y

        part_ul = corner(idx_ul, (1 - frac_x), (1 - frac_y))
        part_ur = corner(idx_ur, (frac_x), (1 - frac_y))
        part_bl = corner(idx_bl, (1 - frac_x), (frac_y))
        part_br = corner(idx_br, (frac_x), (frac_y))
        warped_image = part_ul + part_ur + part_bl + part_br
    return warped_image
Exemple #4
0
  def max_pad_length(self, features):
    """Finds max padding length.

    If target length not specified use fixed padding
    length from hparams.max_length.

    Args:
      features: Dictionary with input and target tensors

    Returns:
      `hparams.max_length` itself when the fixed length is used, otherwise a
      scalar int32 tf.Tensor holding the smallest power of 2 that is at least
      the longer of the input and target sequence lengths.

    Raises:
      AssertionError: if the fixed length is used and `hparams.max_length`
        is not a power of 2.
    """

    if self.hparams.force_max_length or features.get("targets") is None:
      max_length = self.hparams.max_length
      # Exact integer test. `math.log(x, 2).is_integer()` depends on floating
      # point rounding and can reject genuine powers of two.
      is_power_of_two = (
          max_length >= 1 and int(max_length) == max_length and
          (int(max_length) & (int(max_length) - 1)) == 0)
      assert is_power_of_two, "hparams.max_length should be power of 2"

      return max_length

    length = tf.shape(features["inputs"])[1]
    targets_length = tf.shape(features["targets"])[1]
    length = tf.maximum(length, targets_length)

    # Round the length up to the next power of two: 2 ** ceil(log2(length)).
    p = tf.log(tf.cast(length, tf.float32)) / tf.log(2.0)
    p = tf.cast(tf.ceil(p), tf.int32)
    return tf.pow(2, p)
Exemple #5
0
def predict_target_lengths(encoder_output,
                           inputs_mask,
                           hparams,
                           length_diff=None):
    """Predicts target lengths, rounded up to a multiple of 4.

    Without `hparams.predict_target_length` the target length is simply the
    input length (sum of `inputs_mask` along axis 1). Otherwise a length
    difference is classified from the mean-pooled encoder output (with
    gradients stopped) and added to the input length, with a floor of 1.

    Args:
      encoder_output: Encoder states, pooled with `gops.reduce_mean_over_l`.
      inputs_mask: Mask whose sum along axis 1 gives the input lengths.
      hparams: Provides `lendiff_bound`, `predict_target_length` and
        `hidden_size`.
      length_diff: Optional true length differences. When given, a
        cross-entropy loss is computed against them after clipping to
        [-bound, bound] and shifting by +bound.

    Returns:
      (targets_length, loss): an int32 length tensor and either the mean
      cross-entropy loss or None.
    """
    max_diff = hparams.lendiff_bound
    source_lengths = tf.cast(tf.reduce_sum(inputs_mask, 1), tf.int32)
    predicted_lengths, length_loss = source_lengths, None

    if hparams.predict_target_length:
        pooled = gops.reduce_mean_over_l(encoder_output, inputs_mask)
        # The length predictor must not back-propagate into the encoder.
        diff_logits = lenpred_mlp("lenpred", tf.stop_gradient(pooled),
                                  hparams.hidden_size, max_diff)
        if length_diff is not None:
            clipped = tf.maximum(tf.minimum(length_diff, max_diff), -max_diff)
            # Shift into [0, 2 * max_diff] so the values are valid class ids.
            classes = tf.cast(clipped + max_diff, tf.int32)
            classes = tf.stop_gradient(classes)
            per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=classes, logits=diff_logits)
            length_loss = tf.reduce_mean(per_example)
        best_class = tf.argmax(diff_logits, 1)
        # Undo the +max_diff shift to recover a signed difference.
        predicted_diff = tf.cast(best_class - max_diff, tf.int32)
        predicted_lengths = tf.maximum(source_lengths + predicted_diff, 1)

    multiple = 4
    rounded = tf.ceil(predicted_lengths / multiple) * multiple
    return tf.cast(rounded, tf.int32), length_loss
Exemple #6
0
def crop_or_pad(waves, length, channels):
    """Forces a batch of waves to shape [N, length, channels].

    Short waves are zero-padded on both sides (the extra sample, if any, goes
    to the left); long waves keep their first `length` samples. Channels are
    produced by repeating the existing channels and truncating.

    Args:
      waves: A 3D `Tensor` of NLC format.
      length: A Python scalar. The output wave size.
      channels: Number of output waves channels.

    Returns:
      A 3D `Tensor` of NLC format with shape [N, length, channels].
    """
    waves = tf.convert_to_tensor(waves)
    static_batch = waves.shape[0].value
    dynamic_shape = tf.shape(waves)

    # Length: pad up to `length` if needed, then cut anything beyond it.
    missing = tf.maximum(0, length - dynamic_shape[1])
    pad_after = tf.to_int32(tf.to_float(missing) / 2.0)
    pad_before = missing - pad_after
    padded = tf.pad(waves, [[0, 0], [pad_before, pad_after], [0, 0]])
    fixed_length = padded[:, :length, :]

    # Channels: tile often enough to cover `channels`, then cut the surplus.
    copies = tf.to_int32(
        tf.ceil(tf.to_float(channels) / tf.to_float(dynamic_shape[2])))
    result = tf.tile(fixed_length, [1, 1, copies])[:, :, :channels]

    result.set_shape([static_batch, length, channels])
    return result
Exemple #7
0
 def _anchor_component(self):
     """Builds the anchor tensors and stores them on the instance.

     The feature-map height and width are the image height and width (read
     from `self._im_info[0, 0]` and `self._im_info[0, 1]`) divided by the
     first feature stride, rounded up. `generate_anchors_pre` is run through
     `tf.py_func`; its outputs are saved as `self._anchors` (static shape
     [None, 4]) and `self._anchor_length` (scalar).
     """
     stride = np.float32(self._feat_stride[0])
     with tf.variable_scope('ANCHOR_' + 'default'):

         def grid_cells(extent):
             """Number of stride-sized cells needed to cover `extent`."""
             return tf.to_int32(tf.ceil(extent / stride))

         # just to get the shape right
         height = grid_cells(self._im_info[0, 0])
         width = grid_cells(self._im_info[0, 1])
         py_inputs = [
             height, width, self._feat_stride, self._anchor_scales,
             self._anchor_ratios
         ]
         anchors, anchor_length = tf.py_func(generate_anchors_pre,
                                             py_inputs,
                                             [tf.float32, tf.int32],
                                             name="generate_anchors")
         # py_func loses static shape information; restore what is known.
         anchors.set_shape([None, 4])
         anchor_length.set_shape([])
         self._anchors = anchors
         self._anchor_length = anchor_length
def _batch_stitch(features, mean_length=4.0, stddev=2.0):
    """Stitches a batch of single-step data to a batch of multi-step data.

    The batch is padded, randomly shuffled and reshaped into
    [num_sequences, max_length, ...]; sequences that contain only padding are
    dropped, and the steps of every remaining sequence are reordered so that
    steps flagged by the step mask come first. The result is passed to
    `_stitch`.

    Note: `features` is modified in place before being handed to `_stitch`.

    Args:
      features: Dict of tensors sharing the same leading (batch) dimension;
        must contain the key 'task'.
      mean_length: Mean of the truncated normal used to draw sequence lengths.
      stddev: Standard deviation of that truncated normal.

    Returns:
      The value returned by `_stitch(features)`.
    """
    batch_size = common_layers.shape_list(features['task'])[0]
    num_sequences = tf.maximum(
        tf.to_int32(tf.to_float(batch_size) / mean_length), 1)
    lengths = tf.random.truncated_normal(shape=[num_sequences],
                                         mean=mean_length,
                                         stddev=stddev)
    # Rescale the longest drawn length so the lengths sum to the batch size.
    max_length = tf.reduce_max(lengths) * (tf.to_float(batch_size) /
                                           tf.reduce_sum(lengths))
    max_length = tf.to_int32(tf.ceil(max_length))
    total_items = max_length * num_sequences
    num_paddings = total_items - batch_size
    indices = tf.random.shuffle(tf.range(total_items))
    padding_check = tf.assert_greater_equal(num_paddings,
                                            0,
                                            name='num_paddings_positive')
    for key in features:
        shape_list = common_layers.shape_list(features[key])
        assert len(shape_list) >= 1
        paddings = [[0, num_paddings]] + [[0, 0]] * (len(shape_list) - 1)
        # The pad op itself must be created inside the control-dependency
        # context; wrapping only the Python list above attaches the assertion
        # to nothing, so it would never execute in graph mode.
        with tf.control_dependencies([padding_check]):
            features[key] = tf.pad(
                features[key],
                paddings,
                constant_values=-1 if key == 'obj_type' else 0)
        features[key] = tf.gather(features[key], indices)
        shape = [num_sequences, max_length]
        if len(shape_list) >= 2:
            shape += shape_list[1:]
        features[key] = tf.reshape(features[key], shape)
    # Remove all-padding seqs
    step_mask = tf.reduce_any(tf.greater(features['task'], 1), axis=-1)
    mask = tf.reduce_any(step_mask, axis=-1)
    step_mask = tf.boolean_mask(step_mask, mask)
    for key in features:
        features[key] = tf.boolean_mask(features[key], mask=mask)
    num_sequences = tf.shape(features['task'])[0]
    # Sort steps within each seq
    _, step_indices = tf.math.top_k(tf.to_int32(step_mask), k=max_length)
    # Turn per-sequence step positions into positions in the flattened
    # [num_sequences * max_length, ...] layout.
    step_indices = step_indices + tf.expand_dims(
        tf.range(num_sequences) * max_length, 1)
    step_indices = tf.reshape(step_indices, [-1])
    for key in features:
        shape_list = common_layers.shape_list(features[key])
        features[key] = tf.gather(
            tf.reshape(features[key], [-1] + shape_list[2:]), step_indices)
        features[key] = tf.reshape(features[key], shape_list)
    features = _stitch(features)
    return features
Exemple #9
0
def StitchImages(images):
  """Tiles a batch of images into one grid image.

  The batch is zero-padded up to the next square number n * n and laid out
  row by row on an n x n grid: image `b` lands in grid row `b // n`, grid
  column `b % n`.

  Args:
    images: Tensor of shape [batch, height, width, channels]. Height and
      width may differ.

  Returns:
    Tensor of shape [1, n * height, n * width, channels].
  """
  batch, height, width, channels = tf.unstack(tf.shape(images))
  num_per_side = tf.to_int32(tf.ceil(tf.sqrt(tf.to_float(batch))))
  grid_height = num_per_side * height
  grid_width = num_per_side * width

  # Append blank images so the batch fills the whole grid.
  num_blank = num_per_side * num_per_side - batch
  images = tf.pad(images, [[0, num_blank], [0, 0], [0, 0], [0, 0]])

  # [B, H, W, C] -> [H, B, W, C] -> [H, n, n * W, C]: every pixel row now
  # holds n complete grid rows, each n images wide.
  images = tf.transpose(images, [1, 0, 2, 3])
  images = tf.reshape(images, [height, num_per_side, grid_width, channels])
  # -> [n, H, n * W, C] -> [1, n * H, n * W, C]: stack the grid rows.
  images = tf.transpose(images, [1, 0, 2, 3])
  images = tf.reshape(images, [1, grid_height, grid_width, channels])

  return images
Exemple #10
0
def sample_mask_indices(tokens, mask_rate, mask_blacklist, max_num_to_mask):
    """Randomly picks token positions to mask.

    The number of positions is ceil(mask_rate * num_tokens), raised to at
    least 1 whenever `mask_rate` is positive and capped at `max_num_to_mask`.
    Blacklisted tokens still count towards `num_tokens` but are never chosen.

    Args:
      tokens (Tensor): 1-D string Tensor.
      mask_rate (float): percentage of tokens to mask.
      mask_blacklist (Tensor): 1-D string Tensor of tokens to NEVER mask.
      max_num_to_mask (int): max # of masks.

    Returns:
      mask_indices (Tensor): 1-D int32 Tensor of indices to mask.

    Raises:
      ValueError: if `mask_rate` is outside [0, 1].
    """
    if mask_rate > 1 or mask_rate < 0:
        raise ValueError("mask_rate must be within [0, 1].")

    token_count = tf.size(tokens)
    budget = tf.to_int32(tf.ceil(mask_rate * tf.to_float(token_count)))
    if mask_rate > 0:
        # Masking is enabled, so always mask at least one token.
        budget = tf.maximum(budget, 1)
    budget = tf.minimum(budget, max_num_to_mask)

    # Compare every token against every blacklist entry (blacklist along
    # axis 0); a position is maskable only if it matches none of them.
    blacklist_column = mask_blacklist[:, None]
    maskable = tf.reduce_all(tf.not_equal(tokens, blacklist_column), axis=0)
    candidates = tf.boolean_mask(tf.range(token_count), maskable)

    # A shuffled prefix is a random sample without replacement.
    return tf.random.shuffle(candidates)[:budget]
Exemple #11
0
def sorted_non_max_suppression_padded(scores, boxes, max_output_size,
                                      iou_threshold):
    """A wrapper that handles non-maximum suppression.

  Assumption:
    * The boxes are sorted by scores unless the box is a dot (all coordinates
      are zero).
    * Boxes with higher scores can be used to suppress boxes with lower scores.

  The overall design of the algorithm is to handle boxes tile-by-tile:

  boxes = boxes.pad_to_multiply_of(tile_size)
  num_tiles = len(boxes) // tile_size
  output_boxes = []
  for i in range(num_tiles):
    box_tile = boxes[i*tile_size : (i+1)*tile_size]
    for j in range(i - 1):
      suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
      iou = bbox_overlap(box_tile, suppressing_tile)
      # if the box is suppressed in iou, clear it to a dot
      box_tile *= _update_boxes(iou)
    # Iteratively handle the diagonal tile.
    iou = _box_overlap(box_tile, box_tile)
    iou_changed = True
    while iou_changed:
      # boxes that are not suppressed by anything else
      suppressing_boxes = _get_suppressing_boxes(iou)
      # boxes that are suppressed by suppressing_boxes
      suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
      # clear iou to 0 for boxes that are suppressed, as they cannot be used
      # to suppress other boxes any more
      new_iou = _clear_iou(iou, suppressed_boxes)
      iou_changed = (new_iou != iou)
      iou = new_iou
    # remaining boxes that can still suppress others, are selected boxes.
    output_boxes.append(_get_suppressing_boxes(iou))
    if len(output_boxes) >= max_output_size:
      break

  The per-tile work is delegated to `_suppression_loop_body`; this wrapper
  pads the inputs to a multiple of `NMS_TILE_SIZE`, drives the loop and
  gathers the surviving boxes.

  Args:
    scores: a tensor with a shape of [batch_size, anchors].
    boxes: a tensor with a shape of [batch_size, anchors, 4].
    max_output_size: a scalar integer `Tensor` representing the maximum number
      of boxes to be selected by non max suppression.
    iou_threshold: a float representing the threshold for deciding whether boxes
      overlap too much with respect to IOU.

  Returns:
    nms_scores: a float32 tensor with a shape of
      [batch_size, max_output_size]; slots beyond the number of selected
      boxes are zero.
    nms_proposals: a float32 tensor with a shape of
      [batch_size, max_output_size, 4]; slots beyond the number of selected
      boxes are zero.
  """
    batch = tf.shape(boxes)[0]
    raw_count = tf.shape(boxes)[1]

    # Pad the box axis up to the next multiple of the tile size.
    tiles_needed = tf.ceil(tf.cast(raw_count, tf.float32) / NMS_TILE_SIZE)
    extra = tf.cast(tiles_needed, tf.int32) * NMS_TILE_SIZE - raw_count
    boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, extra], [0, 0]])
    scores = tf.pad(tf.cast(scores, tf.float32), [[0, 0], [0, extra]])
    padded_count = raw_count + extra

    def _keep_going(unused_boxes, unused_threshold, kept_per_image, tile_idx):
        # Continue while some image still needs boxes and tiles remain.
        need_more = tf.reduce_min(kept_per_image) < max_output_size
        tiles_left = tile_idx < padded_count // NMS_TILE_SIZE
        return tf.logical_and(need_more, tiles_left)

    loop_vars = [
        boxes, iou_threshold,
        tf.zeros([batch], tf.int32),
        tf.constant(0)
    ]
    survivors, _, kept_per_image, _ = tf.while_loop(
        _keep_going, _suppression_loop_body, loop_vars)

    # A box counts as kept when any of its coordinates is positive. Weighting
    # the kept flag with a descending rank lets top_k return the kept boxes
    # in their original order.
    is_kept = tf.cast(tf.reduce_any(survivors > 0, [2]), tf.int32)
    descending_rank = tf.expand_dims(tf.range(padded_count, 0, -1), 0)
    top_ranks = tf.nn.top_k(is_kept * descending_rank, max_output_size)[0]
    gather_idx = padded_count - tf.cast(top_ranks, tf.int32)
    # Slots without a kept box have rank 0 and would index one past the end.
    gather_idx = tf.minimum(gather_idx, padded_count - 1)
    batch_offset = tf.reshape(tf.range(batch) * padded_count, [-1, 1])
    flat_idx = tf.reshape(gather_idx + batch_offset, [-1])

    slot = tf.range(max_output_size)

    picked_boxes = tf.gather(tf.reshape(boxes, [-1, 4]), flat_idx)
    picked_boxes = tf.reshape(picked_boxes, [batch, max_output_size, 4])
    box_is_valid = (tf.reshape(slot, [1, -1, 1]) <
                    tf.reshape(kept_per_image, [-1, 1, 1]))
    picked_boxes = picked_boxes * tf.cast(box_is_valid, picked_boxes.dtype)

    picked_scores = tf.gather(tf.reshape(scores, [-1, 1]), flat_idx)
    picked_scores = tf.reshape(picked_scores, [batch, max_output_size])
    score_is_valid = (tf.reshape(slot, [1, -1]) <
                      tf.reshape(kept_per_image, [-1, 1]))
    picked_scores = picked_scores * tf.cast(score_is_valid,
                                            picked_scores.dtype)
    return picked_scores, picked_boxes
 def compute_num_leapfrog_steps(self, step_size):
     """Returns ceil(self.trajectory_length / step_size) cast to int64."""
     fractional_steps = self.trajectory_length / step_size
     return tf.cast(tf.ceil(fractional_steps), tf.int64)
Exemple #13
0
def randomly_crop_and_resize(image,
                             masks,
                             boxes,
                             keypoints,
                             image_size,
                             probability=0.5):
    """
    With the given probability take a random crop of the image, then
    resize the result to `image_size`. Masks, boxes and keypoints are
    transformed to stay consistent with the image.

    Arguments:
        image: a float tensor with shape [height, width, 3].
        masks: a float tensor with shape [height / DOWNSAMPLE, width / DOWNSAMPLE, 2].
        boxes: a float tensor with shape [num_persons, 4].
        keypoints: an int tensor with shape [num_persons, 17, 3].
        image_size: a tuple of integers (h, w).
        probability: a float number.
    Returns:
        image: a float tensor with shape [h, w, 3].
        masks: a float tensor with shape [h / DOWNSAMPLE, w / DOWNSAMPLE, 2].
        boxes: a float tensor with shape [num_remaining, 4].
        keypoints: an int tensor with shape [num_remaining, 17, 3].
    """

    input_hw = tf.to_float(tf.shape(image))
    height, width = input_hw[0], input_hw[1]
    scaler = tf.stack([height, width, height, width])
    boxes /= scaler  # pixel coordinates -> [0, 1] range

    def crop(image, boxes, keypoints):
        """
        Take a random crop and move the keypoints into its frame.

        Arguments:
            image: a float tensor with shape [height, width, 3].
            boxes: a float tensor with shape [num_persons, 4].
            keypoints: an int tensor with shape [num_persons, 17, 3].
        Returns:
            image: a float tensor with shape [None, None, 3].
            boxes: a float tensor with shape [num_remaining, 4].
            keypoints: an int tensor with shape [num_remaining, 17, 3].
            window: a float tensor with shape [4].
        """

        image, boxes, window, kept = random_image_crop(
            image,
            boxes,
            min_object_covered=0.9,
            aspect_ratio_range=(0.95, 1.05),
            area_range=(0.5, 1.0),
            overlap_threshold=OVERLAP_THRESHOLD)

        # keep only the persons that survived the crop,
        # the result has shape [num_remaining, 17, 3]
        survivors = tf.gather(keypoints, kept)

        # the window is normalized, convert it back to pixels
        ymin, xmin, _, _ = tf.unstack(window * scaler)
        yx, visibility = tf.split(survivors, [2, 1], axis=2)
        yx = tf.to_float(yx)  # shape [num_remaining, 17, 2]

        # shift the keypoints by the top left corner of the window
        origin = tf.stack([ymin, xmin])
        yx = tf.to_int32(tf.round(yx - origin))

        # some keypoints may now lie outside of the crop,
        # their visibility is corrected later
        shifted = tf.concat([yx, visibility], axis=2)
        return image, boxes, shifted, window

    whole_image_window = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32)
    do_it = tf.less(tf.random_uniform([]), probability)

    def cropped():
        return crop(image, boxes, keypoints)

    def untouched():
        return image, boxes, keypoints, whole_image_window

    image, boxes, keypoints, window = tf.cond(do_it, cropped, untouched)

    def correct_keypoints(image_shape, keypoints):
        """
        Zero the `v` component of keypoints that lie outside of the image.

        Arguments:
            image_shape: an int tensor with shape [3].
            keypoints: an int tensor with shape [num_persons, 17, 3].
        Returns:
            an int tensor with shape [num_persons, 17, 3].
        """
        y, x, v = tf.split(keypoints, 3, axis=2)
        limit_y, limit_x = image_shape[0], image_shape[1]

        # shape [num_persons, 17, 4], one flag per violated bound
        violations = tf.concat([
            tf.less(y, 0),
            tf.less(x, 0),
            tf.greater_equal(y, limit_y),
            tf.greater_equal(x, limit_x)
        ], axis=2)

        # shape [num_persons, 17, 1]
        inside = tf.logical_not(tf.reduce_any(violations, axis=2))
        inside = tf.expand_dims(inside, 2)

        v *= tf.to_int32(inside)
        return tf.concat([y, x, v], axis=2)

    def rescale(boxes, keypoints, old_shape, new_shape):
        """
        Bring normalized boxes and pixel keypoints to the new image size.

        Arguments:
            boxes: a float tensor with shape [num_persons, 4].
            keypoints: an int tensor with shape [num_persons, 17, 3].
            old_shape, new_shape: int tensors with shape [3].
        Returns:
            a float tensor with shape [num_persons, 4].
            an int tensor with shape [num_persons, 17, 3].
        """
        yx, visibility = tf.split(keypoints, [2, 1], axis=2)
        yx = tf.to_float(yx)

        old_hw = tf.to_float(old_shape)
        new_hw = tf.to_float(new_shape)
        old_h, old_w = old_hw[0], old_hw[1]
        new_h, new_w = new_hw[0], new_hw[1]

        # keypoints are in pixels of the old image
        yx *= tf.stack([new_h / old_h, new_w / old_w])

        # boxes are normalized, so they only need the new size
        box_factor = tf.stack([new_h, new_w])
        box_factor = tf.concat(2 * [box_factor], axis=0)
        boxes *= box_factor

        last_row = tf.to_int32(new_h) - 1
        last_column = tf.to_int32(new_w) - 1

        yx = tf.to_int32(tf.round(yx))
        y, x = tf.split(yx, 2, axis=2)
        y = tf.clip_by_value(y, 0, last_row)
        x = tf.clip_by_value(x, 0, last_column)
        return boxes, tf.concat([y, x, visibility], axis=2)

    old_shape = tf.shape(image)
    keypoints = correct_keypoints(old_shape, keypoints)

    h, w = image_size  # image size that will be used for training
    image = tf.image.resize_images(image, [h, w], method=RESIZE_METHOD)

    masks_height = tf.to_int32(tf.ceil(h / DOWNSAMPLE))
    masks_width = tf.to_int32(tf.ceil(w / DOWNSAMPLE))

    # cut the same (normalized) window out of the masks and resize it
    resized_masks = tf.image.crop_and_resize(
        image=tf.expand_dims(masks, 0),
        boxes=tf.expand_dims(window, 0),
        box_indices=tf.constant([0], dtype=tf.int32),
        crop_size=[masks_height, masks_width],
        method='nearest')
    masks = resized_masks[0]

    boxes, keypoints = rescale(boxes, keypoints, old_shape, tf.shape(image))
    return image, masks, boxes, keypoints
Exemple #14
0
 def scale(x):
     """Scales `x` by `scale_factor` and computes padding up to `divisor`.

     `scale_factor` and `divisor` come from the enclosing scope.

     Returns:
         (scaled size, padding) where scaled size + padding is the smallest
         multiple of `divisor` that is not below the scaled size.
     """
     resized = tf.to_int32(tf.round(tf.to_float(x) * scale_factor))
     num_blocks = tf.to_int32(tf.ceil(resized / divisor))
     padding = divisor * num_blocks - resized
     return (resized, padding)
Exemple #15
0
def resize_keeping_aspect_ratio(image, masks, boxes, keypoints, min_dimension,
                                divisor):
    """
    Resize so that the shorter image side equals `min_dimension`, then
    zero-pad the longer side (bottom or right) up to a multiple of `divisor`.
    When using a usual FPN, divisor must be equal to 128.

    Arguments:
        image: a float tensor with shape [height, width, 3].
        masks: a float tensor with shape [height / DOWNSAMPLE, width / DOWNSAMPLE, 2].
        boxes: a float tensor with shape [num_persons, 4].
        keypoints: an int tensor with shape [num_persons, 17, 3].
        min_dimension, divisor: integers,
            `min_dimension` must be divisible by `divisor`.
    Returns:
        image: a float tensor with shape [h, w, 3],
            where `min_dimension = min(h, w)`,
            `h` and `w` are divisible by `divisor`.
        masks: a float tensor with shape [h / DOWNSAMPLE, w / DOWNSAMPLE, 2].
        boxes: a float tensor with shape [num_persons, 4].
        keypoints: an int tensor with shape [num_persons, 17, 3].
    """

    assert min_dimension % divisor == 0
    min_dimension = tf.constant(min_dimension, dtype=tf.int32)
    divisor = tf.constant(divisor, dtype=tf.int32)

    input_shape = tf.shape(image)
    height, width = input_shape[0], input_shape[1]

    shorter_side = tf.minimum(height, width)
    scale_factor = tf.to_float(min_dimension / shorter_side)

    # RESIZE AND PAD IMAGE

    def scale(x):
        # size of the longer side after scaling, and the
        # padding that brings it to a multiple of `divisor`
        resized = tf.to_int32(tf.round(tf.to_float(x) * scale_factor))
        num_blocks = tf.to_int32(tf.ceil(resized / divisor))
        padding = divisor * num_blocks - resized
        return (resized, padding)

    zero = tf.constant(0, dtype=tf.int32)

    def height_is_longer():
        # width is the shorter side, it becomes `min_dimension` exactly
        return scale(height) + (min_dimension, zero)

    def width_is_longer():
        return (min_dimension, zero) + scale(width)

    new_height, pad_height, new_width, pad_width = tf.cond(
        tf.greater_equal(height, width), height_is_longer, width_is_longer)

    # final image size
    h = new_height + pad_height
    w = new_width + pad_width

    # resize keeping aspect ratio,
    # then pad at the bottom or at the right
    image = tf.image.resize_images(image, [new_height, new_width],
                                   method=RESIZE_METHOD)
    image = tf.image.pad_to_bounding_box(image,
                                         offset_height=0,
                                         offset_width=0,
                                         target_height=h,
                                         target_width=w)

    # RESIZE AND PAD MASKS

    # new size of masks with padding
    map_height = tf.to_int32(tf.ceil(h / DOWNSAMPLE))
    map_width = tf.to_int32(tf.ceil(w / DOWNSAMPLE))

    # new size of only masks without padding
    map_only_height = tf.to_int32(tf.ceil(new_height / DOWNSAMPLE))
    map_only_width = tf.to_int32(tf.ceil(new_width / DOWNSAMPLE))

    masks = tf.image.resize_images(
        masks, [map_only_height, map_only_width],
        method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    masks = tf.image.pad_to_bounding_box(masks,
                                         offset_height=0,
                                         offset_width=0,
                                         target_height=map_height,
                                         target_width=map_width)

    # TRANSFORM KEYPOINTS

    keypoint_scaler = tf.to_float(
        tf.stack([new_height / height, new_width / width]))

    yx, visibility = tf.split(keypoints, [2, 1], axis=2)
    yx = tf.to_int32(tf.round(tf.to_float(yx) * keypoint_scaler))
    y, x = tf.split(yx, 2, axis=2)
    y = tf.clip_by_value(y, 0, h - 1)
    x = tf.clip_by_value(x, 0, w - 1)
    keypoints = tf.concat([y, x, visibility], axis=2)

    # TRANSFORM BOXES

    # the same (y, x) factors, repeated for both box corners
    boxes *= tf.concat(2 * [keypoint_scaler], axis=0)

    return image, masks, boxes, keypoints
Exemple #16
0
def get_patch_mask(y, x, patch_size, image_shape):
    """Builds a boolean image-sized mask that is True inside a square patch.

  The patch is centred on (y, x). The centre must lie inside the image, but
  the patch may stick out of it, in which case it is clipped to the image.
  For an even `patch_size` the extra row/column goes to the lower-valued side
  (top and left).

  Args:
    y: An integer or scalar int32 tensor. The vertical coordinate of the
      patch mask center. Must be within the range [0, image_height).
    x: An integer or scalar int32 tensor. The horizontal coordinate of the
      patch mask center. Must be within the range [0, image_width).
    patch_size: An integer or scalar int32 tensor. The square size of the
      patch mask. Must be at least 1.
    image_shape: A list or 1D int32 tensor representing the shape of the image
      to which the mask will correspond, with the first two values being image
      height and width. For example, [image_height, image_width] or
      [image_height, image_width, image_channels].

  Returns:
    Boolean mask tensor of shape [image_height, image_width] with True values
    for the patch.

  Raises:
    tf.errors.InvalidArgumentError: if x is not in the range [0, image_width), y
      is not in the range [0, image_height), or patch_size is not at least 1.
  """
    image_hw = image_shape[:2]
    center_yx = tf.stack([y, x])
    checks = [
        tf.debugging.assert_greater_equal(
            patch_size, 1, message='Patch size must be >= 1'),
        tf.debugging.assert_greater_equal(
            center_yx,
            0,
            message='Patch center (y, x) must be >= (0, 0)'),
        tf.debugging.assert_less(
            center_yx,
            image_hw,
            message='Patch center (y, x) must be < image (h, w)')
    ]
    # Route the centre through an identity op so that everything computed
    # from it depends on the checks above.
    with tf.control_dependencies(checks):
        center_yx = tf.identity(center_yx)

    # floor(half) cells before the centre and ceil(half) from it onwards.
    half = tf.cast(patch_size, dtype=tf.float32) / 2
    top_left = center_yx - tf.cast(tf.floor(half), dtype=tf.int32)
    bottom_right = center_yx + tf.cast(tf.ceil(half), dtype=tf.int32)

    # Clip the patch to the image.
    top_left = tf.maximum(top_left, 0)
    bottom_right = tf.minimum(bottom_right, image_hw)

    top, left = top_left[0], top_left[1]
    bottom, right = bottom_right[0], bottom_right[1]

    # All-True patch, padded back out to the full image size.
    patch = tf.ones([bottom - top, right - left], dtype=tf.bool)
    row_padding = [top, image_hw[0] - bottom]
    column_padding = [left, image_hw[1] - right]
    return tf.pad(patch, [row_padding, column_padding])
def compress(args):
    """Compresses an image, or a batch of images of the same shape in npy format.

    Builds the compression graph, restores the latest checkpoint found under
    ``os.path.join(args.checkpoint_dir, args.runname)`` and then, for every
    batch of the input, runs two optimizations on numpy copies of the latents,
    driven from Python with ``adam.Adam``:

    1. Rate-distortion optimization of ``y``, ``z_mean`` and ``z_logvar`` for
       ``rd_opt_its`` iterations, with a temperature-annealed relaxed rounding
       of ``y``.
    2. With ``y`` rounded by ``np.round`` and held fixed, rate-only
       optimization of ``z_mean`` and ``z_logvar`` for ``r_opt_its``
       iterations (both are re-initialized from the rounded ``y`` first).

    Per-image metrics (mse, psnr, msssim, msssim_db and the estimated
    bits-per-pixel terms) are accumulated over all batches, written to an
    ``.npz`` file in ``args.results_dir``, and their means are printed.

    Args:
        args: Namespace-like object. Attributes read here: ``input_file``,
            ``num_filters``, ``lmbda``, ``runname``, ``checkpoint_dir``,
            ``annealing_rate``, ``t0``, ``verbose`` and ``results_dir``.
            If ``args.lmbda`` is negative it is overwritten in place with the
            value parsed from ``args.runname``.

    Returns:
        None.

    NOTE(review): ``seed``, ``likelihood_lowerbound``, ``SCALES_MIN``,
    ``SCALES_MAX``, ``SCALES_LEVELS``, ``math_ops``, ``tfc`` and the transform
    classes are not defined inside this function; presumably they are
    module-level names -- confirm.
    """
    from configs import get_eval_batch_size

    if args.input_file.endswith('.npy'):
        # .npy file should contain N images of the same shapes, in the form of an array of shape [N, H, W, 3]
        X = np.load(args.input_file)
    else:
        # Load input image and add batch dimension.
        from PIL import Image
        x = np.asarray(Image.open(args.input_file).convert('RGB'))
        X = x[None, ...]

    # NOTE(review): num_images is not read again in this function.
    num_images = int(X.shape[0])
    # Product of every dimension except the first (batch) and last (channels).
    img_num_pixels = int(np.prod(X.shape[1:-1]))
    # Rescale by 1/255 (assumes 8-bit pixel values -- TODO confirm for .npy input).
    X = X.astype('float32')
    X /= 255.

    eval_batch_size = get_eval_batch_size(img_num_pixels)
    dataset = tf.data.Dataset.from_tensor_slices(X)
    dataset = dataset.batch(batch_size=eval_batch_size)
    # https://www.tensorflow.org/api_docs/python/tf/compat/v1/data/Iterator
    # Importantly, each sess.run(op) call will consume a new batch, where op is any operation that depends on
    # x. Therefore if multiple ops need to be evaluated on the same batch of data, they have to be grouped like
    # sess.run([op1, op2, ...]).
    # x = dataset.make_one_shot_iterator().get_next()
    # The iterator output is fetched once per batch into numpy (see the loop below) and then
    # fed back through the placeholder, so the same batch can be reused across many sess.run calls.
    x_next = dataset.make_one_shot_iterator().get_next()

    x_ph = x = tf.placeholder(
        'float32',
        (None, *X.shape[1:]))  # keep a reference around for feed_dict

    #### BEGIN build compression graph ####
    from utils import log_normal_pdf
    from learned_prior import BMSHJ2018Prior
    hyper_prior = BMSHJ2018Prior(args.num_filters, dims=(3, 3, 3))

    # Instantiate model.
    analysis_transform = AnalysisTransform(args.num_filters)
    synthesis_transform = SynthesisTransform(args.num_filters)
    hyper_analysis_transform = HyperAnalysisTransform(args.num_filters,
                                                      num_output_filters=2 *
                                                      args.num_filters)
    hyper_synthesis_transform = HyperSynthesisTransform(args.num_filters,
                                                        num_output_filters=2 *
                                                        args.num_filters)
    # entropy_bottleneck = tfc.EntropyBottleneck()

    # Initial optimization (where we still have access to x)
    # Soft-to-hard rounding with Gumbel-softmax trick; for each element of y, let R be a 2D auxiliary one-hot
    # random vector, such that R=[1, 0] means rounding DOWN and [0, 1] means rounding UP.
    # The logits of the two outcomes are -atanh(y - y_floor) / T and -atanh(y_ceil - y) / T, so for T > 0 the
    # nearer of floor(y) / ceil(y) receives the larger logit.
    # y_tilde is the combination of floor(y) and ceil(y) weighted by a relaxed one-hot sample of R
    # (intended to approach round(y) as T -> 0).
    import tensorflow_probability as tfp
    T = tf.placeholder('float32', shape=[], name='temperature')
    # y_init provides the starting point; y itself is a placeholder so the latents can be
    # updated in numpy between sess.run calls.
    y_init = analysis_transform(x)
    y = tf.placeholder('float32', y_init.shape)
    y_floor = tf.floor(y)
    y_ceil = tf.ceil(y)
    y_bds = tf.stack([y_floor, y_ceil], axis=-1)
    # The clip below keeps the atanh argument strictly inside (-1, 1).
    epsilon = 1e-5
    logits = tf.stack(
        [
            -tf.math.atanh(
                tf.clip_by_value(y - y_floor, -1 + epsilon, 1 - epsilon)) / T,
            -tf.math.atanh(
                tf.clip_by_value(y_ceil - y, -1 + epsilon, 1 - epsilon)) / T
        ],
        axis=-1
    )  # last dim are logits for DOWN or UP; clip to prevent NaN as temperature -> 0
    rounding_dist = tfp.distributions.RelaxedOneHotCategorical(
        T,
        logits=logits)  # technically we can use a different temperature here
    sample_concrete = rounding_dist.sample()
    y_tilde = tf.reduce_sum(y_bds * sample_concrete,
                            axis=-1)  # inner product in last dim
    x_tilde = synthesis_transform(y_tilde)
    x_shape = tf.shape(x)
    x_tilde = x_tilde[:, :x_shape[1], :x_shape[
        2], :]  # crop reconstruction to have the same shape as input

    # z_tilde ~ q(z_tilde | h_a(\tilde y))
    # The hyper-analysis output is split in half along the last axis into mean and log-variance.
    z_mean_init, z_logvar_init = tf.split(hyper_analysis_transform(y_tilde),
                                          num_or_size_splits=2,
                                          axis=-1)
    z_mean = tf.placeholder(
        'float32',
        z_mean_init.shape)  # initialize to inference network results
    z_logvar = tf.placeholder('float32', z_logvar_init.shape)

    # Reparameterized sample: z_tilde = mean + exp(logvar / 2) * standard normal noise.
    eps = tf.random.normal(shape=tf.shape(z_mean))
    z_tilde = eps * tf.exp(z_logvar * .5) + z_mean

    log_q_z_tilde = log_normal_pdf(z_tilde, z_mean, z_logvar)  # bits back

    # compute the pdf of z_tilde under the flexible (hyper)prior p(z_tilde) ("z_likelihoods")
    z_likelihoods = hyper_prior.pdf(z_tilde, stop_gradient=False)
    z_likelihoods = math_ops.lower_bound(z_likelihoods, likelihood_lowerbound)

    # compute parameters of p(y_tilde|z_tilde)
    mu, sigma = tf.split(hyper_synthesis_transform(z_tilde),
                         num_or_size_splits=2,
                         axis=-1)
    sigma = tf.exp(sigma)  # make positive

    # need to handle images with non-standard sizes during compression; mu/sigma must have the same shape as y
    y_shape = tf.shape(y_tilde)
    mu = mu[:, :y_shape[1], :y_shape[2], :]
    sigma = sigma[:, :y_shape[1], :y_shape[2], :]
    # Log-spaced table of SCALES_LEVELS values between SCALES_MIN and SCALES_MAX.
    scale_table = np.exp(
        np.linspace(np.log(SCALES_MIN), np.log(SCALES_MAX), SCALES_LEVELS))
    conditional_bottleneck = tfc.GaussianConditional(sigma,
                                                     scale_table,
                                                     mean=mu)
    # compute the pdf of y_tilde under the conditional prior/entropy model p(y_tilde|z_tilde)
    # = N(y_tilde|mu, sigma^2) conv U(-0.5, 0.5)
    y_likelihoods = conditional_bottleneck._likelihood(
        y_tilde)  # p(\tilde y | \tilde z)
    if conditional_bottleneck.likelihood_bound > 0:
        likelihood_bound = conditional_bottleneck.likelihood_bound
        y_likelihoods = math_ops.lower_bound(y_likelihoods, likelihood_bound)
    #### END build compression graph ####

    # Total number of bits divided by number of pixels.
    # - log p(\tilde y | \tilde z) - log p(\tilde z) - - log q(\tilde z | \tilde y)
    axes_except_batch = list(range(1, len(x.shape)))  # should be [1,2,3]
    batch_log_q_z_tilde = tf.reduce_sum(log_q_z_tilde, axis=axes_except_batch)
    # Dividing by log(2) converts nats to bits; dividing by img_num_pixels gives bits per pixel.
    bpp_back = -batch_log_q_z_tilde / (np.log(2) * img_num_pixels)
    batch_log_cond_p_y_tilde = tf.reduce_sum(tf.log(y_likelihoods),
                                             axis=axes_except_batch)
    y_bpp = -batch_log_cond_p_y_tilde / (np.log(2) * img_num_pixels)
    batch_log_p_z_tilde = tf.reduce_sum(tf.log(z_likelihoods),
                                        axis=axes_except_batch)
    z_bpp = -batch_log_p_z_tilde / (np.log(2) * img_num_pixels)
    eval_bpp = y_bpp + z_bpp - bpp_back  # shape (N,)
    train_bpp = tf.reduce_mean(eval_bpp)

    # Mean squared error across pixels.
    train_mse = tf.reduce_mean(tf.squared_difference(x, x_tilde))
    # Multiply by 255^2 to correct for rescaling.
    # float_train_mse = train_mse
    # psnr = - 10 * (tf.log(float_train_mse) / np.log(10))  # float MSE computed on float images
    train_mse *= 255**2

    # The rate-distortion cost.
    if args.lmbda < 0:
        # A negative lmbda means "use the training value": parse the number that follows
        # 'lmbda=' (up to the next '-') out of the run name. This mutates args.
        args.lmbda = float(args.runname.split('lmbda=')[1].split('-')
                           [0])  # re-use the lmbda as used for training
        print(
            'Defaulting lmbda (mse coefficient) to %g as used in model training.'
            % args.lmbda)
    if args.lmbda > 0:
        rd_loss = args.lmbda * train_mse + train_bpp
    else:
        rd_loss = train_bpp
    # Gradients w.r.t. the placeholders; they are applied to the numpy values in the loops below.
    rd_gradients = tf.gradients(rd_loss, [y, z_mean, z_logvar])
    r_gradients = tf.gradients(train_bpp, [z_mean, z_logvar])

    # Bring both images back to 0..255 range, for evaluation only.
    # `x` is rebound to a new scaled tensor here; `x_ph` still refers to the placeholder.
    x *= 255
    x_tilde = tf.clip_by_value(x_tilde, 0, 1)
    x_tilde = tf.round(x_tilde * 255)

    mse = tf.reduce_mean(tf.squared_difference(x, x_tilde),
                         axis=axes_except_batch)  # shape (N,)
    psnr = tf.image.psnr(x_tilde, x, 255)  # shape (N,)
    msssim = tf.image.ssim_multiscale(x_tilde, x, 255)  # shape (N,)
    # -10 * log10(1 - msssim)
    msssim_db = -10 * tf.log(1 - msssim) / np.log(10)  # shape (N,)

    with tf.Session() as sess:
        # Load the latest model checkpoint, get compression stats
        save_dir = os.path.join(args.checkpoint_dir, args.runname)
        latest = tf.train.latest_checkpoint(checkpoint_dir=save_dir)
        tf.train.Saver().restore(sess, save_path=latest)
        # eval_fields and eval_tensors are parallel lists (zipped together after each batch).
        eval_fields = [
            'mse', 'psnr', 'msssim', 'msssim_db', 'est_bpp', 'est_y_bpp',
            'est_z_bpp', 'est_bpp_back'
        ]
        eval_tensors = [
            mse, psnr, msssim, msssim_db, eval_bpp, y_bpp, z_bpp, bpp_back
        ]
        all_results_arrs = {key: []
                            for key in eval_fields
                            }  # append across all batches

        # NOTE(review): plt is only referenced by the commented-out plotting code below.
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt

        # Hyper-parameters of the two per-batch optimizations.
        log_itv = 100
        rd_lr = 0.005
        # rd_opt_its = args.sga_its
        rd_opt_its = 2000
        annealing_scheme = 'exp0'
        annealing_rate = args.annealing_rate  # default annealing_rate = 1e-3
        t0 = args.t0  # default t0 = 700
        T_ub = 0.5  # max/initial temperature
        from utils import annealed_temperature
        r_lr = 0.003
        r_opt_its = 2000
        from adam import Adam

        batch_idx = 0
        # Loop until the one-shot iterator is exhausted (signalled by OutOfRangeError).
        while True:
            try:
                x_val = sess.run(x_next)
                x_feed_dict = {x_ph: x_val}
                # 1. Perform R-D optimization conditioned on ground truth x
                print('----RD Optimization----')
                y_cur = sess.run(y_init, feed_dict=x_feed_dict)  # np arrays
                # y_tilde is fed directly with the un-rounded y_cur to get the initial z parameters.
                z_mean_cur, z_logvar_cur = sess.run(
                    [z_mean_init, z_logvar_init], feed_dict={y_tilde: y_cur})
                rd_loss_hist = []
                adam_optimizer = Adam(lr=rd_lr)

                # NOTE(review): opt_record is created but never filled or saved in this function.
                opt_record = {
                    'its': [],
                    'T': [],
                    'rd_loss': [],
                    'rd_loss_after_rounding': []
                }
                for it in range(rd_opt_its):
                    temperature = annealed_temperature(it,
                                                       r=annealing_rate,
                                                       ub=T_ub,
                                                       scheme=annealing_scheme,
                                                       t0=t0)
                    grads, obj, mse_, train_bpp_, psnr_ = sess.run(
                        [rd_gradients, rd_loss, train_mse, train_bpp, psnr],
                        feed_dict={
                            y: y_cur,
                            z_mean: z_mean_cur,
                            z_logvar: z_logvar_cur,
                            **x_feed_dict, T: temperature
                        })
                    y_cur, z_mean_cur, z_logvar_cur = adam_optimizer.update(
                        [y_cur, z_mean_cur, z_logvar_cur], grads)
                    if it % log_itv == 0 or it + 1 == rd_opt_its:
                        psnr_ = psnr_.mean()
                        if args.verbose:
                            # Extra evaluation with y_tilde fed as the hard-rounded latents
                            # (no temperature is fed on this run).
                            bpp_after_rounding, psnr_after_rounding, rd_loss_after_rounding = sess.run(
                                [train_bpp, psnr, rd_loss],
                                feed_dict={
                                    y_tilde: np.round(y_cur),
                                    z_mean: z_mean_cur,
                                    z_logvar: z_logvar_cur,
                                    **x_feed_dict
                                })
                            psnr_after_rounding = psnr_after_rounding.mean()
                            print(
                                'it=%d, T=%.3f rd_loss=%.4f mse=%.3f bpp=%.4f psnr=%.4f\t after rounding: rd_loss=%.4f, bpp=%.4f psnr=%.4f'
                                % (it, temperature, obj, mse_, train_bpp_,
                                   psnr_, rd_loss_after_rounding,
                                   bpp_after_rounding, psnr_after_rounding))
                        else:
                            print(
                                'it=%d, T=%.3f rd_loss=%.4f mse=%.3f bpp=%.4f psnr=%.4f'
                                % (it, temperature, obj, mse_, train_bpp_,
                                   psnr_))
                    # NOTE(review): rd_loss_hist is only consumed by the commented-out plot below.
                    rd_loss_hist.append(obj)
                print()

                # 2. Fix y_tilde, perform rate optimization w.r.t. z_mean and z_logvar.
                y_tilde_cur = np.round(
                    y_cur)  # this is the latents we end up transmitting
                # rate_feed_dict = {y_tilde: y_tilde_cur, **x_feed_dict}
                rate_feed_dict = {y_tilde: y_tilde_cur}
                # NOTE(review): tf.set_random_seed is called after the graph's random ops were
                # created; confirm it has the intended effect on them.
                np.random.seed(seed)
                tf.set_random_seed(seed)
                print('----Rate Optimization----')
                # Reinitialize based on the value of y_tilde
                z_mean_cur, z_logvar_cur = sess.run(
                    [z_mean_init, z_logvar_init],
                    feed_dict=rate_feed_dict)  # np arrays

                r_loss_hist = []
                # rate_grad_hist = []

                # Fresh optimizer instance for the rate-only phase.
                adam_optimizer = Adam(lr=r_lr)
                for it in range(r_opt_its):
                    grads, obj = sess.run(
                        [r_gradients, train_bpp],
                        feed_dict={
                            z_mean: z_mean_cur,
                            z_logvar: z_logvar_cur,
                            **rate_feed_dict
                        })
                    z_mean_cur, z_logvar_cur = adam_optimizer.update(
                        [z_mean_cur, z_logvar_cur], grads)
                    if it % log_itv == 0 or it + 1 == r_opt_its:
                        print('it=', it, '\trate=', obj)
                    r_loss_hist.append(obj)
                    # rate_grad_hist.append(np.mean(np.abs(grads)))
                print()

                # fig, axes = plt.subplots(nrows=2, sharex=True)
                # axes[0].plot(rd_loss_hist)
                # axes[0].set_ylabel('RD loss')
                # axes[1].plot(r_loss_hist)
                # axes[1].set_ylabel('Rate loss')
                # axes[1].set_xlabel('SGD iterations')
                # plt.savefig('plots/local_q_opt_hist-%s-input=%s-b=%d.png' %
                #             (args.runname, os.path.basename(args.input_file), batch_idx))

                # If requested, transform the quantized image back and measure performance.
                eval_arrs = sess.run(eval_tensors,
                                     feed_dict={
                                         y_tilde: y_tilde_cur,
                                         z_mean: z_mean_cur,
                                         z_logvar: z_logvar_cur,
                                         **x_feed_dict
                                     })
                # Each metric array has one entry per image; extend the running per-field lists.
                for field, arr in zip(eval_fields, eval_arrs):
                    all_results_arrs[field] += arr.tolist()

                batch_idx += 1

            except tf.errors.OutOfRangeError:
                break

        for field in eval_fields:
            all_results_arrs[field] = np.asarray(all_results_arrs[field])

        input_file = os.path.basename(args.input_file)
        results_dict = all_results_arrs
        # The text before the first '-' in the run name is compared with this script's name;
        # when they differ, the output file name also records this script and lmbda.
        trained_script_name = args.runname.split('-')[0]
        script_name = os.path.splitext(os.path.basename(__file__))[
            0]  # current script name, without extension
        save_file = 'rd-%s-input=%s.npz' % (args.runname, input_file)
        if script_name != trained_script_name:
            save_file = 'rd-%s-lmbda=%g+%s-input=%s.npz' % (
                script_name, args.lmbda, args.runname, input_file)
        np.savez(os.path.join(args.results_dir, save_file), **results_dict)

        for field in eval_fields:
            arr = all_results_arrs[field]
            print('Avg {}: {:0.4f}'.format(field, arr.mean()))
Exemple #18
0
def compress(args):
    """Compresses an image, or a batch of images of the same shape in npy format.

    Builds the compression graph, restores the latest checkpoint found under
    ``os.path.join(args.checkpoint_dir, args.runname)`` and then, for every
    batch of the input, runs ``rd_opt_its`` iterations of rate-distortion
    optimization on numpy copies of both latents ``y`` and ``z``. The updates
    are driven from Python with ``adam.Adam``, and both latents go through a
    temperature-annealed relaxed rounding. Afterwards ``y`` and ``z`` are
    rounded with ``np.round`` and the evaluation metrics are computed.

    Per-image metrics (mse, psnr, msssim, msssim_db and the estimated
    bits-per-pixel terms) are accumulated over all batches, written to an
    ``.npz`` file in ``args.results_dir``, and their means are printed.
    Depending on ``save_opt_record`` / ``save_reconstruction`` an optimization
    record (``.npz``) and a reconstruction (``.png``) are written as well.

    Args:
        args: Namespace-like object. Attributes read here: ``input_file``,
            ``num_filters``, ``lmbda``, ``runname``, ``checkpoint_dir``,
            ``annealing_rate``, ``t0``, ``verbose`` and ``results_dir``.
            If ``args.lmbda`` is negative it is overwritten in place with the
            value parsed from ``args.runname``.

    Returns:
        None.

    NOTE(review): ``epsilon``, ``save_opt_record``, ``save_reconstruction``,
    ``write_png``, ``SCALES_MIN``, ``SCALES_MAX``, ``SCALES_LEVELS``,
    ``math_ops``, ``tfc`` and the transform classes are not defined inside
    this function; presumably they are module-level names -- confirm.
    """
    from configs import get_eval_batch_size

    if args.input_file.endswith('.npy'):
        # .npy file should contain N images of the same shapes, in the form of an array of shape [N, H, W, 3]
        X = np.load(args.input_file)
    else:
        # Load input image and add batch dimension.
        from PIL import Image
        x = np.asarray(Image.open(args.input_file).convert('RGB'))
        X = x[None, ...]

    num_images = int(X.shape[0])
    # Product of every dimension except the first (batch) and last (channels).
    img_num_pixels = int(np.prod(X.shape[1:-1]))
    # Rescale by 1/255 (assumes 8-bit pixel values -- TODO confirm for .npy input).
    X = X.astype('float32')
    X /= 255.

    eval_batch_size = get_eval_batch_size(img_num_pixels)
    dataset = tf.data.Dataset.from_tensor_slices(X)
    dataset = dataset.batch(batch_size=eval_batch_size)
    # https://www.tensorflow.org/api_docs/python/tf/compat/v1/data/Iterator
    # Importantly, each sess.run(op) call will consume a new batch, where op is any operation that depends on
    # x. Therefore if multiple ops need to be evaluated on the same batch of data, they have to be grouped like
    # sess.run([op1, op2, ...]).
    # x = dataset.make_one_shot_iterator().get_next()
    # The iterator output is fetched once per batch into numpy (see the loop below) and then
    # fed back through the placeholder, so the same batch can be reused across many sess.run calls.
    x_next = dataset.make_one_shot_iterator().get_next()

    # Static shape tuple (batch dimension left as None); also used to crop x_tilde below.
    x_shape = (None, *X.shape[1:])
    x_ph = x = tf.placeholder('float32',
                              x_shape)  # keep a reference around for feed_dict

    #### BEGIN build compression graph ####
    # Instantiate model.
    analysis_transform = AnalysisTransform(args.num_filters)
    synthesis_transform = SynthesisTransform(args.num_filters)
    hyper_analysis_transform = HyperAnalysisTransform(args.num_filters)
    hyper_synthesis_transform = HyperSynthesisTransform(args.num_filters,
                                                        num_output_filters=2 *
                                                        args.num_filters)
    entropy_bottleneck = tfc.EntropyBottleneck()

    # Initial values for optimization
    y_init = analysis_transform(x)
    z_init = hyper_analysis_transform(y_init)

    # Soft-to-hard rounding with Gumbel-softmax trick; for each element of z, let R be a 2D auxiliary one-hot
    # random vector, such that R=[1, 0] means rounding DOWN and [0, 1] means rounding UP.
    # The logits of the two outcomes are -atanh(z - z_floor) / T and -atanh(z_ceil - z) / T, so for T > 0 the
    # nearer of floor(z) / ceil(z) receives the larger logit.
    # z_tilde is the combination of floor(z) and ceil(z) weighted by a relaxed one-hot sample of R
    # (intended to approach round(z) as T -> 0). The same construction is repeated for y further down.
    import tensorflow_probability as tfp
    T = tf.placeholder('float32', shape=[], name='temperature')

    z = tf.placeholder(
        'float32', z_init.shape
    )  # interface ("proxy") variable for SGA (to be annealed to int)
    z_floor = tf.floor(z)
    z_ceil = tf.ceil(z)
    z_bds = tf.stack([z_floor, z_ceil], axis=-1)
    # The clip below keeps the atanh argument strictly inside (-1, 1), given 0 < epsilon.
    rz_logits = tf.stack(
        [
            -tf.math.atanh(
                tf.clip_by_value(z - z_floor, -1 + epsilon, 1 - epsilon)) / T,
            -tf.math.atanh(
                tf.clip_by_value(z_ceil - z, -1 + epsilon, 1 - epsilon)) / T
        ],
        axis=-1
    )  # last dim are logits for DOWN or UP; clip to prevent NaN as temperature -> 0
    rz_dist = tfp.distributions.RelaxedOneHotCategorical(
        T, logits=rz_logits
    )  # technically we can use a different temperature here
    rz_sample = rz_dist.sample()
    z_tilde = tf.reduce_sum(z_bds * rz_sample,
                            axis=-1)  # inner product in last dim

    _ = entropy_bottleneck(
        z, training=False
    )  # dummy call to ensure entropy_bottleneck is properly built
    z_likelihoods = entropy_bottleneck._likelihood(z_tilde)  # p(\tilde z)
    if entropy_bottleneck.likelihood_bound > 0:
        likelihood_bound = entropy_bottleneck.likelihood_bound
        z_likelihoods = math_ops.lower_bound(z_likelihoods, likelihood_bound)

    # compute parameters of conditional prior p(y_tilde|z_tilde)
    mu, sigma = tf.split(hyper_synthesis_transform(z_tilde),
                         num_or_size_splits=2,
                         axis=-1)
    sigma = tf.exp(sigma)  # make positive

    # set up SGA for low-level latents
    y = tf.placeholder(
        'float32', y_init.shape
    )  # interface ("proxy") variable for SGA (to be annealed to int)
    y_floor = tf.floor(y)
    y_ceil = tf.ceil(y)
    y_bds = tf.stack([y_floor, y_ceil], axis=-1)
    ry_logits = tf.stack([
        -tf.math.atanh(tf.clip_by_value(y - y_floor, -1 + epsilon,
                                        1 - epsilon)) / T,
        -tf.math.atanh(tf.clip_by_value(y_ceil - y, -1 + epsilon, 1 - epsilon))
        / T
    ],
                         axis=-1)  # last dim are logits for DOWN or UP
    ry_dist = tfp.distributions.RelaxedOneHotCategorical(
        T, logits=ry_logits
    )  # technically we can use a different temperature here
    ry_sample = ry_dist.sample()
    y_tilde = tf.reduce_sum(y_bds * ry_sample,
                            axis=-1)  # inner product in last dim
    x_tilde = synthesis_transform(y_tilde)
    x_tilde = x_tilde[:, :x_shape[1], :x_shape[
        2], :]  # crop reconstruction to have the same shape as input

    # need to handle images with non-standard sizes during compression; mu/sigma must have the same shape as y
    y_shape = tf.shape(y_tilde)
    mu = mu[:, :y_shape[1], :y_shape[2], :]
    sigma = sigma[:, :y_shape[1], :y_shape[2], :]
    # Log-spaced table of SCALES_LEVELS values between SCALES_MIN and SCALES_MAX.
    scale_table = np.exp(
        np.linspace(np.log(SCALES_MIN), np.log(SCALES_MAX), SCALES_LEVELS))
    conditional_bottleneck = tfc.GaussianConditional(sigma,
                                                     scale_table,
                                                     mean=mu)
    # compute the pdf of y_tilde under the conditional prior/entropy model p(y_tilde|z_tilde)
    # = N(y_tilde|mu, sigma^2) conv U(-0.5, 0.5)
    y_likelihoods = conditional_bottleneck._likelihood(
        y_tilde)  # p(\tilde y | \tilde z)
    if conditional_bottleneck.likelihood_bound > 0:
        likelihood_bound = conditional_bottleneck.likelihood_bound
        y_likelihoods = math_ops.lower_bound(y_likelihoods, likelihood_bound)
    #### END build compression graph ####

    # graph = build_graph(args, x, training=False)

    # Total number of bits divided by number of pixels.
    # - log p(\tilde y | \tilde z) - log p(\tilde z)
    axes_except_batch = list(range(1, len(x.shape)))  # should be [1,2,3]
    # Dividing by log(2) converts nats to bits; dividing by img_num_pixels gives bits per pixel.
    y_bpp = tf.reduce_sum(-tf.log(y_likelihoods), axis=axes_except_batch) / (
        np.log(2) * img_num_pixels)
    z_bpp = tf.reduce_sum(-tf.log(z_likelihoods), axis=axes_except_batch) / (
        np.log(2) * img_num_pixels)
    eval_bpp = y_bpp + z_bpp  # shape (N,)
    train_bpp = tf.reduce_mean(eval_bpp)

    # Mean squared error across pixels.
    train_mse = tf.reduce_mean(tf.squared_difference(x, x_tilde))
    # Multiply by 255^2 to correct for rescaling.
    # float_train_mse = train_mse
    # psnr = - 10 * (tf.log(float_train_mse) / np.log(10))  # float MSE computed on float images
    train_mse *= 255**2

    # The rate-distortion cost.
    if args.lmbda < 0:
        # A negative lmbda means "use the training value": parse the number that follows
        # 'lmbda=' (up to the next '-') out of the run name. This mutates args.
        args.lmbda = float(args.runname.split('lmbda=')[1].split('-')
                           [0])  # re-use the lmbda as used for training
        print(
            'Defaulting lmbda (mse coefficient) to %g as used in model training.'
            % args.lmbda)
    if args.lmbda > 0:
        rd_loss = args.lmbda * train_mse + train_bpp
    else:
        rd_loss = train_bpp
    # Gradients w.r.t. the placeholders; they are applied to the numpy values in the loop below.
    rd_gradients = tf.gradients(rd_loss, [y, z])

    # Bring both images back to 0..255 range, for evaluation only.
    # `x` is rebound to a new scaled tensor here; `x_ph` still refers to the placeholder.
    x *= 255
    if save_reconstruction:
        # Keep a handle on the reconstruction before it is clipped and rounded below.
        x_tilde_float = x_tilde
    x_tilde = tf.clip_by_value(x_tilde, 0, 1)
    x_tilde = tf.round(x_tilde * 255)

    mse = tf.reduce_mean(tf.squared_difference(x, x_tilde),
                         axis=axes_except_batch)  # shape (N,)
    psnr = tf.image.psnr(x_tilde, x, 255)  # shape (N,)
    msssim = tf.image.ssim_multiscale(x_tilde, x, 255)  # shape (N,)
    # -10 * log10(1 - msssim)
    msssim_db = -10 * tf.log(1 - msssim) / np.log(10)  # shape (N,)

    with tf.Session() as sess:
        # Load the latest model checkpoint, get compression stats
        save_dir = os.path.join(args.checkpoint_dir, args.runname)
        latest = tf.train.latest_checkpoint(checkpoint_dir=save_dir)
        tf.train.Saver().restore(sess, save_path=latest)
        # eval_fields and eval_tensors are parallel lists (zipped together after each batch).
        eval_fields = [
            'mse', 'psnr', 'msssim', 'msssim_db', 'est_bpp', 'est_y_bpp',
            'est_z_bpp'
        ]
        eval_tensors = [mse, psnr, msssim, msssim_db, eval_bpp, y_bpp, z_bpp]
        all_results_arrs = {key: []
                            for key in eval_fields
                            }  # append across all batches

        # Log (and record) every 100 iterations, or every 10 when an optimization record is saved.
        log_itv = 100
        if save_opt_record:
            log_itv = 10
        rd_lr = 0.005
        # rd_opt_its = args.sga_its
        rd_opt_its = 2000
        annealing_scheme = 'exp0'
        annealing_rate = args.annealing_rate  # default annealing_rate = 1e-3
        t0 = args.t0  # default t0 = 700
        T_ub = 0.5  # max/initial temperature
        from utils import annealed_temperature
        from adam import Adam

        batch_idx = 0
        # Loop until the one-shot iterator is exhausted (signalled by OutOfRangeError).
        while True:
            try:
                x_val = sess.run(x_next)
                x_feed_dict = {x_ph: x_val}
                # 1. Perform R-D optimization conditioned on ground truth x
                print('----RD Optimization----')
                y_cur, z_cur = sess.run([y_init, z_init],
                                        feed_dict=x_feed_dict)  # np arrays
                adam_optimizer = Adam(lr=rd_lr)
                # Re-created for every batch, so the record saved after the loop
                # holds the values of the last batch processed only.
                opt_record = {
                    'its': [],
                    'T': [],
                    'rd_loss': [],
                    'rd_loss_after_rounding': []
                }
                for it in range(rd_opt_its):
                    temperature = annealed_temperature(it,
                                                       r=annealing_rate,
                                                       ub=T_ub,
                                                       scheme=annealing_scheme,
                                                       t0=t0)
                    grads, obj, mse_, train_bpp_, psnr_ = sess.run(
                        [rd_gradients, rd_loss, train_mse, train_bpp, psnr],
                        feed_dict={
                            y: y_cur,
                            z: z_cur,
                            **x_feed_dict, T: temperature
                        })
                    y_cur, z_cur = adam_optimizer.update([y_cur, z_cur], grads)
                    if it % log_itv == 0 or it + 1 == rd_opt_its:
                        psnr_ = psnr_.mean()
                        if args.verbose:
                            # Extra evaluation with y_tilde / z_tilde fed as the hard-rounded
                            # latents (no temperature is fed on this run).
                            bpp_after_rounding, psnr_after_rounding, rd_loss_after_rounding = sess.run(
                                [train_bpp, psnr, rd_loss],
                                feed_dict={
                                    y_tilde: np.round(y_cur),
                                    z_tilde: np.round(z_cur),
                                    **x_feed_dict
                                })
                            psnr_after_rounding = psnr_after_rounding.mean()
                            print(
                                'it=%d, T=%.3f rd_loss=%.4f mse=%.3f bpp=%.4f psnr=%.4f\t after rounding: rd_loss=%.4f, bpp=%.4f psnr=%.4f'
                                % (it, temperature, obj, mse_, train_bpp_,
                                   psnr_, rd_loss_after_rounding,
                                   bpp_after_rounding, psnr_after_rounding))
                            # Only populated in verbose mode; stays empty otherwise.
                            opt_record['rd_loss_after_rounding'].append(
                                rd_loss_after_rounding)
                        else:
                            print(
                                'it=%d, T=%.3f rd_loss=%.4f mse=%.3f bpp=%.4f psnr=%.4f'
                                % (it, temperature, obj, mse_, train_bpp_,
                                   psnr_))
                        opt_record['its'].append(it)
                        opt_record['T'].append(temperature)
                        opt_record['rd_loss'].append(obj)

                print()

                y_tilde_cur = np.round(
                    y_cur)  # this is the latents we end up transmitting
                z_tilde_cur = np.round(z_cur)

                # If requested, transform the quantized image back and measure performance.
                eval_arrs = sess.run(eval_tensors,
                                     feed_dict={
                                         y_tilde: y_tilde_cur,
                                         z_tilde: z_tilde_cur,
                                         **x_feed_dict
                                     })
                # Each metric array has one entry per image; extend the running per-field lists.
                for field, arr in zip(eval_fields, eval_arrs):
                    all_results_arrs[field] += arr.tolist()

                batch_idx += 1

            except tf.errors.OutOfRangeError:
                break

        for field in eval_fields:
            all_results_arrs[field] = np.asarray(all_results_arrs[field])

        input_file = os.path.basename(args.input_file)
        results_dict = all_results_arrs
        # The text before the first '-' in the run name is compared with this script's name;
        # when they differ, the output file names also record this script and lmbda.
        trained_script_name = args.runname.split('-')[0]
        script_name = os.path.splitext(os.path.basename(__file__))[
            0]  # current script name, without extension

        # save RD evaluation results
        prefix = 'rd'
        save_file = '%s-%s-input=%s.npz' % (prefix, args.runname, input_file)
        if script_name != trained_script_name:
            save_file = '%s-%s-lmbda=%g+%s-input=%s.npz' % (
                prefix, script_name, args.lmbda, args.runname, input_file)
        np.savez(os.path.join(args.results_dir, save_file), **results_dict)

        if save_opt_record:
            # save optimization record
            prefix = 'opt'
            save_file = '%s-%s-input=%s.npz' % (prefix, args.runname,
                                                input_file)
            if script_name != trained_script_name:
                save_file = '%s-%s-lmbda=%g+%s-input=%s.npz' % (
                    prefix, script_name, args.lmbda, args.runname, input_file)
            np.savez(os.path.join(args.results_dir, save_file), **opt_record)

        if save_reconstruction:
            # Only a single input image is supported when writing the reconstruction.
            assert num_images == 1
            prefix = 'recon'
            save_file = '%s-%s-input=%s.png' % (prefix, args.runname,
                                                input_file)
            if script_name != trained_script_name:
                save_file = '%s-%s-lmbda=%g-rd_opt_its=%d+%s-input=%s.png' % (
                    prefix, script_name, args.lmbda, rd_opt_its, args.runname,
                    input_file)
            # Write reconstructed image out as a PNG file.
            save_file = os.path.join(args.results_dir, save_file)
            print("Saving image reconstruction to ", save_file)
            # x_tilde_float is computed from y_tilde, so only the rounded latents
            # (left over from the last loop iteration) are fed.
            save_png_op = write_png(save_file, x_tilde_float[0])
            sess.run(save_png_op, feed_dict={y_tilde: y_tilde_cur})

        for field in eval_fields:
            arr = all_results_arrs[field]
            print('Avg {}: {:0.4f}'.format(field, arr.mean()))
Exemple #19
0
def compress(args):
    """Compresses an image, or a batch of images of the same shape in npy format.

    Builds the compression graph, restores the latest checkpoint found under
    `os.path.join(args.checkpoint_dir, args.runname)`, and then, for every
    batch of input images:

      1. initializes the latents `y` and `z` from the analysis transforms,
      2. runs `rd_opt_its` optimizer steps on `[y, z]` using the gradients of
         the rate-distortion loss, with a temperature `T` that follows
         `annealed_temperature`,
      3. rounds the optimized latents and evaluates the reconstruction
         (MSE, PSNR, MS-SSIM and estimated bits per pixel).

    The per-image metrics are saved as an `.npz` file in `args.results_dir`
    and their averages are printed.

    Args:
        args: namespace of run options. This function reads `input_file`,
            `num_filters`, `lmbda`, `runname`, `checkpoint_dir`,
            `results_dir` and `verbose`. When `args.lmbda` is negative it is
            overwritten in place with the value parsed from `args.runname`.

    NOTE(review): `save_opt_record` is read below but never assigned in this
    function; presumably it is a module-level flag -- confirm.
    """
    from configs import get_eval_batch_size

    if args.input_file.endswith('.npy'):
        # .npy file should contain N images of the same shapes, in the form of an array of shape [N, H, W, 3]
        X = np.load(args.input_file)
    else:
        # Load input image and add batch dimension.
        from PIL import Image
        x = np.asarray(Image.open(args.input_file).convert('RGB'))
        X = x[None, ...]

    num_images = int(X.shape[0])
    # Product of all dims except the first (batch) and last (channel) one.
    img_num_pixels = int(np.prod(X.shape[1:-1]))
    # Rescale pixel values by 1/255 for the model.
    X = X.astype('float32')
    X /= 255.

    eval_batch_size = get_eval_batch_size(img_num_pixels)
    dataset = tf.data.Dataset.from_tensor_slices(X)
    dataset = dataset.batch(batch_size=eval_batch_size)
    # https://www.tensorflow.org/api_docs/python/tf/compat/v1/data/Iterator
    # Importantly, each sess.run(op) call will consume a new batch, where op is any operation that depends on
    # x. Therefore if multiple ops need to be evaluated on the same batch of data, they have to be grouped like
    # sess.run([op1, op2, ...]).
    # x = dataset.make_one_shot_iterator().get_next()
    x_next = dataset.make_one_shot_iterator().get_next()

    # The graph is built on a placeholder rather than on the iterator output,
    # so a batch is fetched once (via `x_next`) and then fed repeatedly.
    x_ph = x = tf.placeholder(
        'float32',
        (None, *X.shape[1:]))  # keep a reference around for feed_dict

    #### BEGIN build compression graph ####
    # Instantiate model.
    analysis_transform = AnalysisTransform(args.num_filters)
    synthesis_transform = SynthesisTransform(args.num_filters)
    hyper_analysis_transform = HyperAnalysisTransform(args.num_filters)
    hyper_synthesis_transform = HyperSynthesisTransform(args.num_filters,
                                                        num_output_filters=2 *
                                                        args.num_filters)
    entropy_bottleneck = tfc.EntropyBottleneck()

    # Initial values for optimization
    y_init = analysis_transform(x)
    z_init = hyper_analysis_transform(y_init)

    # `y` is a free variable of the optimization (fed from numpy each step).
    # `y_tilde` is a softmax-weighted mix of floor(y) and ceil(y), with the
    # sharpness of the weights controlled by the temperature `T`.
    y = tf.placeholder('float32', y_init.shape)
    T = tf.placeholder('float32', shape=[], name='temperature')
    y_floor = tf.floor(y)
    y_ceil = tf.ceil(y)
    y_bds = tf.stack([y_floor, y_ceil], axis=-1)
    epsilon = 1e-5
    ry_logits = tf.stack(
        [
            -tf.math.atanh(
                tf.clip_by_value(y - y_floor, -1 + epsilon, 1 - epsilon)) / T,
            -tf.math.atanh(
                tf.clip_by_value(y_ceil - y, -1 + epsilon, 1 - epsilon)) / T
        ],
        axis=-1
    )  # last dim are logits for DOWN or UP; clip to prevent NaN as temperature -> 0
    ry = tf.nn.softmax(ry_logits, axis=-1)
    y_tilde = tf.reduce_sum(y_bds * ry, axis=-1)  # inner product in last dim
    x_tilde = synthesis_transform(y_tilde)
    x_shape = tf.shape(x)
    x_tilde = x_tilde[:, :x_shape[1], :x_shape[
        2], :]  # crop reconstruction to have the same shape as input

    # # sample z_tilde from q(z_tilde|x) = q(z_tilde|h_a(g_a(x))), and compute the pdf of z_tilde under the flexible prior
    # # p(z_tilde) ("z_likelihoods")
    # z_tilde, z_likelihoods = entropy_bottleneck(z, training=training)
    # Same floor/ceil relaxation as for `y`, applied to the hyper-latents `z`.
    z = tf.placeholder('float32', z_init.shape)
    z_floor = tf.floor(z)
    z_ceil = tf.ceil(z)
    z_bds = tf.stack([z_floor, z_ceil], axis=-1)
    rz_logits = tf.stack([
        -tf.math.atanh(tf.clip_by_value(z - z_floor, -1 + epsilon,
                                        1 - epsilon)) / T,
        -tf.math.atanh(tf.clip_by_value(z_ceil - z, -1 + epsilon, 1 - epsilon))
        / T
    ],
                         axis=-1)  # last dim are logits for DOWN or UP
    rz = tf.nn.softmax(rz_logits, axis=-1)
    z_tilde = tf.reduce_sum(z_bds * rz, axis=-1)  # inner product in last dim

    # # We have to manually call entropy_bottleneck.build because we don't directly call entropy_bottleneck like we did
    # # with 'z_tilde, z_likelihoods = entropy_bottleneck(z, training=training)' during training
    # # UPDATE: this doesn't quite work, as the resulting variables don't have the proper name scope (will just be named
    # # "matrix_0", "bias_0", etc., instead of "entropy_bottleneck/matrix_0", "entropy_bottleneck/bias_0" as would with
    # # calling entropy_bottleneck on tensor, which breaks model loading (will get "Key bias_0 not found in checkpoint..
    # # tensorflow.python.framework.errors_impl.NotFoundError: Restoring from checkpoint failed. This is most likely due
    # # to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not
    # # altered the graph expected based on the checkpoint.").
    # entropy_bottleneck.build(z_tilde.shape)
    _ = entropy_bottleneck(
        z, training=False
    )  # dummy call to ensure entropy_bottleneck is properly built
    z_likelihoods = entropy_bottleneck._likelihood(z_tilde)  # p(\tilde z)
    if entropy_bottleneck.likelihood_bound > 0:
        likelihood_bound = entropy_bottleneck.likelihood_bound
        z_likelihoods = math_ops.lower_bound(z_likelihoods, likelihood_bound)

    # compute parameters of p(y_tilde|z_tilde)
    mu, sigma = tf.split(hyper_synthesis_transform(z_tilde),
                         num_or_size_splits=2,
                         axis=-1)
    sigma = tf.exp(sigma)  # make positive

    # need to handle images with non-standard sizes during compression; mu/sigma must have the same shape as y
    y_shape = tf.shape(y_tilde)
    mu = mu[:, :y_shape[1], :y_shape[2], :]
    sigma = sigma[:, :y_shape[1], :y_shape[2], :]
    # Log-spaced table of SCALES_LEVELS scales between SCALES_MIN and SCALES_MAX.
    scale_table = np.exp(
        np.linspace(np.log(SCALES_MIN), np.log(SCALES_MAX), SCALES_LEVELS))
    conditional_bottleneck = tfc.GaussianConditional(sigma,
                                                     scale_table,
                                                     mean=mu)
    # compute the pdf of y_tilde under the conditional prior/entropy model p(y_tilde|z_tilde)
    # = N(y_tilde|mu, sigma^2) conv U(-0.5, 0.5)
    y_likelihoods = conditional_bottleneck._likelihood(
        y_tilde)  # p(\tilde y | \tilde z)
    if conditional_bottleneck.likelihood_bound > 0:
        likelihood_bound = conditional_bottleneck.likelihood_bound
        y_likelihoods = math_ops.lower_bound(y_likelihoods, likelihood_bound)
    #### END build compression graph ####

    # Total number of bits divided by number of pixels.
    # - log p(\tilde y | \tilde z) - log p(\tilde z) - - log q(\tilde z | \tilde y)
    axes_except_batch = list(range(1, len(x.shape)))  # should be [1,2,3]
    y_bpp = tf.reduce_sum(-tf.log(y_likelihoods), axis=axes_except_batch) / (
        np.log(2) * img_num_pixels)
    z_bpp = tf.reduce_sum(-tf.log(z_likelihoods), axis=axes_except_batch) / (
        np.log(2) * img_num_pixels)
    eval_bpp = y_bpp + z_bpp  # shape (N,)
    train_bpp = tf.reduce_mean(eval_bpp)

    # Mean squared error across pixels.
    train_mse = tf.reduce_mean(tf.squared_difference(x, x_tilde))
    # Multiply by 255^2 to correct for rescaling.
    # float_train_mse = train_mse
    # psnr = - 10 * (tf.log(float_train_mse) / np.log(10))  # float MSE computed on float images
    train_mse *= 255**2

    # The rate-distortion cost.
    if args.lmbda < 0:
        # A negative lmbda means "not given": parse it from the run name,
        # taking the text between 'lmbda=' and the next '-'.
        args.lmbda = float(args.runname.split('lmbda=')[1].split('-')
                           [0])  # re-use the lmbda as used for training
        print(
            'Defaulting lmbda (mse coefficient) to %g as used in model training.'
            % args.lmbda)
    if args.lmbda > 0:
        rd_loss = args.lmbda * train_mse + train_bpp
    else:
        # lmbda == 0: optimize the rate term only.
        rd_loss = train_bpp
    # Gradients w.r.t. the fed latents; the update itself is applied in numpy
    # by the optimizer in the loop below.
    rd_gradients = tf.gradients(rd_loss, [y, z])

    # Bring both images back to 0..255 range, for evaluation only.
    # NOTE(review): `x *= 255` presumably rebinds the local name `x` to a new
    # scaled tensor, leaving `x_ph` as the placeholder to feed -- confirm.
    x *= 255
    x_tilde = tf.clip_by_value(x_tilde, 0, 1)
    x_tilde = tf.round(x_tilde * 255)

    mse = tf.reduce_mean(tf.squared_difference(x, x_tilde),
                         axis=axes_except_batch)  # shape (N,)
    psnr = tf.image.psnr(x_tilde, x, 255)  # shape (N,)
    msssim = tf.image.ssim_multiscale(x_tilde, x, 255)  # shape (N,)
    msssim_db = -10 * tf.log(1 - msssim) / np.log(10)  # shape (N,)

    with tf.Session() as sess:
        # Load the latest model checkpoint, get compression stats
        save_dir = os.path.join(args.checkpoint_dir, args.runname)
        latest = tf.train.latest_checkpoint(checkpoint_dir=save_dir)
        tf.train.Saver().restore(sess, save_path=latest)
        # `eval_fields` and `eval_tensors` are parallel lists (zipped below).
        eval_fields = [
            'mse', 'psnr', 'msssim', 'msssim_db', 'est_bpp', 'est_y_bpp',
            'est_z_bpp'
        ]
        eval_tensors = [mse, psnr, msssim, msssim_db, eval_bpp, y_bpp, z_bpp]
        all_results_arrs = {key: []
                            for key in eval_fields
                            }  # append across all batches

        # Hyper-parameters of the per-batch latent optimization.
        log_itv = 100
        if save_opt_record:
            log_itv = 10
        rd_lr = 0.005
        rd_opt_its = 2000
        annealing_rate = 4e-3
        T_ub = 0.2

        def annealed_temperature(t, r, ub, lb=1e-8, backend=np):
            """Return exp(-r * t) clamped to the interval [lb, ub]."""
            # Using the exp schedule from section 4.2 of Jang et. al., ICLR2017
            if backend is None:
                return min(max(np.exp(-r * t), lb), ub)
            else:
                return backend.minimum(
                    backend.maximum(backend.exp(-r * t), lb), ub)

        from adam import Adam

        batch_idx = 0
        # Loop until the one-shot iterator is exhausted (OutOfRangeError).
        while True:
            try:
                x_val = sess.run(x_next)
                x_feed_dict = {x_ph: x_val}
                # 1. Perform R-D optimization conditioned on ground truth x
                print('----RD Optimization----')
                y_cur, z_cur = sess.run([y_init, z_init],
                                        feed_dict=x_feed_dict)  # np arrays
                # A fresh optimizer and a fresh record are created per batch.
                adam_optimizer = Adam(lr=rd_lr)
                opt_record = {
                    'its': [],
                    'T': [],
                    'rd_loss': [],
                    'rd_loss_after_rounding': []
                }
                for it in range(rd_opt_its):
                    temperature = annealed_temperature(it,
                                                       r=annealing_rate,
                                                       ub=T_ub)
                    grads, obj, mse_, train_bpp_, psnr_ = sess.run(
                        [rd_gradients, rd_loss, train_mse, train_bpp, psnr],
                        feed_dict={
                            y: y_cur,
                            z: z_cur,
                            **x_feed_dict, T: temperature
                        })
                    y_cur, z_cur = adam_optimizer.update([y_cur, z_cur], grads)
                    if it % log_itv == 0 or it + 1 == rd_opt_its:
                        psnr_ = psnr_.mean()
                        if args.verbose:
                            # Feed the rounded latents directly into
                            # y_tilde / z_tilde to report the metrics that
                            # hard rounding would give at this iteration.
                            bpp_after_rounding, psnr_after_rounding, rd_loss_after_rounding = sess.run(
                                [train_bpp, psnr, rd_loss],
                                feed_dict={
                                    y_tilde: np.round(y_cur),
                                    z_tilde: np.round(z_cur),
                                    **x_feed_dict
                                })
                            psnr_after_rounding = psnr_after_rounding.mean()
                            print(
                                'it=%d, T=%.3f rd_loss=%.4f mse=%.3f bpp=%.4f psnr=%.4f\t after rounding: rd_loss=%.4f, bpp=%.4f psnr=%.4f'
                                % (it, temperature, obj, mse_, train_bpp_,
                                   psnr_, rd_loss_after_rounding,
                                   bpp_after_rounding, psnr_after_rounding))
                            opt_record['rd_loss_after_rounding'].append(
                                rd_loss_after_rounding)
                        else:
                            print(
                                'it=%d, T=%.3f rd_loss=%.4f mse=%.3f bpp=%.4f psnr=%.4f'
                                % (it, temperature, obj, mse_, train_bpp_,
                                   psnr_))
                        opt_record['its'].append(it)
                        opt_record['T'].append(temperature)
                        opt_record['rd_loss'].append(obj)

                print()

                y_tilde_cur = np.round(
                    y_cur)  # this is the latents we end up transmitting
                z_tilde_cur = np.round(z_cur)

                # If requested, transform the quantized image back and measure performance.
                eval_arrs = sess.run(eval_tensors,
                                     feed_dict={
                                         y_tilde: y_tilde_cur,
                                         z_tilde: z_tilde_cur,
                                         **x_feed_dict
                                     })
                for field, arr in zip(eval_fields, eval_arrs):
                    all_results_arrs[field] += arr.tolist()

                batch_idx += 1

            except tf.errors.OutOfRangeError:
                break

        for field in eval_fields:
            all_results_arrs[field] = np.asarray(all_results_arrs[field])

        input_file = os.path.basename(args.input_file)
        results_dict = all_results_arrs
        # The run name's first '-'-separated token is compared with this
        # script's name; when they differ, the output file names below also
        # include the script name and lmbda.
        trained_script_name = args.runname.split('-')[0]
        script_name = os.path.splitext(os.path.basename(__file__))[
            0]  # current script name, without extension

        # save RD evaluation results
        prefix = 'rd'
        save_file = '%s-%s-input=%s.npz' % (prefix, args.runname, input_file)
        if script_name != trained_script_name:
            save_file = '%s-%s-lmbda=%g+%s-input=%s.npz' % (
                prefix, script_name, args.lmbda, args.runname, input_file)
        np.savez(os.path.join(args.results_dir, save_file), **results_dict)

        if save_opt_record:
            # save optimization record
            # `opt_record` is re-created for every batch in the loop above,
            # so what is saved here is the record of the last batch only.
            prefix = 'opt'
            save_file = '%s-%s-input=%s.npz' % (prefix, args.runname,
                                                input_file)
            if script_name != trained_script_name:
                save_file = '%s-%s-lmbda=%g+%s-input=%s.npz' % (
                    prefix, script_name, args.lmbda, args.runname, input_file)
            np.savez(os.path.join(args.results_dir, save_file), **opt_record)

        for field in eval_fields:
            arr = all_results_arrs[field]
            print('Avg {}: {:0.4f}'.format(field, arr.mean()))
    def __call__(self, image_height, image_width):
        """
        Generate the anchor boxes of every feature map for one image size.

        Feature map shapes are not passed in: all our networks use
        only 'SAME' padding, so each grid side is computed here as
        ceil(image_side / stride).

        Arguments:
            image_height, image_width: scalar int tensors.
        Returns:
            a float tensor with shape [num_anchors, 4],
            boxes with normalized coordinates.
        """
        with tf.name_scope('anchor_generator'):

            image_height = tf.to_float(image_height)
            image_width = tf.to_float(image_width)

            # one (stride, grid_h, grid_w) entry per feature map
            grids = []
            anchor_counts = []
            for raw_stride in self.strides:
                grid_h = tf.to_int32(tf.ceil(image_height / raw_stride))
                grid_w = tf.to_int32(tf.ceil(image_width / raw_stride))
                grids.append((raw_stride, grid_h, grid_w))
                anchor_counts.append(
                    grid_h * grid_w * self.num_anchors_per_location)

            # these are needed elsewhere
            self.num_anchors_per_feature_map = anchor_counts

            # every (scale multiplier, aspect ratio) combination;
            # this is shared by all feature maps
            combos = list(
                itertools.product(self.scale_multipliers, self.aspect_ratios))
            aspect_ratios = tf.constant([ratio for _, ratio in combos],
                                        dtype=tf.float32)

            per_map_anchors = []
            for level, (raw_stride, grid_h, grid_w) in enumerate(grids):

                base_scale = self.scales[level]
                scales = tf.constant(
                    [multiplier * base_scale for multiplier, _ in combos],
                    dtype=tf.float32)
                stride_f = tf.constant(raw_stride, dtype=tf.float32)

                # It is true that
                # image_height = h * stride - x, where 0 <= x < stride.
                #
                # Then image_height = (h - 1) * stride + (stride - x).
                # So offset y must be equal to 0.5 * (stride - x).
                #
                # x = h * stride - image_height,
                # y = 0.5 * (image_height - (h - 1) * stride),
                # 0 < y <= 0.5 * stride.
                #
                # Offset y is maximal when image_height is divisible by
                # stride. Offset y is minimal when
                # image_height = k * stride + 1, where k is a positive
                # integer.
                spanned_height = (tf.to_float(grid_h) - 1.0) * stride_f
                offset_y = 0.5 * (image_height - spanned_height)
                spanned_width = (tf.to_float(grid_w) - 1.0) * stride_f
                offset_x = 0.5 * (image_width - spanned_width)

                level_anchors = tile_anchors(
                    grid_height=grid_h,
                    grid_width=grid_w,
                    scales=scales,
                    aspect_ratios=aspect_ratios,
                    anchor_stride=(stride_f, stride_f),
                    anchor_offset=(offset_y, offset_x))
                per_map_anchors.append(level_anchors)

        with tf.name_scope('concatenate_normalize'):

            # the per-feature-map list is kept
            # for visualization and debugging only
            self.raw_anchors = per_map_anchors

            stacked = tf.concat(per_map_anchors, axis=0)

            # divide by the image size to convert to the [0, 1] range
            image_size = tf.stack(
                [image_height, image_width, image_height, image_width])
            scaler = tf.to_float(image_size)
            normalized = stacked / scaler

        return normalized
Exemple #21
0
def get_interp_coefficients(grid,
                            pts,
                            min_grid_value=(0, 0, 0),
                            max_grid_value=(1, 1, 1)):
    """Regular grid interpolator, returns interpolation coefficients.

  Points are first normalized to the unit box spanned by `min_grid_value` and
  `max_grid_value`, then clipped slightly inside it so that both the floor and
  the ceil grid index of every point are valid.

  Args:
    grid: `[batch_size, *size, features]` tensor, input feature grid. The batch
      size must be statically known.
    pts: `[batch_size, num_points, dim]` tensor, coordinates of points that
    in each dim are within the range (min_grid_value[dim], max_grid_value[dim]).
    min_grid_value: tuple or list (or tensor), minimum value in each dimension
      corresponding to the grid.
    max_grid_value: tuple or list (or tensor), maximum values in each dimension
      corresponding to the grid.
  Returns:
    lat: `[batch_size, num_points, 2**dim, features]` tensor, neighbor
    latent codes for each input point.
    weights: `[batch_size, num_points, 2**dim]` tensor, bi/tri-linear
    interpolation weights for each neighbor.
    xloc: `[batch_size, num_points, 2**dim, dim]`tensor, relative coordinates.

  """
    # get dimensions
    bs = grid.get_shape().as_list()[0]
    npts = tf.shape(pts)[1]
    size = tf.shape(grid)[1:-1]
    cubesize = 1.0 / (tf.cast(size, tf.float32) - 1.0)
    dim = len(grid.get_shape().as_list()) - 2

    # normalize coords for interpolation
    # Each bound is checked against its own type: `min_grid_value` is already
    # a tensor by the time `max_grid_value` is examined, so testing the wrong
    # variable would leave a tuple `max_grid_value` unconverted.
    if isinstance(min_grid_value, (list, tuple)):
        min_grid_value = tf.constant(min_grid_value, dtype=tf.float32)
    if isinstance(max_grid_value, (list, tuple)):
        max_grid_value = tf.constant(max_grid_value, dtype=tf.float32)
    bbox = max_grid_value - min_grid_value
    pts = (pts - min_grid_value) / bbox
    pts = tf.clip_by_value(pts, 1e-6, 1 - 1e-6)  # clip to boundary of the bbox

    # find neighbor indices
    ind0 = tf.floor(pts / cubesize)  # `[batch_size, num_points, dim]`
    ind1 = tf.ceil(pts / cubesize)  # `[batch_size, num_points, dim]`
    ind01 = tf.stack([ind0, ind1],
                     axis=0)  # `[2, batch_size, num_points, dim]`
    ind01 = tf.transpose(ind01, perm=[0, 3, 1, 2])  # `[2, d, b, n]`
    ind01 = tf.cast(ind01, tf.int32)

    # generate combinations for enumerating neighbors
    # `com_` lists every 0/1 choice (lower/upper) per dimension; paired with
    # the dimension index it addresses the first two axes of `ind01`/`xyz01`.
    tmp = tf.constant([0, 1], dtype=tf.int32)
    com_ = tf.stack(tf.meshgrid(*tuple([tmp] * dim), indexing="ij"), axis=-1)
    com_ = tf.reshape(com_, [-1, dim])  # `[2**dim, dim]`
    dim_ = tf.reshape(tf.range(dim), [1, -1])
    dim_ = tf.tile(dim_, [2**dim, 1])  # `[2**dim, dim]`
    gather_ind = tf.stack([com_, dim_], axis=-1)  # `[2**dim, dim, 2]`
    # Same addressing with lower/upper swapped, i.e. the opposite corner.
    gather_ind_ = tf.stack([1 - com_, dim_], axis=-1)  # `[2**dim, dim, 2]`
    ind_ = tf.gather_nd(ind01,
                        gather_ind)  # [2**dim, dim, batch_size, num_pts]
    ind_n = tf.transpose(ind_, perm=[2, 3, 0, 1])  # neighbor indices
    # `[batch_size, num_pts, 2**dim, dim]`
    ind_b = tf.reshape(tf.range(bs), [-1, 1, 1, 1])
    ind_b = tf.broadcast_to(ind_b,
                            [bs, npts, 2**dim, 1])  # dummy batch indices
    # `[batch_size, num_pts, 2**dim, 1]`
    gather_ind2 = tf.concat([ind_b, ind_n], axis=-1)
    lat = tf.gather_nd(grid, gather_ind2)
    # `[batch_size, num_points, 2**dim, in_features]`

    # weights of neighboring nodes
    xyz0 = ind0 * cubesize  # `[batch_size, num_points, dim]`
    xyz1 = (ind0 + 1) * cubesize  # `[batch_size, num_points, dim]`
    xyz01 = tf.stack([xyz0, xyz1],
                     axis=-1)  # [batch_size, num_points, dim, 2]`
    xyz01 = tf.transpose(xyz01, perm=[3, 2, 0, 1])  # [2, d, batch, npts]
    pos = tf.gather_nd(xyz01, gather_ind)  # `[2**dim, dim, batch, num_points]`
    pos = tf.transpose(pos, perm=[2, 3, 0, 1])
    pos_ = tf.gather_nd(xyz01,
                        gather_ind_)  # [2**dim, dim, batch, num_points]`
    pos_ = tf.transpose(pos_, perm=[2, 3, 0, 1])
    # `[batch_size, num_points, 2**dim, dim]`

    # The weight of a neighbor is the product, over dimensions, of the
    # normalized distance from the point to the opposite corner of the cell.
    dxyz_ = tf.abs(tf.expand_dims(pts, -2) - pos_) / cubesize
    weights = tf.reduce_prod(dxyz_, axis=-1, keepdims=False)
    # `[batch_size, num_points, 2**dim]
    xloc = (tf.expand_dims(pts, -2) - pos) / cubesize
    # `[batch, num_points, 2**dim, dim]`
    return lat, weights, xloc
def resize_to_range(image,
                    label=None,
                    min_size=None,
                    max_size=None,
                    factor=None,
                    align_corners=True,
                    label_layout_is_chw=False,
                    scope=None,
                    method=tf.image.ResizeMethod.BILINEAR):
    """Rescales an image (and optional label) to fit a side-length range.

  Two candidate output sizes are computed:
  1. The "large" size, which scales the image so that its shorter side
     becomes min_size.
  2. The "small" size, which scales the image so that its longer side
     becomes max_size. It is used only when max_size is given and the large
     size would exceed it.
  When `factor` is given, an integer in `range(factor)` is then added to each
  side so that the final dimensions are multiples of `factor` plus one.

  Args:
    image: A 3D tensor of shape [height, width, channels].
    label: (optional) A 3D tensor of shape [height, width, channels] (default)
      or [channels, height, width] when label_layout_is_chw = True.
    min_size: (scalar) desired size of the smaller image side.
    max_size: (scalar) maximum allowed size of the larger image side. Note that
      the output dimension is no larger than max_size and may be slightly
      smaller than min_size when factor is not None.
    factor: Make output size multiple of factor plus one.
    align_corners: If True, exactly align all 4 corners of input and output.
    label_layout_is_chw: If true, the label has shape [channel, height, width].
      We support this case because for some instance segmentation dataset, the
      instance segmentation is saved as [num_instances, height, width].
    scope: Optional name scope.
    method: Image resize method. Defaults to tf.image.ResizeMethod.BILINEAR.

  Returns:
    A two-element list `[resized_image, resized_label]`. The image is a 3-D
    tensor of shape [new_height, new_width, channels] resized with `method`;
    the label is resized with nearest-neighbor interpolation to the same
    spatial size, or is None when no label was given.
  Raises:
    ValueError: If the image is not a 3D tensor.
  """
    with tf.name_scope(scope, 'resize_to_range', [image]):
        has_max = max_size is not None
        has_factor = factor is not None

        min_size = tf.to_float(min_size)
        if has_max:
            max_size = tf.to_float(max_size)
            if has_factor:
                # Round max_size up to a multiple of factor plus 1, then step
                # back by one factor so the padded output cannot exceed the
                # max_size that was requested.
                round_up = (factor - (max_size - 1) % factor) % factor
                max_size = max_size + round_up - factor

        [in_height, in_width, _] = resolve_shape(image, rank=3)
        in_height = tf.to_float(in_height)
        in_width = tf.to_float(in_width)
        shorter_side = tf.minimum(in_height, in_width)

        # Candidate 1: shorter side scaled to min_size.
        upscale = min_size / shorter_side
        big_h = tf.to_int32(tf.ceil(in_height * upscale))
        big_w = tf.to_int32(tf.ceil(in_width * upscale))
        big_hw = tf.stack([big_h, big_w])

        if has_max:
            # Candidate 2: longer side scaled to max_size; chosen only when
            # candidate 1 would be too big.
            longer_side = tf.maximum(in_height, in_width)
            downscale = max_size / longer_side
            tiny_h = tf.to_int32(tf.ceil(in_height * downscale))
            tiny_w = tf.to_int32(tf.ceil(in_width * downscale))
            tiny_hw = tf.stack([tiny_h, tiny_w])
            too_big = tf.to_float(tf.reduce_max(big_hw)) > max_size
            out_hw = tf.cond(too_big, lambda: tiny_hw, lambda: big_hw)
        else:
            out_hw = big_hw

        if has_factor:
            # Pad both sides up to the next multiple of factor plus one.
            out_hw = out_hw + (factor - (out_hw - 1) % factor) % factor

        resized_image = tf.image.resize_images(image,
                                               out_hw,
                                               method=method,
                                               align_corners=align_corners)

        if label is None:
            return [resized_image, None]

        if label_layout_is_chw:
            # [channel, height, width]: add a trailing axis so the leading
            # channel axis is treated as the batch axis by the 4-D resize op,
            # then drop the extra axis again.
            label_4d = tf.expand_dims(label, 3)
            label_4d = tf.image.resize_nearest_neighbor(
                label_4d, out_hw, align_corners=align_corners)
            resized_label = tf.squeeze(label_4d, 3)
        else:
            # [height, width, channel]: resize directly.
            resized_label = tf.image.resize_images(
                label,
                out_hw,
                method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
                align_corners=align_corners)
        return [resized_image, resized_label]
Exemple #23
0
def resampler_with_unstacked_warp(data,
                                  warp_x,
                                  warp_y,
                                  safe=True,
                                  name='resampler'):
    """Bilinearly samples `data` at the coordinates (warp_x, warp_y).

    Each output value is the weighted average of the four integer-coordinate
    neighbors (floor/ceil in x and y) of the requested sampling position.

    Args:
      data: Tensor of shape `[batch_size, data_height, data_width,
        data_num_channels]` containing 2D data that will be resampled.
      warp_x: Tensor of shape `[batch_size, dim_0, ... , dim_n]` containing the x
        coordinates at which resampling will be performed.
      warp_y: Tensor of the same shape as warp_x containing the y coordinates at
        which resampling will be performed.
      safe: A boolean, if True, `safe_gather_nd` is used for the lookups so that
        warp_x and warp_y are clamped to their bounds. Disable only if you know
        they are within bounds, otherwise a runtime exception will be thrown.
      name: Optional name of the op.

    Returns:
       Tensor of resampled values from `data`. The output tensor shape is
      `[batch_size, dim_0, ... , dim_n, data_num_channels]`.

    Raises:
      ValueError: If warp_x, warp_y and data have incompatible shapes.
    """

    with tf.name_scope(name):
        warp_x = tf.convert_to_tensor(warp_x)
        warp_y = tf.convert_to_tensor(warp_y)
        data = tf.convert_to_tensor(data)

        shapes_agree = warp_x.shape.is_compatible_with(warp_y.shape)
        if not shapes_agree:
            raise ValueError(
                'warp_x and warp_y are of incompatible shapes: %s vs %s ' %
                (str(warp_x.shape), str(warp_y.shape)))
        dyn_shape = tf.shape(warp_x)
        if warp_x.shape[0] != data.shape[0]:
            raise ValueError(
                '\'warp_x\' and \'data\' must have compatible first '
                'dimension (batch size), but their shapes are %s and %s ' %
                (str(warp_x.shape[0]), str(data.shape[0])))

        # Integer corners around each sampling position; the fractional part
        # is the weight of the right / lower neighbor.
        x_floor_f = tf.floor(warp_x)
        y_floor_f = tf.floor(warp_y)
        w_right = warp_x - x_floor_f
        w_down = warp_y - y_floor_f

        x0 = tf.to_int32(x_floor_f)
        y0 = tf.to_int32(y_floor_f)
        x1 = tf.to_int32(tf.ceil(warp_x))
        y1 = tf.to_int32(tf.ceil(warp_y))

        # The left / upper neighbor gets the complementary weight.
        w_left = tf.subtract(tf.convert_to_tensor(1.0, w_right.dtype),
                             w_right)
        w_up = tf.subtract(tf.convert_to_tensor(1.0, w_down.dtype), w_down)

        # Build a batch-index tensor shaped like the warp so that every
        # lookup index becomes (batch, y, x).
        # First a shape like the warp's but with all sizes except the first
        # set to 1 ...
        batch_only_shape = tf.concat(
            [dyn_shape[0:1], tf.ones_like(dyn_shape[1:])], 0)
        batch_idx = tf.reshape(tf.range(dyn_shape[0], dtype=tf.int32),
                               batch_only_shape)
        # ... then broadcast it to the full warp shape.
        batch_idx = batch_idx + tf.zeros_like(warp_y, dtype=tf.int32)

        # Trailing axis so the weights broadcast over the channels.
        w_left = tf.expand_dims(w_left, axis=-1)
        w_down = tf.expand_dims(w_down, axis=-1)
        w_up = tf.expand_dims(w_up, axis=-1)
        w_right = tf.expand_dims(w_right, axis=-1)

        idx_up_left = tf.stack([batch_idx, y0, x0], axis=-1)
        idx_up_right = tf.stack([batch_idx, y0, x1], axis=-1)
        idx_down_left = tf.stack([batch_idx, y1, x0], axis=-1)
        idx_down_right = tf.stack([batch_idx, y1, x1], axis=-1)

        # Pick the lookup op once: bounds-safe or raw.
        fetch = safe_gather_nd if safe else tf.gather_nd

        # Blend horizontally within each row, then vertically across rows.
        upper = (fetch(data, idx_up_left) * w_left +
                 fetch(data, idx_up_right) * w_right) * w_up
        lower = (fetch(data, idx_down_left) * w_left +
                 fetch(data, idx_down_right) * w_right) * w_down
        result = upper + lower

        # Restore the static shape: warp dims followed by the channel dim.
        static_shape = (warp_x.get_shape().as_list() +
                        data.get_shape().as_list()[-1:])
        result.set_shape(static_shape)
        return result
Exemple #24
0
def _max_pool_2d_nxn_regions(inputs, output_size: int, mode: str):
    """
  Performs a pooling operation that results in a fixed size:
  output_size x output_size.

  Used by spatial_pyramid_pool. Refer to appendix A in [1].

  Args:
      inputs: A 4D Tensor (B, H, W, C)
      output_size: The output size of the pooling operation.
      mode: The pooling mode {max, avg}

  Returns:
      A list of tensors, for each output bin.
      The list contains output_size * output_size elements, where
      each elment is a Tensor (N, C).

  References:
      [1] He, Kaiming et al (2015):
          Spatial Pyramid Pooling in Deep Convolutional Networks
          for Visual Recognition.
          https://arxiv.org/pdf/1406.4729.pdf.

  Ported from: https://github.com/luizgh/Lasagne/commit/c01e3d922a5712ca4c54617a15a794c23746ac8c
  """
    inputs_shape = tf.shape(inputs)
    h = tf.cast(tf.gather(inputs_shape, 1), tf.int32)
    w = tf.cast(tf.gather(inputs_shape, 2), tf.int32)

    if mode == 'max':
        pooling_op = tf.reduce_max
    elif mode == 'avg':
        pooling_op = tf.reduce_mean
    else:
        msg = "Mode must be either 'max' or 'avg'. Got '{0}'"
        raise ValueError(msg.format(mode))

    result = []
    n = output_size
    for row in range(output_size):
        for col in range(output_size):
            # start_h = floor(row / n * h)
            start_h = tf.cast(
                tf.floor(tf.multiply(tf.divide(row, n),
                                     tf.cast(h, tf.float64))), tf.int32)
            # end_h = ceil((row + 1) / n * h)
            end_h = tf.cast(
                tf.ceil(
                    tf.multiply(tf.divide((row + 1), n),
                                tf.cast(h, tf.float64))), tf.int32)
            # start_w = floor(col / n * w)
            start_w = tf.cast(
                tf.floor(tf.multiply(tf.divide(col, n),
                                     tf.cast(w, tf.float64))), tf.int32)
            # end_w = ceil((col + 1) / n * w)
            end_w = tf.cast(
                tf.ceil(
                    tf.multiply(tf.divide((col + 1), n),
                                tf.cast(w, tf.float64))), tf.int32)
            pooling_region = inputs[:, start_h:end_h, start_w:end_w, :]
            pool_result = pooling_op(pooling_region, axis=(1, 2))
            result.append(pool_result)
    return result
def pad_to_multiple(tensor, multiple):
  """Returns the tensor zero padded to the specified multiple.

  Appends 0s to the end of the height (axis 1) and width (axis 2) of the
  tensor until both dimensions are a multiple of the input argument
  'multiple'. E.g. given an input tensor of shape [1, 3, 5, 1] and an input
  multiple of 4, PadToMultiple will append 0s so that the resulting tensor will
  be of shape [1, 4, 8, 1].

  Statically known sizes are rounded up in Python; unknown sizes are read from
  `tf.shape` and rounded up inside the graph.

  Args:
    tensor: rank 4 tensor, where
            tensor -> [batch_size, height, width, channels]. The padding is
            created with the tensor's own dtype.
    multiple: the multiple to pad to.

  Returns:
    padded_tensor: the tensor zero padded to the specified multiple. When
      `multiple` is 1 the input tensor is returned unchanged.
  """
  if multiple == 1:
    return tensor

  tensor_shape = tensor.get_shape()
  batch_size = static_shape.get_batch_size(tensor_shape)
  tensor_height = static_shape.get_height(tensor_shape)
  tensor_width = static_shape.get_width(tensor_shape)
  tensor_depth = static_shape.get_depth(tensor_shape)

  def _size_and_padded_size(size, axis):
    """Returns (size, size rounded up to `multiple`) for one spatial axis."""
    if size is None:
      # Static size unknown: fall back to the dynamic shape.
      size = tf.shape(tensor)[axis]
      padded_size = tf.cast(
          tf.ceil(
              tf.cast(size, dtype=tf.float32) /
              tf.cast(multiple, dtype=tf.float32)),
          dtype=tf.int32) * multiple
    else:
      padded_size = int(math.ceil(float(size) / multiple) * multiple)
    return size, padded_size

  if batch_size is None:
    batch_size = tf.shape(tensor)[0]

  tensor_height, padded_tensor_height = _size_and_padded_size(tensor_height, 1)
  tensor_width, padded_tensor_width = _size_and_padded_size(tensor_width, 2)

  if tensor_depth is None:
    tensor_depth = tf.shape(tensor)[3]

  # Use tf.concat instead of tf.pad to preserve static shape.
  # NOTE(review): when a size is dynamic, the `!=` below compares Tensor
  # objects; under TF1 semantics that is presumably always unequal, so a
  # (possibly zero-sized) pad is concatenated -- confirm.
  if padded_tensor_height != tensor_height:
    # Match the input dtype: tf.zeros defaults to float32, which makes the
    # concat fail for any other input dtype.
    height_pad = tf.zeros([
        batch_size, padded_tensor_height - tensor_height, tensor_width,
        tensor_depth
    ], dtype=tensor.dtype)
    tensor = tf.concat([tensor, height_pad], 1)
  if padded_tensor_width != tensor_width:
    # Height has already been padded above, so the width pad must span the
    # padded height.
    width_pad = tf.zeros([
        batch_size, padded_tensor_height, padded_tensor_width - tensor_width,
        tensor_depth
    ], dtype=tensor.dtype)
    tensor = tf.concat([tensor, width_pad], 2)

  return tensor
Exemple #26
0
    def parse(self, example_proto):
        """Decodes one serialized example into image, masks, boxes, keypoints.

        Arguments:
            example_proto: a serialized example, as accepted by
                `tf.parse_single_example`.
        Returns:
            image: a float tensor with shape [height, width, 3],
                an RGB image with pixel values in the range [0, 1].
            masks: a float tensor with shape
                [ceil(height / DOWNSAMPLE), ceil(width / DOWNSAMPLE), 2],
                it has binary values only.
            boxes: a float tensor with shape [num_persons, 4], in absolute coordinates.
            keypoints: an int tensor with shape [num_persons, 17, 3], in absolute coordinates.
        """
        feature_spec = {
            'image': tf.FixedLenFeature([], tf.string),
            'num_persons': tf.FixedLenFeature([], tf.int64),
            'boxes': tf.FixedLenSequenceFeature(
                [], tf.float32, allow_missing=True),
            'keypoints': tf.FixedLenSequenceFeature(
                [], tf.int64, allow_missing=True),
            'masks': tf.FixedLenFeature([], tf.string)
        }
        parsed = tf.parse_single_example(example_proto, feature_spec)

        # Decode the JPEG bytes; the dtype conversion rescales pixel
        # values to the [0, 1] range.
        decoded = tf.image.decode_jpeg(parsed['image'], channels=3)
        image = tf.image.convert_image_dtype(decoded, tf.float32)

        # Number of people on the image (it is assumed to be > 0).
        person_count = tf.to_int32(parsed['num_persons'])

        # Groundtruth boxes in absolute coordinates. They guide the data
        # augmentation (random crops) and the choice of sigmas for
        # gaussian blobs.
        boxes = tf.reshape(parsed['boxes'], [person_count, 4])

        # Keypoints in absolute coordinates.
        keypoints = tf.reshape(
            tf.to_int32(parsed['keypoints']), [person_count, 17, 3])

        # The masks are stored downsampled; their size is rounded up
        # because the networks use the 'SAME' padding.
        image_shape = tf.shape(image)
        image_height = image_shape[0]
        image_width = image_shape[1]
        masks_height = tf.to_int32(tf.ceil(image_height / DOWNSAMPLE))
        masks_width = tf.to_int32(tf.ceil(image_width / DOWNSAMPLE))

        # Loss and segmentation masks arrive bit-packed: one uint8 holds
        # eight mask values (this reverses np.packbits).
        packed = tf.decode_raw(parsed['masks'], tf.uint8)
        bit_values = tf.constant([128, 64, 32, 16, 8, 4, 2, 1], dtype=tf.uint8)
        bits = tf.bitwise.bitwise_and(packed[:, None], bit_values)
        flat_masks = tf.reshape(bits, [-1])

        # Drop the trailing bits that only pad the last byte.
        flat_masks = flat_masks[:(masks_height * masks_width * 2)]
        flat_masks = tf.cast(flat_masks > 0, tf.uint8)

        # Restore the initial [height, width, 2] layout.
        masks = tf.to_float(
            tf.reshape(flat_masks, [masks_height, masks_width, 2]))

        return image, masks, boxes, keypoints
Exemple #27
0
def sorted_non_max_suppression_padded(scores,
                                      boxes,
                                      max_output_size,
                                      iou_threshold):
  """Tiled, batched non-maximum suppression over score-sorted boxes.

  Assumption:
    * The boxes are sorted by scores unless the box is a dot (all coordinates
      are zero).
    * Boxes with higher scores can be used to suppress boxes with lower scores.

  Instead of the serial algorithm (repeatedly take the highest-scored
  unprocessed box and suppress with it), boxes are partitioned by score into
  tiles of NMS_TILE_SIZE and processed tile by tile, which gives parallelism
  within a tile:

    * Cross suppression: every box of an earlier tile can independently
      suppress boxes of the current tile; suppressed boxes are cleared to a
      dot.
    * Self suppression: inside the current tile, the boxes that cannot be
      suppressed are used to suppress the others, repeated until nothing
      changes any more. Boxes that survive are the selected boxes.

  The per-tile step is delegated to `_suppression_loop_body`, which is run in
  a `tf.while_loop` until every image has at least `max_output_size` selected
  boxes or all tiles have been visited.

  Args:
    scores: a tensor with a shape of [batch_size, anchors].
    boxes: a tensor with a shape of [batch_size, anchors, 4].
    max_output_size: a scalar integer `Tensor` representing the maximum number
      of boxes to be selected by non max suppression.
    iou_threshold: a float representing the threshold for deciding whether boxes
      overlap too much with respect to IOU.

  Returns:
    nms_scores: a float32 tensor with a shape of
      [batch_size, max_output_size]. Slots beyond the number of selected
      boxes hold -1.
    nms_proposals: a float32 tensor with a shape of
      [batch_size, max_output_size, 4]. Slots beyond the number of selected
      boxes are zeroed.
  """
  input_dims = tf.shape(boxes)
  batch_size = input_dims[0]
  unpadded_count = input_dims[1]

  # Pad the box axis up to the next multiple of the tile size. Padding boxes
  # are dots (all zeros) and padding scores are -1.
  tiles_needed = tf.cast(
      tf.ceil(tf.cast(unpadded_count, tf.float32) / NMS_TILE_SIZE), tf.int32)
  pad = tiles_needed * NMS_TILE_SIZE - unpadded_count
  padded_boxes = tf.pad(
      tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
  padded_scores = tf.pad(
      tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
  num_boxes = unpadded_count + pad

  def _keep_going(unused_boxes, unused_threshold, selected_counts, tile_idx):
    still_short = tf.reduce_min(selected_counts) < max_output_size
    tiles_remain = tile_idx < num_boxes // NMS_TILE_SIZE
    return tf.logical_and(still_short, tiles_remain)

  initial_state = [
      padded_boxes, iou_threshold,
      tf.zeros([batch_size], tf.int32),
      tf.constant(0)
  ]
  selected_boxes, _, output_size, _ = tf.while_loop(
      _keep_going, _suppression_loop_body, initial_state)

  # A surviving box at position p is given the rank num_boxes - p and every
  # cleared box the rank 0, so top_k returns the survivors in their original
  # order. Subtracting from num_boxes turns the ranks back into positions.
  survived = tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32)
  descending_rank = tf.expand_dims(tf.range(num_boxes, 0, -1), 0)
  top_ranks = tf.nn.top_k(survived * descending_rank, max_output_size)[0]
  idx = num_boxes - tf.cast(top_ranks, tf.int32)
  # Rank 0 maps to num_boxes, which is out of range: clamp it.
  idx = tf.minimum(idx, num_boxes - 1)

  # Turn the per-image positions into indices of the flattened batch.
  batch_offsets = tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1])
  flat_idx = tf.reshape(idx + batch_offsets, [-1])

  # Slot k of an image is valid only when k < output_size of that image.
  slot_ids = tf.range(max_output_size)

  nms_boxes = tf.reshape(
      tf.gather(tf.reshape(padded_boxes, [-1, 4]), flat_idx),
      [batch_size, max_output_size, 4])
  boxes_mask = tf.less(
      tf.reshape(slot_ids, [1, -1, 1]),
      tf.reshape(output_size, [-1, 1, 1]))
  nms_boxes = nms_boxes * tf.cast(boxes_mask, nms_boxes.dtype)

  nms_scores = tf.reshape(
      tf.gather(tf.reshape(padded_scores, [-1, 1]), flat_idx),
      [batch_size, max_output_size])
  scores_mask = tf.less(
      tf.reshape(slot_ids, [1, -1]),
      tf.reshape(output_size, [-1, 1]))
  invalid_scores = tf.ones_like(nms_scores, nms_scores.dtype) * -1
  nms_scores = tf.where(scores_mask, nms_scores, invalid_scores)
  return nms_scores, nms_boxes
Exemple #28
0
def non_max_suppression_padded(scores, boxes, max_output_size, iou_threshold,
                               level):
    """A wrapper that handles non-maximum suppression.

    Assumption:
      * The boxes are sorted by scores unless the box is a dot (all
        coordinates are zero).
      * Boxes with higher scores can be used to suppress boxes with lower
        scores.

    While a serial NMS algorithm iteratively uses the highest-scored
    unprocessed box to suppress boxes, this algorithm uses many boxes to
    suppress other boxes in parallel. The key idea is to partition boxes into
    tiles based on their score and suppresses boxes tile by tile, thus
    achieving parallelism within a tile. The tile size determines the degree
    of parallelism.

    In cross suppression (using boxes of tile A to suppress boxes of tile B),
    all boxes in A can independently suppress boxes in B.

    Self suppression (suppressing boxes of the same tile) needs to be
    iteratively applied until there's no more suppression. In each iteration,
    boxes that cannot be suppressed are used to suppress boxes in the same
    tile.

    boxes = boxes.pad_to_multiply_of(tile_size)
    num_tiles = len(boxes) // tile_size
    output_boxes = []
    for i in range(num_tiles):
      box_tile = boxes[i*tile_size : (i+1)*tile_size]
      for j in range(i - 1):
        # in parallel suppress boxes in box_tile using boxes from
        # suppressing_tile
        suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
        iou = _bbox_overlap(box_tile, suppressing_tile)
        # if the box is suppressed in iou, clear it to a dot
        box_tile *= _update_boxes(iou)
      # Iteratively handle the diagonal tile.
      iou = _box_overlap(box_tile, box_tile)
      iou_changed = True
      while iou_changed:
        # boxes that are not suppressed by anything else
        suppressing_boxes = _get_suppressing_boxes(iou)
        # boxes that are suppressed by suppressing_boxes
        suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
        # clear iou to 0 for boxes that are suppressed, as they cannot be
        # used to suppress other boxes any more
        new_iou = _clear_iou(iou, suppressed_boxes)
        iou_changed = (new_iou != iou)
        iou = new_iou
      # remaining boxes that can still suppress others, are selected boxes.
      output_boxes.append(_get_suppressing_boxes(iou))
      if len(output_boxes) >= max_output_size:
        break

    The per-tile step is delegated to `_suppression_loop_body`, which is run
    as the body of a `tf.while_loop`.

    Args:
      scores: a tensor with a shape of [batch_size, num_boxes]. Not read by
        this function (the boxes are assumed to be already sorted by score);
        kept so that the signature stays unchanged for callers.
      boxes: a tensor with a shape of [batch_size, num_boxes, 4].
      max_output_size: a scalar integer `Tensor` representing the maximum
        number of boxes to be selected by non max suppression.
      iou_threshold: a float representing the threshold for deciding whether
        boxes overlap too much with respect to IOU.
      level: an integer for the level that the function operates on; only
        used to name the enclosing scope ('nms_l<level>').
    Returns:
      idx: a tensor with a shape of [batch_size, max_output_size]
        representing the indices selected by non-max suppression. All numbers
        are within [0, num_boxes). For each image (i.e., idx[i]), only the
        first num_valid[i] indices (i.e., idx[i][:num_valid[i]]) are valid.
      num_valid: a tensor with a shape of [batch_size] representing the
        number of valid indices in idx.
    """
    # The original implementation padded and cast `scores` here but never read
    # the result; only the box geometry drives the suppression.
    del scores

    with tf.name_scope('nms_l%d' % level):
        batch_size = tf.shape(boxes)[0]
        num_boxes = tf.shape(boxes)[1]
        # Pad the box axis up to the next multiple of the tile size; the
        # padding boxes are dots (all zeros).
        pad = tf.cast(tf.ceil(tf.cast(num_boxes, tf.float32) / _NMS_TILE_SIZE),
                      tf.int32) * _NMS_TILE_SIZE - num_boxes
        boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
        num_boxes_after_padding = num_boxes + pad

        def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
            # Continue while some image still has fewer than max_output_size
            # selected boxes and there are unvisited tiles.
            return tf.logical_and(
                tf.reduce_min(output_size) < max_output_size,
                idx < num_boxes_after_padding // _NMS_TILE_SIZE)

        selected_boxes, _, output_size, _ = tf.while_loop(
            _loop_cond, _suppression_loop_body, [
                boxes, iou_threshold,
                tf.zeros([batch_size], tf.int32),
                tf.constant(0)
            ])
        num_valid = tf.minimum(output_size, max_output_size)
        # A surviving box at position p gets the rank
        # num_boxes_after_padding - p and a cleared box the rank 0, so top_k
        # yields the survivors in their original order; subtracting the rank
        # from num_boxes_after_padding recovers the position.
        idx = num_boxes_after_padding - tf.cast(
            tf.nn.top_k(
                tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
                tf.expand_dims(tf.range(num_boxes_after_padding, 0, -1), 0),
                max_output_size)[0], tf.int32)
        # Rank 0 maps past the end: clamp into the unpadded index range.
        idx = tf.minimum(idx, num_boxes - 1)
        return idx, num_valid