def _randomly_negate_tensor(tensor):
  """Flip the sign of `tensor` with probability 0.5.

  Args:
    tensor: any numeric tf.Tensor.

  Returns:
    A tensor equal to either `tensor` or `-tensor`, chosen uniformly at
    random each time the op runs.
  """
  # random_uniform([]) is in [0, 1); adding 0.5 and flooring yields 0 or 1
  # with equal probability, which we reinterpret as a boolean coin flip.
  coin = tf.cast(tf.floor(tf.random_uniform([]) + 0.5), tf.bool)
  return tf.cond(coin, lambda: tensor, lambda: -tensor)
def _gen_mask(shape, drop_prob):
  """Generate a dropout mask.

  Args:
    shape: shape of the mask to generate.
    drop_prob: probability of zeroing out each entry.

  Returns:
    A float32 tensor of the given shape whose entries are either
    1 / keep_prob (kept, rescaled) or 0 (dropped).
  """
  keep_prob = 1. - drop_prob
  # floor(u + keep_prob) for u ~ U[0, 1) is 1 with probability keep_prob and
  # 0 otherwise; dividing by keep_prob rescales survivors (inverted dropout).
  uniform_noise = tf.random_uniform(
      shape, minval=0., maxval=1., dtype=tf.float32)
  return tf.floor(uniform_noise + keep_prob) / keep_prob
def compute_grid_positions(boxes, boundaries, output_size, sample_offset):
  """Compute the grid position w.r.t. the corresponding feature map.

  Args:
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
      information of each box w.r.t. the corresponding feature map.
      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
      in terms of the number of pixels of the corresponding feature map size.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
      the boundary (in (y, x)) of the corresponding feature map for each box.
      Any resampled grid points that go beyond the boundary will be clipped.
    output_size: a scalar indicating the output crop size.
    sample_offset: a float number in [0, 1] indicates the subpixel sample
      offset from grid point.

  Returns:
    kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
    kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
    box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
    box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
  """
  with tf.name_scope('compute_grid_positions'):
    batch_size, num_boxes, _ = boxes.get_shape().as_list()
    if batch_size is None:
      # Static batch size unknown; fall back to the dynamic shape.
      batch_size = tf.shape(boxes)[0]
    box_grid_x = []
    box_grid_y = []
    # Sample `output_size` points along each axis of each box; the i-th sample
    # sits at fraction (i + sample_offset) / output_size of the box extent.
    for i in range(output_size):
      box_grid_x.append(boxes[:, :, 1] +
                        (i + sample_offset) * boxes[:, :, 3] / output_size)
      box_grid_y.append(boxes[:, :, 0] +
                        (i + sample_offset) * boxes[:, :, 2] / output_size)
    box_grid_x = tf.stack(box_grid_x, axis=2)
    box_grid_y = tf.stack(box_grid_y, axis=2)

    # Floor to get the top-left integer neighbor of each sample point, then
    # clamp to [0, boundary]. The +1 neighbors (x1/y1) are derived from the
    # already-clamped x0/y0 and clamped again to the same boundary.
    box_grid_y0 = tf.floor(box_grid_y)
    box_grid_x0 = tf.floor(box_grid_x)
    box_grid_x0 = tf.maximum(0., box_grid_x0)
    box_grid_y0 = tf.maximum(0., box_grid_y0)
    box_grid_x0 = tf.minimum(box_grid_x0,
                             tf.expand_dims(boundaries[:, :, 1], -1))
    box_grid_x1 = tf.minimum(box_grid_x0 + 1,
                             tf.expand_dims(boundaries[:, :, 1], -1))
    box_grid_y0 = tf.minimum(box_grid_y0,
                             tf.expand_dims(boundaries[:, :, 0], -1))
    box_grid_y1 = tf.minimum(box_grid_y0 + 1,
                             tf.expand_dims(boundaries[:, :, 0], -1))
    box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1)
    box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1)

    # The RoIAlign feature f can be computed by bilinear interpolation of four
    # neighboring feature points f0, f1, f2, and f3.
    # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
    #                       [f10, f11]]
    # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
    # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
    ly = box_grid_y - box_grid_y0
    lx = box_grid_x - box_grid_x0
    hy = 1.0 - ly
    hx = 1.0 - lx
    kernel_y = tf.reshape(
        tf.stack([hy, ly], axis=3),
        [batch_size, num_boxes, output_size, 2, 1])
    kernel_x = tf.reshape(
        tf.stack([hx, lx], axis=3),
        [batch_size, num_boxes, output_size, 2, 1])
    return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
def bilinear_sampler(img, x, y):
    """
    Performs bilinear sampling of the input images according to the
    normalized coordinates provided by the sampling grid. Note that
    the sampling is done identically for each channel of the input.

    To test if the function works properly, output image should be
    identical to input image when theta is initialized to identity
    transform.

    Input
    -----
    - img: batch of images in (B, H, W, C) layout.
    - grid: x, y which is the output of affine_grid_generator.
      Coordinates are expected in the normalized range [-1, 1].

    Returns
    -------
    - interpolated images according to grids. Same size as grid.
    """
    # prepare useful params
    # NOTE(review): B and C are computed but unused below; kept as-is.
    B = tf.shape(img)[0]
    H = tf.shape(img)[1]
    W = tf.shape(img)[2]
    C = tf.shape(img)[3]

    max_y = tf.cast(H - 1, 'int32')
    max_x = tf.cast(W - 1, 'int32')
    zero = tf.zeros([], dtype='int32')

    # cast indices as float32 (for rescaling)
    x = tf.cast(x, 'float32')
    y = tf.cast(y, 'float32')

    # rescale x and y from [-1, 1] to [0, W] / [0, H]
    x = 0.5 * ((x + 1.0) * tf.cast(W, 'float32'))
    y = 0.5 * ((y + 1.0) * tf.cast(H, 'float32'))

    # grab 4 nearest corner points for each (x_i, y_i)
    # i.e. we need a rectangle around the point of interest
    x0 = tf.cast(tf.floor(x), 'int32')
    x1 = x0 + 1
    y0 = tf.cast(tf.floor(y), 'int32')
    y1 = y0 + 1

    # clip to range [0, H-1] / [0, W-1] to not violate img boundaries
    x0 = tf.clip_by_value(x0, zero, max_x)
    x1 = tf.clip_by_value(x1, zero, max_x)
    y0 = tf.clip_by_value(y0, zero, max_y)
    y1 = tf.clip_by_value(y1, zero, max_y)

    # get pixel value at corner coords
    Ia = get_pixel_value(img, x0, y0)
    Ib = get_pixel_value(img, x0, y1)
    Ic = get_pixel_value(img, x1, y0)
    Id = get_pixel_value(img, x1, y1)

    # recast as float for delta calculation
    x0 = tf.cast(x0, 'float32')
    x1 = tf.cast(x1, 'float32')
    y0 = tf.cast(y0, 'float32')
    y1 = tf.cast(y1, 'float32')

    # calculate deltas: each weight is the area of the rectangle spanned by
    # the sample point and the corner OPPOSITE the one the weight multiplies.
    wa = (x1 - x) * (y1 - y)
    wb = (x1 - x) * (y - y0)
    wc = (x - x0) * (y1 - y)
    wd = (x - x0) * (y - y0)

    # add dimension (channel axis) for broadcasting against pixel values
    wa = tf.expand_dims(wa, axis=3)
    wb = tf.expand_dims(wb, axis=3)
    wc = tf.expand_dims(wc, axis=3)
    wd = tf.expand_dims(wd, axis=3)

    # compute output as the weighted sum of the four corner values
    out = tf.add_n([wa * Ia, wb * Ib, wc * Ic, wd * Id])

    return out
def op( name, labels, predictions, num_thresholds=None, weights=None, display_name=None, description=None, collections=None, ): """Create a PR curve summary op for a single binary classifier. Computes true/false positive/negative values for the given `predictions` against the ground truth `labels`, against a list of evenly distributed threshold values in `[0, 1]` of length `num_thresholds`. Each number in `predictions`, a float in `[0, 1]`, is compared with its corresponding boolean label in `labels`, and counts as a single tp/fp/tn/fn value at each threshold. This is then multiplied with `weights` which can be used to reweight certain values, or more commonly used for masking values. Args: name: A tag attached to the summary. Used by TensorBoard for organization. labels: The ground truth values. A Tensor of `bool` values with arbitrary shape. predictions: A float32 `Tensor` whose values are in the range `[0, 1]`. Dimensions must match those of `labels`. num_thresholds: Number of thresholds, evenly distributed in `[0, 1]`, to compute PR metrics for. Should be `>= 2`. This value should be a constant integer value, not a Tensor that stores an integer. weights: Optional float32 `Tensor`. Individual counts are multiplied by this value. This tensor must be either the same shape as or broadcastable to the `labels` tensor. display_name: Optional name for this summary in TensorBoard, as a constant `str`. Defaults to `name`. description: Optional long-form description for this summary, as a constant `str`. Markdown is supported. Defaults to empty. collections: Optional list of graph collections keys. The new summary op is added to these collections. Defaults to `[Graph Keys.SUMMARIES]`. Returns: A summary operation for use in a TensorFlow graph. The float32 tensor produced by the summary operation is of dimension (6, num_thresholds). The first dimension (of length 6) is of the order: true positives, false positives, true negatives, false negatives, precision, recall. 
""" # TODO(nickfelt): remove on-demand imports once dep situation is fixed. import tensorflow.compat.v1 as tf if num_thresholds is None: num_thresholds = _DEFAULT_NUM_THRESHOLDS if weights is None: weights = 1.0 dtype = predictions.dtype with tf.name_scope(name, values=[labels, predictions, weights]): tf.assert_type(labels, tf.bool) # We cast to float to ensure we have 0.0 or 1.0. f_labels = tf.cast(labels, dtype) # Ensure predictions are all in range [0.0, 1.0]. predictions = tf.minimum(1.0, tf.maximum(0.0, predictions)) # Get weighted true/false labels. true_labels = f_labels * weights false_labels = (1.0 - f_labels) * weights # Before we begin, flatten predictions. predictions = tf.reshape(predictions, [-1]) # Shape the labels so they are broadcast-able for later multiplication. true_labels = tf.reshape(true_labels, [-1, 1]) false_labels = tf.reshape(false_labels, [-1, 1]) # To compute TP/FP/TN/FN, we are measuring a binary classifier # C(t) = (predictions >= t) # at each threshold 't'. So we have # TP(t) = sum( C(t) * true_labels ) # FP(t) = sum( C(t) * false_labels ) # # But, computing C(t) requires computation for each t. To make it fast, # observe that C(t) is a cumulative integral, and so if we have # thresholds = [t_0, ..., t_{n-1}]; t_0 < ... < t_{n-1} # where n = num_thresholds, and if we can compute the bucket function # B(i) = Sum( (predictions == t), t_i <= t < t{i+1} ) # then we get # C(t_i) = sum( B(j), j >= i ) # which is the reversed cumulative sum in tf.cumsum(). # # We can compute B(i) efficiently by taking advantage of the fact that # our thresholds are evenly distributed, in that # width = 1.0 / (num_thresholds - 1) # thresholds = [0.0, 1*width, 2*width, 3*width, ..., 1.0] # Given a prediction value p, we can map it to its bucket by # bucket_index(p) = floor( p * (num_thresholds - 1) ) # so we can use tf.scatter_add() to update the buckets in one pass. # Compute the bucket indices for each prediction value. 
bucket_indices = tf.cast(tf.floor(predictions * (num_thresholds - 1)), tf.int32) # Bucket predictions. tp_buckets = tf.reduce_sum( input_tensor=tf.one_hot(bucket_indices, depth=num_thresholds) * true_labels, axis=0, ) fp_buckets = tf.reduce_sum( input_tensor=tf.one_hot(bucket_indices, depth=num_thresholds) * false_labels, axis=0, ) # Set up the cumulative sums to compute the actual metrics. tp = tf.cumsum(tp_buckets, reverse=True, name="tp") fp = tf.cumsum(fp_buckets, reverse=True, name="fp") # fn = sum(true_labels) - tp # = sum(tp_buckets) - tp # = tp[0] - tp # Similarly, # tn = fp[0] - fp tn = fp[0] - fp fn = tp[0] - tp precision = tp / tf.maximum(_MINIMUM_COUNT, tp + fp) recall = tp / tf.maximum(_MINIMUM_COUNT, tp + fn) return _create_tensor_summary( name, tp, fp, tn, fn, precision, recall, num_thresholds, display_name, description, collections, )
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              sample_offset=0.5):
  """Crop and resize boxes on a set of feature maps.

  Given multiple features maps indexed by different levels, and a set of boxes
  where each box is mapped to a certain level, it selectively crops and
  resizes boxes from the corresponding feature maps to generate the box
  features.

  We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
  figure 3 for reference). Specifically, for each feature map, we select an
  (output_size, output_size) set of pixels corresponding to the box location,
  and then use bilinear interpolation to select the feature value for each
  pixel.

  For performance, we perform the gather and interpolation on all layers as a
  single operation. In this way, the multi-level features are first stacked
  and gathered into [2*output_size, 2*output_size] feature points. Then
  bilinear interpolation is performed on the gathered feature points to
  generate [output_size, output_size] RoIAlign feature map.

  Here is the step-by-step algorithm:
    1. The multi-level features are gathered into a
       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
       Tensor. The Tensor contains four neighboring feature points for each
       vertice in the output grid.
    2. Compute the interpolation kernel of shape
       [batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
       can be seen as stacking 2x2 interpolation kernels for all vertices in
       the output grid.
    3. Element-wise multiply the gathered features and interpolation kernel.
       Then apply 2x2 average pooling to reduce spatial dimension to
       output_size.

  Args:
    features: a 5-D tensor of shape
      [batch_size, num_levels, max_height, max_width, num_filters] where
      cropping and resizing are based.
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
      information of each box w.r.t. the corresponding feature map.
      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
      in terms of the number of pixels of the corresponding feature map size.
    box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
      the 0-based corresponding feature level index of each box.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
      the boundary (in (y, x)) of the corresponding feature map for each box.
      Any resampled grid points that go beyond the boundary will be clipped.
    output_size: a scalar indicating the output crop size.
    sample_offset: a float number in [0, 1] indicates the subpixel sample
      offset from grid point.

  Returns:
    features_per_box: a 5-D tensor of shape
      [batch_size, num_boxes, output_size, output_size, num_filters]
      representing the cropped features.
  """
  (batch_size, num_levels, max_feature_height, max_feature_width,
   num_filters) = features.get_shape().as_list()
  _, num_boxes, _ = boxes.get_shape().as_list()

  # Compute the grid position w.r.t. the corresponding feature map.
  box_grid_x = []
  box_grid_y = []
  for i in range(output_size):
    box_grid_x.append(boxes[:, :, 1] +
                      (i + sample_offset) * boxes[:, :, 3] / output_size)
    box_grid_y.append(boxes[:, :, 0] +
                      (i + sample_offset) * boxes[:, :, 2] / output_size)
  box_grid_x = tf.stack(box_grid_x, axis=2)
  box_grid_y = tf.stack(box_grid_y, axis=2)

  # Compute indices for gather operation: floor to the top-left neighbor,
  # clamp to [0, boundary], then pair each point with its +1 neighbor.
  box_grid_y0 = tf.floor(box_grid_y)
  box_grid_x0 = tf.floor(box_grid_x)
  box_grid_x0 = tf.maximum(0., box_grid_x0)
  box_grid_y0 = tf.maximum(0., box_grid_y0)
  box_gridx0x1 = tf.stack(
      [tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1)),
       tf.minimum(box_grid_x0 + 1, tf.expand_dims(boundaries[:, :, 1], -1))],
      axis=3)
  box_gridy0y1 = tf.stack(
      [tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1)),
       tf.minimum(box_grid_y0 + 1, tf.expand_dims(boundaries[:, :, 0], -1))],
      axis=3)

  x_indices = tf.cast(
      tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
      dtype=tf.int32)
  y_indices = tf.cast(
      tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
      dtype=tf.int32)

  # Flatten (batch, level, y, x) into a single linear index so one tf.gather
  # over the flattened feature tensor fetches all sample points at once.
  height_dim_offset = max_feature_width
  level_dim_offset = max_feature_height * height_dim_offset
  batch_dim_offset = num_levels * level_dim_offset
  indices = tf.reshape(
      tf.tile(tf.reshape(tf.range(batch_size) * batch_dim_offset,
                         [batch_size, 1, 1, 1]),
              [1, num_boxes, output_size * 2, output_size * 2]) +
      tf.tile(tf.reshape(box_levels * level_dim_offset,
                         [batch_size, num_boxes, 1, 1]),
              [1, 1, output_size * 2, output_size * 2]) +
      tf.tile(tf.reshape(y_indices * height_dim_offset,
                         [batch_size, num_boxes, output_size * 2, 1]),
              [1, 1, 1, output_size * 2]) +
      tf.tile(tf.reshape(x_indices,
                         [batch_size, num_boxes, 1, output_size * 2]),
              [1, 1, output_size * 2, 1]), [-1])

  features = tf.reshape(features, [-1, num_filters])
  features_per_box = tf.reshape(
      tf.gather(features, indices),
      [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])

  # The RoIAlign feature f can be computed by bilinear interpolation of four
  # neighboring feature points f0, f1, f2, and f3.
  # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
  #                       [f10, f11]]
  # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
  # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
  ly = box_grid_y - box_grid_y0
  lx = box_grid_x - box_grid_x0
  hy = 1.0 - ly
  hx = 1.0 - lx
  kernel_x = tf.reshape(tf.stack([hx, lx], axis=3),
                        [batch_size, num_boxes, 1, output_size*2])
  kernel_y = tf.reshape(tf.stack([hy, ly], axis=3),
                        [batch_size, num_boxes, output_size*2, 1])
  # Uses implicit broadcast to generate the interpolation kernel. The
  # multiplier `4` is for avg pooling.
  interpolation_kernel = kernel_y * kernel_x * 4

  # Interpolates the gathered features with computed interpolation kernels.
  features_per_box *= tf.cast(
      tf.expand_dims(interpolation_kernel, axis=4),
      dtype=features_per_box.dtype)
  features_per_box = tf.reshape(
      features_per_box,
      [batch_size * num_boxes, output_size*2, output_size*2, num_filters])
  # The 2x2 average pooling reduces the 2x-oversampled grid to output_size.
  features_per_box = tf.nn.avg_pool(
      features_per_box, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
  features_per_box = tf.reshape(
      features_per_box,
      [batch_size, num_boxes, output_size, output_size, num_filters])

  return features_per_box
def tf_interpolate(voxel, x, y, z, out_size):
    """
    Trilinear interpolation for batch of voxels.

    :param voxel: the whole voxel grid, laid out as
        (batch, height, width, depth, channels)
    :param x, y, z: coordinates at which to sample the voxel grid
    :param out_size: output size of voxel; indices 1..4 are read as
        (out_height, out_width, out_depth, out_channel)
    :return: interpolated voxel values, flattened to
        (num_samples, n_channels)
    """
    batch_size = tf.shape(voxel)[0]
    height = tf.shape(voxel)[1]
    width = tf.shape(voxel)[2]
    depth = tf.shape(voxel)[3]
    n_channels = tf.shape(voxel)[4]

    x = tf.cast(x, 'float32')
    y = tf.cast(y, 'float32')
    z = tf.cast(z, 'float32')

    out_height = out_size[1]
    out_width = out_size[2]
    out_depth = out_size[3]
    # NOTE(review): out_channel is read but unused below; kept as-is.
    out_channel = out_size[4]

    zero = tf.zeros([], dtype='int32')
    max_y = tf.cast(height - 1, 'int32')
    max_x = tf.cast(width - 1, 'int32')
    max_z = tf.cast(depth - 1, 'int32')

    # do sampling: the 8 integer corners surrounding each sample point
    x0 = tf.cast(tf.floor(x), 'int32')
    x1 = x0 + 1
    y0 = tf.cast(tf.floor(y), 'int32')
    y1 = y0 + 1
    z0 = tf.cast(tf.floor(z), 'int32')
    z1 = z0 + 1

    # clip corner indices so they stay inside the voxel grid
    x0 = tf.clip_by_value(x0, zero, max_x)
    x1 = tf.clip_by_value(x1, zero, max_x)
    y0 = tf.clip_by_value(y0, zero, max_y)
    y1 = tf.clip_by_value(y1, zero, max_y)
    z0 = tf.clip_by_value(z0, zero, max_z)
    z1 = tf.clip_by_value(z1, zero, max_z)

    # A 1D tensor of base indices describing the first index for each
    # shape/map in the whole batch:
    #   tf.range(batch_size) * width * height * depth: element to repeat.
    #     Each element in the list is incremented by width*height*depth.
    #   out_height * out_width * out_depth: number of repeats. Creates chunks
    #     of out_height*out_width*out_depth length with the same value
    #     produced by tf.range(batch_size) * width*height*depth.
    base = tf_repeat(tf.range(batch_size) * width * height * depth,
                     out_height * out_width * out_depth)

    # Find the Z element of each index
    base_z0 = base + z0 * width * height
    base_z1 = base + z1 * width * height

    # Find the Y element based on Z
    base_z0_y0 = base_z0 + y0 * width
    base_z0_y1 = base_z0 + y1 * width
    base_z1_y0 = base_z1 + y0 * width
    base_z1_y1 = base_z1 + y1 * width

    # Find the X element based on Y, Z for Z=0
    idx_a = base_z0_y0 + x0
    idx_b = base_z0_y1 + x0
    idx_c = base_z0_y0 + x1
    idx_d = base_z0_y1 + x1
    # Find the X element based on Y, Z for Z=1
    idx_e = base_z1_y0 + x0
    idx_f = base_z1_y1 + x0
    idx_g = base_z1_y0 + x1
    idx_h = base_z1_y1 + x1

    # use indices to lookup pixels in the flat image and restore
    # channels dim
    voxel_flat = tf.reshape(voxel, [-1, n_channels])
    voxel_flat = tf.cast(voxel_flat, 'float32')
    Ia = tf.gather(voxel_flat, idx_a)
    Ib = tf.gather(voxel_flat, idx_b)
    Ic = tf.gather(voxel_flat, idx_c)
    Id = tf.gather(voxel_flat, idx_d)
    Ie = tf.gather(voxel_flat, idx_e)
    If = tf.gather(voxel_flat, idx_f)
    Ig = tf.gather(voxel_flat, idx_g)
    Ih = tf.gather(voxel_flat, idx_h)

    # and finally calculate interpolated values
    x0_f = tf.cast(x0, 'float32')
    x1_f = tf.cast(x1, 'float32')
    y0_f = tf.cast(y0, 'float32')
    y1_f = tf.cast(y1, 'float32')
    z0_f = tf.cast(z0, 'float32')
    z1_f = tf.cast(z1, 'float32')

    # First slice XY along Z where z=0
    wa = tf.expand_dims(((x1_f - x) * (y1_f - y) * (z1_f - z)), 1)
    wb = tf.expand_dims(((x1_f - x) * (y - y0_f) * (z1_f - z)), 1)
    wc = tf.expand_dims(((x - x0_f) * (y1_f - y) * (z1_f - z)), 1)
    wd = tf.expand_dims(((x - x0_f) * (y - y0_f) * (z1_f - z)), 1)
    # Second slice XY along Z where z=1
    we = tf.expand_dims(((x1_f - x) * (y1_f - y) * (z - z0_f)), 1)
    wf = tf.expand_dims(((x1_f - x) * (y - y0_f) * (z - z0_f)), 1)
    wg = tf.expand_dims(((x - x0_f) * (y1_f - y) * (z - z0_f)), 1)
    wh = tf.expand_dims(((x - x0_f) * (y - y0_f) * (z - z0_f)), 1)

    # weighted sum of the 8 corner values
    output = tf.add_n([
        wa * Ia, wb * Ib, wc * Ic, wd * Id,
        we * Ie, wf * If, wg * Ig, wh * Ih
    ])
    return output
def compute_floor_offsets_with_indices(y_source,
                                       x_source,
                                       y_target=None,
                                       x_target=None):
  """Computes offsets from floored source(floored) to target coordinates.

  This function computes the offsets from source coordinates ("floored" as if
  they were put on the grids) to target coordinates. Note that the input
  coordinates should be the "absolute" coordinates in terms of the output
  image dimensions as opposed to the normalized coordinates (i.e. values in
  [0, 1]). If the input y and x source have the second dimension (representing
  the neighboring pixels), then the offsets are computed from each of the
  neighboring pixels to their corresponding target (first dimension).

  Args:
    y_source: A tensor with shape [num_points] (or
      [num_points, num_neighbors]) representing the absolute y-coordinates
      (in the output image space) of the source points.
    x_source: A tensor with shape [num_points] (or
      [num_points, num_neighbors]) representing the absolute x-coordinates
      (in the output image space) of the source points.
    y_target: A tensor with shape [num_points] representing the absolute
      y-coordinates (in the output image space) of the target points. If not
      provided, then y_source is used as the targets.
    x_target: A tensor with shape [num_points] representing the absolute
      x-coordinates (in the output image space) of the target points. If not
      provided, then x_source is used as the targets.

  Returns:
    A tuple of two tensors:
      offsets: A tensor with shape [num_points, 2] (or
        [num_points, num_neighbors, 2]) representing the offsets of each
        input point.
      indices: A tensor with shape [num_points, 2] (or
        [num_points, num_neighbors, 2]) representing the indices of where the
        offsets should be retrieved in the output image dimension space.

  Raise:
    ValueError: source and target shapes have unexpected values.
  """
  floored_y = tf.floor(y_source)
  floored_x = tf.floor(x_source)
  source_shape = shape_utils.combined_static_and_dynamic_shape(y_source)

  if y_target is None and x_target is None:
    # No explicit targets: measure offsets from the floored sources back to
    # the (unfloored) sources themselves.
    y_target, x_target = y_source, x_source
  else:
    target_shape = shape_utils.combined_static_and_dynamic_shape(y_target)
    if len(source_shape) == 2 and len(target_shape) == 1:
      # Sources carry a neighbor axis but targets do not: replicate each
      # target across the neighbor dimension so shapes line up.
      _, num_neighbors = source_shape
      y_target = tf.tile(
          tf.expand_dims(y_target, -1), multiples=[1, num_neighbors])
      x_target = tf.tile(
          tf.expand_dims(x_target, -1), multiples=[1, num_neighbors])
    elif source_shape != target_shape:
      raise ValueError('Inconsistent source and target shape.')

  # Pack (y, x) pairs along a trailing axis for both outputs.
  indices = tf.stack(
      [tf.cast(floored_y, tf.int32), tf.cast(floored_x, tf.int32)], axis=-1)
  offsets = tf.stack([y_target - floored_y, x_target - floored_x], axis=-1)
  return offsets, indices
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              is_gpu_inference=False):
  """Crop and resize boxes on a set of feature maps.

  Given multiple features maps indexed by different levels, and a set of boxes
  where each box is mapped to a certain level, it selectively crops and
  resizes boxes from the corresponding feature maps to generate the box
  features.

  We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
  figure 3 for reference). Specifically, for each feature map, we select an
  (output_size, output_size) set of pixels corresponding to the box location,
  and then use bilinear interpolation to select the feature value for each
  pixel.

  For performance, we perform the gather and interpolation on all layers as a
  single operation. In this way, the multi-level features are first stacked
  and gathered into [2*output_size, 2*output_size] feature points. Then
  bilinear interpolation is performed on the gathered feature points to
  generate [output_size, output_size] RoIAlign feature map.

  Here is the step-by-step algorithm:
    1. The multi-level features are gathered into a
       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
       Tensor. The Tensor contains four neighboring feature points for each
       vertice in the output grid.
    2. Compute the interpolation kernel of shape
       [batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
       can be seen as stacking 2x2 interpolation kernels for all vertices in
       the output grid.
    3. Element-wise multiply the gathered features and interpolation kernel.
       Then apply 2x2 average pooling to reduce spatial dimension to
       output_size.

  Args:
    features: a 5-D tensor of shape
      [batch_size, num_levels, max_height, max_width, num_filters] where
      cropping and resizing are based.
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
      information of each box w.r.t. the corresponding feature map.
      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
      in terms of the number of pixels of the corresponding feature map size.
    box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
      the 0-based corresponding feature level index of each box.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
      the boundary (in (y, x)) of the corresponding feature map for each box.
      Any resampled grid points that go beyond the boundary will be clipped.
    output_size: a scalar indicating the output crop size.
    is_gpu_inference: whether to build the model for GPU inference.

  Returns:
    features_per_box: a 5-D tensor of shape
      [batch_size, num_boxes, output_size, output_size, num_filters]
      representing the cropped features.
  """
  (batch_size, num_levels, max_feature_height, max_feature_width,
   num_filters) = features.get_shape().as_list()
  _, num_boxes, _ = boxes.get_shape().as_list()

  # Compute the grid position w.r.t. the corresponding feature map.
  box_grid_y, box_grid_x = compute_grid_positions(boxes, output_size)

  # Compute indices for gather operation: floor to the top-left neighbor,
  # clamp to [0, boundary], then pair each point with its +1 neighbor.
  box_grid_y0 = tf.floor(box_grid_y)
  box_grid_x0 = tf.floor(box_grid_x)
  box_grid_x0 = tf.maximum(0., box_grid_x0)
  box_grid_y0 = tf.maximum(0., box_grid_y0)
  box_gridx0x1 = tf.stack([
      tf.minimum(box_grid_x0, boundaries[:, :, 1:2]),
      tf.minimum(box_grid_x0 + 1, boundaries[:, :, 1:2])
  ], axis=3)
  box_gridy0y1 = tf.stack([
      tf.minimum(box_grid_y0, boundaries[:, :, 0:1]),
      tf.minimum(box_grid_y0 + 1, boundaries[:, :, 0:1])
  ], axis=3)

  x_indices = (
      tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]))
  y_indices = (
      tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]))

  # If using GPU for inference, delay the cast until when Gather ops show up
  # since GPU inference supports float point better.
  # TODO(laigd): revisit this when newer versions of GPU libraries is
  # released.
  indices_dtype = tf.float32 if is_gpu_inference else tf.int32
  if not is_gpu_inference:
    x_indices = tf.cast(x_indices, tf.int32)
    y_indices = tf.cast(y_indices, tf.int32)

  # Linearize (batch, level, y, x) into one flat index so a single gather
  # over the flattened feature tensor fetches every sample point.
  height_dim_offset = max_feature_width
  level_dim_offset = max_feature_height * height_dim_offset
  batch_dim_offset = num_levels * level_dim_offset

  batch_dim_indices = (
      tf.reshape(
          tf.range(batch_size, dtype=indices_dtype) * batch_dim_offset,
          [batch_size, 1, 1, 1]) *
      tf.ones([1, num_boxes, output_size * 2, output_size * 2],
              dtype=indices_dtype))
  box_level_indices = (
      tf.reshape(box_levels * level_dim_offset,
                 [batch_size, num_boxes, 1, 1]) *
      tf.ones([1, 1, output_size * 2, output_size * 2], dtype=indices_dtype))
  height_indices = (
      tf.reshape(y_indices * height_dim_offset,
                 [batch_size, num_boxes, output_size * 2, 1]) *
      tf.ones([1, 1, 1, output_size * 2], dtype=indices_dtype))
  width_indices = (
      tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]) *
      tf.ones([1, 1, output_size * 2, 1], dtype=indices_dtype))

  # TODO(hongjunchoi): Remove the need for temporary variables as
  # temporary variables with int32 dtype are not supported for GPU's.
  indices = tf.add_n([
      batch_dim_indices,
      box_level_indices,
      height_indices,
      width_indices,
  ])

  if batch_size == 1:
    # Special handling for single batch input to make it friendly for GPU
    # inference.
    indices = tf.reshape(indices, [1, -1])
    if is_gpu_inference:
      indices = tf.cast(indices, dtype=tf.int32)
    features = tf.reshape(features, [1, -1, num_filters])
    # Cast should happen at last since GPU has better support for floating
    # point operations.
    features_per_box = tf.gather(features, indices, axis=1)
  else:
    indices = tf.reshape(indices, [-1])
    if is_gpu_inference:
      indices = tf.cast(indices, dtype=tf.int32)
    features = tf.reshape(features, [-1, num_filters])
    features_per_box = tf.gather(features, indices)

  features_per_box = tf.reshape(
      features_per_box,
      [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])

  # The RoIAlign feature f can be computed by bilinear interpolation of four
  # neighboring feature points f0, f1, f2, and f3.
  # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
  #                       [f10, f11]]
  # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
  # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
  ly = box_grid_y - box_grid_y0
  lx = box_grid_x - box_grid_x0
  hy = 1.0 - ly
  hx = 1.0 - lx
  kernel_x = tf.reshape(
      tf.stack([hx, lx], axis=3), [batch_size, num_boxes, 1, output_size * 2])
  kernel_y = tf.reshape(
      tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size * 2, 1])
  # Use implicit broadcast to generate the interpolation kernel. The
  # multiplier `4` is for avg pooling.
  interpolation_kernel = kernel_y * kernel_x * 4

  # Interpolate the gathered features with computed interpolation kernels.
  features_per_box *= tf.cast(
      tf.expand_dims(interpolation_kernel, axis=4),
      dtype=features_per_box.dtype)
  features_per_box = tf.reshape(
      features_per_box,
      [batch_size * num_boxes, output_size * 2, output_size * 2, num_filters])
  # The 2x2 average pooling reduces the 2x-oversampled grid to output_size.
  features_per_box = tf.nn.avg_pool(features_per_box, [1, 2, 2, 1],
                                    [1, 2, 2, 1], 'VALID')
  features_per_box = tf.reshape(
      features_per_box,
      [batch_size, num_boxes, output_size, output_size, num_filters])

  return features_per_box
def testLoss(self):
    """
    Tests the loss of the FasterRCNN.

    Builds two prediction dicts — one with random predictions and one with
    "perfect" predictions (scores/offsets equal to the targets) — and checks
    that the random losses exceed fixed minimums while the perfect losses
    are exactly zero.
    """
    # Create prediction_dict's structure
    prediction_dict_random = {
        'rpn_prediction': {},
        'classification_prediction': {
            'rcnn': {
                'cls_score': None,
                'bbox_offsets': None
            },
            'target': {},
            '_debug': {
                'losses': {}
            }
        }
    }
    prediction_dict_perf = {
        'rpn_prediction': {},
        'classification_prediction': {
            'rcnn': {
                'cls_score': None,
                'bbox_offsets': None
            },
            'target': {},
            '_debug': {
                'losses': {}
            }
        }
    }

    # Set seeds for stable results
    rand_seed = 13
    target_seed = 43
    image_size = (60, 80)
    num_anchors = 1000

    config = EasyDict(self.config)
    # Disable regularization so losses reflect cls/reg terms only.
    config.model.rpn.l2_regularization_scale = 0.0
    config.model.rcnn.l2_regularization_scale = 0.0
    config.model.base_network.arg_scope.weight_decay = 0.0

    # RPN

    # Random generation of cls_targets for rpn
    # where:
    #     {-1}: Ignore
    #     { 0}: Background
    #     { 1}: Object
    rpn_cls_target = tf.floor(
        tf.random_uniform([num_anchors], minval=-1, maxval=2,
                          dtype=tf.float32, seed=target_seed, name=None))

    # Creation of cls_scores with:
    #     score 100 in correct class
    #     score 0 in wrong class
    # Generation of opposite cls_score for rpn
    rpn_cls_score = tf.cast(
        tf.one_hot(
            tf.cast(tf.mod(tf.identity(rpn_cls_target) + 1, 2), tf.int32),
            depth=2, on_value=10), tf.float32)
    # Generation of correct cls_score for rpn
    rpn_cls_perf_score = tf.cast(
        tf.one_hot(
            tf.cast(tf.identity(rpn_cls_target), tf.int32),
            depth=2, on_value=100), tf.float32)

    # Random generation of target bbox deltas
    rpn_bbox_target = tf.floor(
        tf.random_uniform([num_anchors, 4], minval=-1, maxval=1,
                          dtype=tf.float32, seed=target_seed, name=None))
    # Random generation of predicted bbox deltas
    rpn_bbox_predictions = tf.floor(
        tf.random_uniform([num_anchors, 4], minval=-1, maxval=1,
                          dtype=tf.float32, seed=rand_seed, name=None))

    prediction_dict_random['rpn_prediction'][
        'rpn_cls_score'] = rpn_cls_score
    prediction_dict_random['rpn_prediction'][
        'rpn_cls_target'] = rpn_cls_target
    prediction_dict_random['rpn_prediction'][
        'rpn_bbox_target'] = rpn_bbox_target
    prediction_dict_random['rpn_prediction'][
        'rpn_bbox_pred'] = rpn_bbox_predictions

    # The "perfect" RPN prediction reuses the targets as predictions.
    prediction_dict_perf['rpn_prediction'][
        'rpn_cls_score'] = rpn_cls_perf_score
    prediction_dict_perf['rpn_prediction'][
        'rpn_cls_target'] = rpn_cls_target
    prediction_dict_perf['rpn_prediction'][
        'rpn_bbox_target'] = rpn_bbox_target
    prediction_dict_perf['rpn_prediction'][
        'rpn_bbox_pred'] = rpn_bbox_target

    # RCNN

    # Set the number of classes
    num_classes = config.model.network.num_classes

    # Randomly generate the bbox_offsets for the correct class = 1
    prediction_dict_random['classification_prediction']['target'] = {
        'bbox_offsets': tf.random_uniform([1, 4], minval=-1, maxval=1,
                                          dtype=tf.float32,
                                          seed=target_seed, name=None),
        'cls': [1]
    }

    # Set the same bbox_offsets and cls for the perfect prediction
    prediction_dict_perf['classification_prediction'][
        'target'] = prediction_dict_random['classification_prediction'][
            'target'].copy()

    # Generate random scores for the num_classes + the background class
    rcnn_cls_score = tf.random_uniform([1, num_classes + 1], minval=-100,
                                       maxval=100, dtype=tf.float32,
                                       seed=rand_seed, name=None)
    # Generate a perfect prediction with the correct class score = 100
    # and the rest set to 0
    rcnn_cls_perf_score = tf.cast(
        tf.one_hot([1], depth=num_classes + 1, on_value=100), tf.float32)

    # Generate the random delta prediction for each class
    rcnn_bbox_offsets = tf.random_uniform(
        [1, num_classes * 4], minval=-1, maxval=1, dtype=tf.float32,
        seed=rand_seed, name=None)

    # Copy the random prediction and set the correct class prediction
    # as the target one
    target_bbox_offsets = prediction_dict_random[
        'classification_prediction']['target']['bbox_offsets']
    initial_val = 1 * 4  # cls value * 4
    rcnn_bbox_perf_offsets = tf.Variable(
        tf.reshape(
            tf.random_uniform([1, num_classes * 4], minval=-1, maxval=1,
                              dtype=tf.float32, seed=target_seed,
                              name=None), [-1]))
    # Overwrite the 4 offsets of class 1 with the target offsets so that
    # slice of the prediction is exactly correct.
    rcnn_bbox_perf_offsets = tf.reshape(
        tf.scatter_update(rcnn_bbox_perf_offsets,
                          tf.range(initial_val, initial_val + 4),
                          tf.reshape(target_bbox_offsets, [-1])), [1, -1])

    prediction_dict_random['classification_prediction']['rcnn'][
        'cls_score'] = rcnn_cls_score
    prediction_dict_random['classification_prediction']['rcnn'][
        'bbox_offsets'] = rcnn_bbox_offsets

    prediction_dict_perf['classification_prediction']['rcnn'][
        'cls_score'] = rcnn_cls_perf_score
    prediction_dict_perf['classification_prediction']['rcnn'][
        'bbox_offsets'] = rcnn_bbox_perf_offsets

    loss_perfect = self._get_losses(config, prediction_dict_perf,
                                    image_size)
    loss_random = self._get_losses(config, prediction_dict_random,
                                   image_size)

    # Minimum loss values the random predictions are expected to exceed.
    loss_random_compare = {
        'rcnn_cls_loss': 5,
        'rcnn_reg_loss': 3,
        'rpn_cls_loss': 5,
        'rpn_reg_loss': 3,
        'no_reg_loss': 16,
        'regularization_loss': 0,
        'total_loss': 22,
    }
    for loss in loss_random:
        self.assertGreaterEqual(loss_random[loss],
                                loss_random_compare[loss], loss)
        self.assertEqual(loss_perfect[loss], 0, loss)
def _model_fn(input_fea, input_lab):
  """Creates a model, add summary, modes (train or eval), and hooks.

  Builds a Mesh-TensorFlow UNet graph for either training or evaluation
  (selected by the enclosing `train_or_eval` closure variable), lowers it
  to a TF graph, optionally attaches host-side scalar summaries, and
  captures the checkpoint/restore hooks via `captured_hooks`.

  Fix: `scalars.iteritems()` was Python-2-only and raises AttributeError
  on Python 3; replaced with `.items()`, which behaves identically here.

  Args:
    input_fea: input features; a laid-out tensor or a list of them.
    input_lab: input labels; a laid-out tensor or a list of them.

  Returns:
    In 'train' mode: a tf.group of the loss and all update ops.
    In 'eval' mode: an outfeed-enqueue op (TPU) or the list of prediction
    tensors plus [loss, global_step] (non-TPU).
  """
  # input_fea and input_lab should be a list (laid_out_tensors).
  if not isinstance(input_fea, list):
    input_fea = [input_fea]
  if not isinstance(input_lab, list):
    input_lab = [input_lab]

  def _add_summary(lowering, train_or_eval, tf_loss, scalars, global_step):
    """Add all summaries."""
    # Export any mtf scalar metrics to plain tf.float32 tensors first.
    # NOTE: this mutates the caller's `scalars` dict in place.
    for k in scalars.keys():
      if not isinstance(scalars[k], tf.Tensor):
        scalars[k] = tf.cast(
            lowering.export_to_tf_tensor(scalars[k]), tf.float32)

    def _host_loss_summary(global_step, tf_loss, **scalars):
      """Add summary.scalar in host side."""
      gs = tf.cast(global_step, tf.int64)
      sum_loss = contrib_summary.scalar(
          '{}_loss'.format(train_or_eval), tf_loss, step=gs)
      sum_ops = [sum_loss.op]
      # Was `scalars.iteritems()` (Python 2 only); `.items()` is the
      # Python 2/3-compatible equivalent.
      for description, tf_metric in scalars.items():
        sum_metric = contrib_summary.scalar(
            '{}_{}'.format(train_or_eval, description), tf_metric, step=gs)
        sum_ops.append(sum_metric)
      # Make the returned loss depend on the summary writes so they run.
      with tf.control_dependencies(sum_ops):
        return tf.identity(tf_loss)

    if FLAGS.use_tpu:
      # Cast the global step to tf.int32, since
      # outside_compilation does not support tf.int64.
      tf_loss = tpu.outside_compilation(
          _host_loss_summary, tf.cast(global_step, tf.int32), tf_loss,
          **scalars)
    else:
      tf_loss = _host_loss_summary(
          tf.cast(global_step, tf.int32), tf_loss, **scalars)

    return tf_loss

  global_step = tf.train.get_or_create_global_step()
  graph, mesh, mesh_impl = mesh_context.create_graph_mesh_and_mesh_impl()

  with mtf.utils.outside_all_rewrites():
    # Do not tpu_rewrite this part. Inside this unet, If you use Tensorflow,
    # instead of Mesh-Tensorflor, it will cause host to tpu send/rec.
    preds, loss, scalars, bn_update_ops = (
        unet.unet_with_spatial_partition(
            mesh, mesh_impl, train_or_eval, input_fea, input_lab))

  if train_or_eval == 'train':
    var_grads = mtf.gradients(
        [loss], [v.outputs[0] for v in graph.trainable_variables])

    # Step-decayed learning rate: drop by lr_drop_rate every lr_drop_steps.
    lr = FLAGS.lr * tf.pow(
        FLAGS.lr_drop_rate,
        tf.floor(tf.cast(global_step, tf.float32) / FLAGS.lr_drop_steps))
    scalars['learning_rate'] = lr

    optimizer = mtf.optimize.AdafactorOptimizer(learning_rate=lr)
    update_ops = optimizer.apply_grads(var_grads, graph.trainable_variables)

    # This is where the actual tf graph got built.
    lowering = mtf.Lowering(graph, {mesh: mesh_impl})

    tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
    tf_update_ops.append(tf.assign_add(global_step, 1))
    tf_update_ops.extend(
        [lowering.lowered_operation(op) for op in bn_update_ops])

  else:  # train_or_eval == 'eval':
    preds = [mtf.anonymize(pred) for pred in preds]

    # This is where the actual tf graph got built.
    lowering = mtf.Lowering(graph, {mesh: mesh_impl})

    tf_preds = [
        tf.cast(lowering.export_to_tf_tensor(pred), tf.float32)
        for pred in preds
    ]

  tf_loss = tf.cast(lowering.export_to_tf_tensor(loss), tf.float32)
  if FLAGS.write_summary:
    tf_loss = _add_summary(
        lowering, train_or_eval, tf_loss, scalars, global_step)
  master_to_slice_hook = mtf.MtfRestoreHook(lowering)

  if train_or_eval == 'train':
    with mtf.utils.outside_all_rewrites():
      saver = tf.train.Saver(tf.global_variables(), save_relative_paths=True)
      tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
      saver_listener = mtf.MtfCheckpointSaverListener(lowering)
      slice_to_master_hook = tf.train.CheckpointSaverHook(
          FLAGS.checkpoint_dir,
          save_steps=FLAGS.save_checkpoints_steps,
          saver=saver,
          listeners=[saver_listener])
      captured_hooks.capture([master_to_slice_hook, slice_to_master_hook])
    return tf.group([tf_loss] + tf_update_ops)

  else:  # train_or_eval == 'eval':
    if FLAGS.use_tpu:
      tf_preds.extend([tf_loss, global_step])
      tf_preds_dtypes = [tf_pred.dtype for tf_pred in tf_preds]
      tf_preds_shapes = [tf_pred.shape for tf_pred in tf_preds]
      captured_hooks.capture([master_to_slice_hook, None])
      captured_output_dtypes_shapes.capture(
          [tf_preds_dtypes, tf_preds_shapes])
      return tpu_ops.outfeed_enqueue_tuple(tf_preds)
    else:
      tf_preds.extend([tf_loss, global_step])
      captured_hooks.capture([master_to_slice_hook, None])
      return tf_preds
def bilinear_sampler(imgs, coords):
  """Construct a new image by bilinear sampling from the input image.

  Points falling outside the source image boundary have value 0: corner
  coordinates are clamped to the valid range, and for an out-of-range sample
  the paired interpolation weights cancel, so the contribution sums to zero.

  Args:
    imgs: source image to be sampled from [batch, height_s, width_s, channels]
    coords: coordinates of source pixels to sample from [batch, height_t,
      width_t, 2]. height_t/width_t correspond to the dimensions of the
      output image (don't need to be the same as height_s/width_s). The two
      channels correspond to x and y coordinates respectively.

  Returns:
    A new sampled image [batch, height_t, width_t, channels]
  """

  def _tile_flat(values, count):
    # Repeat every element of `values` `count` times; return flattened.
    ones_row = tf.transpose(
        tf.expand_dims(tf.ones(shape=tf.stack([count])), 1), [1, 0])
    ones_row = tf.cast(ones_row, 'float32')
    tiled = tf.matmul(tf.reshape(values, (-1, 1)), ones_row)
    return tf.reshape(tiled, [-1])

  with tf.name_scope('image_sampling'):
    cx, cy = tf.split(coords, [1, 1], axis=3)
    src_shape = imgs.get_shape()
    tgt_shape = coords.get_shape()
    out_shape = coords.get_shape().as_list()
    out_shape[3] = imgs.get_shape().as_list()[3]

    cx = tf.cast(cx, 'float32')
    cy = tf.cast(cy, 'float32')

    # Integer pixel corners surrounding each (cx, cy) sample position.
    left = tf.floor(cx)
    right = left + 1
    top = tf.floor(cy)
    bottom = top + 1

    max_y = tf.cast(tf.shape(imgs)[1] - 1, 'float32')
    max_x = tf.cast(tf.shape(imgs)[2] - 1, 'float32')
    lo = tf.zeros([1], dtype='float32')

    # Clamp all four corners onto the image grid.
    left_c = tf.clip_by_value(left, lo, max_x)
    top_c = tf.clip_by_value(top, lo, max_y)
    right_c = tf.clip_by_value(right, lo, max_x)
    bottom_c = tf.clip_by_value(bottom, lo, max_y)

    # Bilinear interp weights; for out-of-grid points the two weights along
    # an axis cancel (the clamped corners coincide), giving a 0 result.
    wx0 = right_c - cx
    wx1 = cx - left_c
    wy0 = bottom_c - cy
    wy1 = cy - top_c

    # Flat-index strides: one row is `row_stride` pixels, one image is
    # `img_stride` pixels in the flattened [batch*h*w, channels] view.
    row_stride = tf.cast(src_shape[2], 'float32')
    img_stride = tf.cast(src_shape[2] * src_shape[1], 'float32')
    base = tf.reshape(
        _tile_flat(
            tf.cast(tf.range(tgt_shape[0]), 'float32') * img_stride,
            tgt_shape[1] * tgt_shape[2]),
        [out_shape[0], out_shape[1], out_shape[2], 1])

    base_top = base + top_c * row_stride
    base_bottom = base + bottom_c * row_stride

    flat_imgs = tf.cast(
        tf.reshape(imgs, tf.stack([-1, src_shape[3]])), 'float32')

    def _lookup(flat_idx):
      # Gather pixels at the (float) flat indices, restore output shape.
      return tf.reshape(
          tf.gather(flat_imgs, tf.cast(flat_idx, 'int32')), out_shape)

    im00 = _lookup(tf.reshape(left_c + base_top, [-1]))
    im01 = _lookup(left_c + base_bottom)
    im10 = _lookup(right_c + base_top)
    im11 = _lookup(right_c + base_bottom)

    # Weighted sum of the four corner samples.
    return tf.add_n([
        wx0 * wy0 * im00,
        wx0 * wy1 * im01,
        wx1 * wy0 * im10,
        wx1 * wy1 * im11,
    ])