def _enas_layer(self, layer_id, prev_layers, arc, out_filters):
  """Builds one layer of cells described by `arc` and merges the loose ends.

  For each of `self.num_cells` cells, two inputs are selected from the layers
  built so far (the two calibrated inputs followed by the outputs of earlier
  cells). Each selected input goes through `self._enas_cell` and the two
  results are summed to form the cell output. Layers that no cell selected as
  an input are then concatenated along the channel dimension and projected to
  `out_filters` channels by a 1x1 convolution followed by batch norm.

  Args:
    layer_id: current layer. Not read inside this method.
    prev_layers: sequence of exactly two entries, the cached outputs of the
      previous layers. Each entry is indexed as a rank-4 tensor after
      stacking.
    arc: indexable architecture description holding four entries per cell,
      laid out as `[x_id, x_op, y_id, y_op]`; `*_id` selects an input layer
      and `*_op` is forwarded to `self._enas_cell`.
    out_filters: number of channels per layer; forwarded to
      `self._maybe_calibrate_size` and `self._enas_cell`, and used to size
      the final 1x1 convolution.

  Returns:
    The merged output, reshaped to the shape of the first calibrated input.

  Raises:
    AssertionError: if `prev_layers` does not contain exactly two entries.
    ValueError: if `self.data_format` is neither "NHWC" nor "NCHW".
  """

  assert len(prev_layers) == 2, "need exactly 2 inputs"
  layers = [prev_layers[0], prev_layers[1]]
  layers = self._maybe_calibrate_size(layers, out_filters, is_training=True)
  # One-hot usage vectors, two per cell, summed later to find unused layers.
  used = []
  for cell_id in range(self.num_cells):
    # NOTE: `prev_layers` is rebound here to the stacked tensor of all layers
    # built so far, so it can be indexed by the ids taken from `arc`.
    prev_layers = tf.stack(layers, axis=0)
    with tf.variable_scope("cell_{0}".format(cell_id)):
      with tf.variable_scope("x"):
        x_id = arc[4 * cell_id]
        x_op = arc[4 * cell_id + 1]
        x = prev_layers[x_id, :, :, :, :]
        x = self._enas_cell(x, cell_id, x_id, x_op, out_filters)
        x_used = tf.one_hot(x_id, depth=self.num_cells + 2, dtype=tf.int32)

      with tf.variable_scope("y"):
        y_id = arc[4 * cell_id + 2]
        y_op = arc[4 * cell_id + 3]
        y = prev_layers[y_id, :, :, :, :]
        y = self._enas_cell(y, cell_id, y_id, y_op, out_filters)
        y_used = tf.one_hot(y_id, depth=self.num_cells + 2, dtype=tf.int32)

      out = x + y
      used.extend([x_used, y_used])
      layers.append(out)

  # Indices of layers that were never selected as an input by any cell.
  used = tf.add_n(used)
  indices = tf.where(tf.equal(used, 0))
  indices = tf.to_int32(indices)
  indices = tf.reshape(indices, [-1])
  num_outs = tf.size(indices)
  out = tf.stack(layers, axis=0)
  out = tf.gather(out, indices, axis=0)

  # Move the "which layer" axis next to the channel axis and fold the two
  # together, i.e. concatenate the unused layers along channels.
  inp = prev_layers[0]
  if self.data_format == "NHWC":
    N = tf.shape(inp)[0]
    H = tf.shape(inp)[1]
    W = tf.shape(inp)[2]
    C = tf.shape(inp)[3]  # NOTE(review): assigned but not read afterwards.
    out = tf.transpose(out, [1, 2, 3, 0, 4])
    out = tf.reshape(out, [N, H, W, num_outs * out_filters])
  elif self.data_format == "NCHW":
    N = tf.shape(inp)[0]
    C = tf.shape(inp)[1]  # NOTE(review): assigned but not read afterwards.
    H = tf.shape(inp)[2]
    W = tf.shape(inp)[3]
    out = tf.transpose(out, [1, 0, 2, 3, 4])
    out = tf.reshape(out, [N, num_outs * out_filters, H, W])
  else:
    raise ValueError("Unknown data_format '{0}'".format(self.data_format))

  with tf.variable_scope("final_conv"):
    # One [out_filters x out_filters] slab per possible layer; only the slabs
    # of the unused layers are gathered and assembled into the 1x1 kernel.
    w = create_weight("w", [self.num_cells + 2, out_filters * out_filters])
    w = tf.gather(w, indices, axis=0)
    w = tf.reshape(w, [1, 1, num_outs * out_filters, out_filters])
    out = tf.nn.relu(out)
    out = tf.nn.conv2d(out, w, strides=[1, 1, 1, 1], padding="SAME",
                       data_format=self.data_format)
    out = batch_norm(out, is_training=True, data_format=self.data_format)

  # Restores the shape of the first calibrated input on the result.
  out = tf.reshape(out, tf.shape(prev_layers[0]))

  return out
def piecewise_linear(boundaries, values, name=None):
  """Evaluates a piecewise-linear schedule at the current global step.

  The schedule passes through the points `(boundaries[i], values[i])` and is
  held constant outside of them. The evaluation point `x` is the global step
  returned by `tf.train.get_or_create_global_step()`.

  Args:
    boundaries: A list of `Tensor`s or `int`s or `float`s giving the step
      positions of the knots. Entries are expected to be strictly increasing,
      since the interpolation divides by the gap between neighbours.
    values: A list of `Tensor`s or `float`s or `int`s giving the schedule
      value at each boundary. Must have as many elements as `boundaries`, all
      of the same type.
    name: A string. Optional name of the operation. Defaults to
      'PiecewiseLinear'.

  Returns:
    A 0-D Tensor. Its value is `values[0]` if `x < boundaries[0]` and
    `values[-1]` if `x >= boundaries[-1]`. If
    `boundaries[i] <= x < boundaries[i+1]` it is the linear interpolation
    `values[i] + (values[i+1] - values[i]) * (x - boundaries[i]) /
    (boundaries[i+1] - boundaries[i])`.

  Raises:
    AssertionError: if values or boundaries is empty, or not the same size.
  """
  global_step = tf.train.get_or_create_global_step()
  with tf.name_scope(name, 'PiecewiseLinear',
                     [global_step, boundaries, values, name]):
    knot_values = tf.convert_to_tensor(values)
    step = tf.cast(tf.convert_to_tensor(global_step), knot_values.dtype)
    knots = tf.cast(tf.convert_to_tensor(boundaries), knot_values.dtype)

    num_knots = np.prod(knots.shape.as_list())
    num_knot_values = np.prod(knot_values.shape.as_list())
    assert num_knots > 0, 'Need more than 0 boundaries'
    assert num_knot_values > 0, 'Need more than 0 values'
    assert num_knot_values == num_knots, ('boundaries and values must be of '
                                          'same size')

    # Right padding: repeat the last value at a knot that is guaranteed to
    # lie strictly ahead of the current step, so a "next" knot always exists.
    last_value = tf.reshape(knot_values[-1], [1])
    knot_values = tf.concat([knot_values, last_value], 0)
    right_pad = tf.reshape(tf.maximum(step + 1, knots[-1]), [1])
    knots = tf.concat([knots, right_pad], 0)

    # Left padding: repeat the first value at a knot that is guaranteed to
    # be at or behind the current step, so a "previous" knot always exists.
    first_value = tf.reshape(knot_values[0], [1])
    knot_values = tf.concat([first_value, knot_values], 0)
    left_pad = tf.reshape(tf.minimum(step - 1, knots[0]), [1])
    knots = tf.concat([left_pad, knots], 0)

    # Position of the first knot that is still ahead of the current step.
    ahead = tf.reshape(tf.where(tf.greater(knots, step)), [-1])
    sentinel = tf.cast(tf.size(knots), tf.int64)
    ahead = tf.concat([ahead, [sentinel]], 0)
    right = tf.reshape(tf.reduce_min(ahead), [1])
    left = right - 1

    def _scalar_at(vector, position):
      """Returns the single element of `vector` at `position` as a scalar."""
      return tf.reshape(tf.slice(vector, position, [1]), [])

    value_left = _scalar_at(knot_values, left)
    knot_left = _scalar_at(knots, left)
    value_right = _scalar_at(knot_values, right)
    knot_right = _scalar_at(knots, right)

    # Line through the two surrounding knots, evaluated at the current step.
    slope = (value_right - value_left) / (knot_right - knot_left)
    intercept = value_left - slope * knot_left
    return slope * step + intercept
def parse_train_data(self, data):
  """Parse data for ShapeMask training.

  Builds the augmented input image together with the detection targets
  (anchor class/box targets) and the mask-branch targets (a fixed number of
  sampled, jittered boxes with their cropped binary masks).

  Side effects: stores `image_info[2, :]` in `self._train_image_scale` and
  `image_info[3, :]` in `self._train_offset`.

  Args:
    data: dict that provides at least the keys `groundtruth_classes`,
      `groundtruth_boxes`, `groundtruth_instance_masks` and
      `groundtruth_is_crowd`. It is also forwarded to
      `self.get_normalized_image`.

  Returns:
    image: the resized/cropped image, cast to bfloat16 when
      `self._use_bfloat16` is set.
    labels: dict with keys `cls_targets`, `box_targets`, `anchor_boxes`,
      `num_positives`, `image_info`, `mask_boxes`, `mask_outer_boxes`,
      `mask_targets`, `fine_mask_targets`, `mask_classes` and
      `mask_is_valid`.
  """
  classes = data['groundtruth_classes']
  boxes = data['groundtruth_boxes']
  masks = data['groundtruth_instance_masks']
  is_crowds = data['groundtruth_is_crowd']
  # Skips annotations with `is_crowd` = True.
  if self._skip_crowd_during_training and self._is_training:
    num_groundtrtuhs = tf.shape(classes)[0]
    with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
      # When `is_crowds` is empty, keep every groundtruth index instead of
      # filtering on it.
      indices = tf.cond(
          tf.greater(tf.size(is_crowds), 0),
          lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
          lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
    classes = tf.gather(classes, indices)
    boxes = tf.gather(boxes, indices)
    masks = tf.gather(masks, indices)

  # If not using category, makes all categories with id = 1.
  if not self._use_category:
    classes = tf.cast(tf.greater(classes, 0), dtype=tf.int32)

  image = self.get_normalized_image(data)

  # Flips image randomly during training.
  if self._aug_rand_hflip:
    image, boxes, masks = input_utils.random_horizontal_flip(
        image, boxes, masks)

  # Converts boxes from normalized coordinates to pixel coordinates.
  image_shape = tf.shape(image)[0:2]
  boxes = box_utils.denormalize_boxes(boxes, image_shape)

  # Resizes and crops image.
  image, image_info = input_utils.resize_and_crop_image(
      image,
      self._output_size,
      self._output_size,
      aug_scale_min=self._aug_scale_min,
      aug_scale_max=self._aug_scale_max)
  self._train_image_scale = image_info[2, :]
  self._train_offset = image_info[3, :]

  # Resizes and crops boxes and masks.
  # NOTE: only the boxes are transformed here. The masks are not resized;
  # further down the outer boxes are mapped back with the stored offset and
  # scale before cropping from the masks.
  boxes = input_utils.resize_and_crop_boxes(
      boxes, self._train_image_scale, image_info[1, :], self._train_offset)

  # Filters out ground truth boxes that are all zeros.
  indices = box_utils.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  masks = tf.gather(masks, indices)

  # Assigns anchors.
  input_anchor = anchor.Anchor(
      self._min_level, self._max_level, self._num_scales,
      self._aspect_ratios, self._anchor_size, self._output_size)
  anchor_labeler = anchor.AnchorLabeler(
      input_anchor, self._match_threshold, self._unmatched_threshold)
  (cls_targets,
   box_targets,
   num_positives) = anchor_labeler.label_anchors(
       boxes,
       tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

  # Sample groundtruth masks/boxes/classes for mask branch.
  num_masks = tf.shape(masks)[0]
  mask_shape = tf.shape(masks)[1:3]

  # Randomly shuffle groundtruth masks for mask branch training.
  # The same permutation is applied to boxes, classes and masks so that the
  # three stay aligned.
  rand_indices = tf.random.shuffle(tf.range(num_masks))
  shuffled_boxes = tf.gather(boxes, rand_indices)
  shuffled_classes = tf.gather(classes, rand_indices)
  shuffled_masks = tf.gather(masks, rand_indices)

  # Pad sampled boxes/masks/classes to a constant batch size. If the image
  # has more masks than `num_sampled_masks`, the tensor will be clipped.
  padded_boxes = input_utils.clip_or_pad_to_fixed_size(
      shuffled_boxes, self._num_sampled_masks)
  padded_classes = input_utils.clip_or_pad_to_fixed_size(
      shuffled_classes, self._num_sampled_masks)
  padded_masks = input_utils.clip_or_pad_to_fixed_size(
      shuffled_masks, self._num_sampled_masks)

  # Jitter the sampled boxes to mimic the noisy detections.
  padded_boxes = box_utils.jitter_boxes(
      padded_boxes, noise_scale=self._box_jitter_scale)
  padded_boxes = box_utils.clip_boxes(padded_boxes, self._output_size)

  # Compute mask targets in feature crop. A feature crop fully contains a
  # sampled box.
  mask_outer_boxes = box_utils.compute_outer_boxes(
      padded_boxes, tf.shape(image)[0:2], scale=self._outer_box_scale)
  mask_outer_boxes = box_utils.clip_boxes(mask_outer_boxes, self._output_size)
  # Compensate the offset of mask_outer_boxes to map it back to original image
  # scale.
  # The offset and scale vectors are tiled twice along the last axis so they
  # line up with the four box coordinates. `+=` / `/=` build new tensors, so
  # `mask_outer_boxes` itself is left unchanged.
  mask_outer_boxes_ori = mask_outer_boxes
  mask_outer_boxes_ori += tf.tile(
      tf.expand_dims(self._train_offset, axis=0), [1, 2])
  mask_outer_boxes_ori /= tf.tile(
      tf.expand_dims(self._train_image_scale, axis=0), [1, 2])
  norm_mask_outer_boxes_ori = box_utils.normalize_boxes(
      mask_outer_boxes_ori, mask_shape)

  # Set sampled_masks shape to [batch_size, height, width, 1].
  padded_masks = tf.cast(tf.expand_dims(padded_masks, axis=-1), tf.float32)
  # Box i is cropped from mask i (`box_ind` is simply 0..num_sampled_masks-1).
  mask_targets = tf.image.crop_and_resize(
      padded_masks,
      norm_mask_outer_boxes_ori,
      box_ind=tf.range(self._num_sampled_masks),
      crop_size=[self._mask_crop_size, self._mask_crop_size],
      method='bilinear',
      extrapolation_value=0,
      name='train_mask_targets')
  # Binarizes the bilinearly resampled mask at 0.5.
  mask_targets = tf.where(tf.greater_equal(mask_targets, 0.5),
                          tf.ones_like(mask_targets),
                          tf.zeros_like(mask_targets))
  mask_targets = tf.squeeze(mask_targets, axis=-1)
  if self._up_sample_factor > 1:
    # Same crop at `up_sample_factor` times the resolution.
    # NOTE(review): this op reuses the name 'train_mask_targets' from the
    # coarse crop above.
    fine_mask_targets = tf.image.crop_and_resize(
        padded_masks,
        norm_mask_outer_boxes_ori,
        box_ind=tf.range(self._num_sampled_masks),
        crop_size=[
            self._mask_crop_size * self._up_sample_factor,
            self._mask_crop_size * self._up_sample_factor
        ],
        method='bilinear',
        extrapolation_value=0,
        name='train_mask_targets')
    fine_mask_targets = tf.where(
        tf.greater_equal(fine_mask_targets, 0.5),
        tf.ones_like(fine_mask_targets),
        tf.zeros_like(fine_mask_targets))
    fine_mask_targets = tf.squeeze(fine_mask_targets, axis=-1)
  else:
    fine_mask_targets = mask_targets

  # If bfloat16 is used, casts input image to tf.bfloat16.
  if self._use_bfloat16:
    image = tf.cast(image, dtype=tf.bfloat16)

  # 1 when the image has at least one groundtruth mask, 0 otherwise.
  valid_image = tf.cast(tf.not_equal(num_masks, 0), tf.int32)
  if self._mask_train_class == 'all':
    mask_is_valid = valid_image * tf.ones_like(padded_classes, tf.int32)
  else:
    # Get the intersection of sampled classes with training splits.
    mask_valid_classes = tf.cast(
        tf.expand_dims(
            class_utils.coco_split_class_ids(self._mask_train_class), 1),
        padded_classes.dtype)
    # Compares every allowed class id (rows) against every sampled class
    # (columns); a sample is valid if any row matches.
    match = tf.reduce_any(
        tf.equal(tf.expand_dims(padded_classes, 0), mask_valid_classes), 0)
    mask_is_valid = valid_image * tf.cast(match, tf.int32)

  # Packs labels for model_fn outputs.
  labels = {
      'cls_targets': cls_targets,
      'box_targets': box_targets,
      'anchor_boxes': input_anchor.multilevel_boxes,
      'num_positives': num_positives,
      'image_info': image_info,
      # For ShapeMask.
      'mask_boxes': padded_boxes,
      'mask_outer_boxes': mask_outer_boxes,
      'mask_targets': mask_targets,
      'fine_mask_targets': fine_mask_targets,
      'mask_classes': padded_classes,
      'mask_is_valid': mask_is_valid,
  }
  return image, labels
def variable_gradient_stability_estimate(model,
                                         tape,
                                         losses,
                                         batchsize,
                                         nelem_per_piece=8,
                                         aggregate_variable_estimates=True):
  """Estimate the symmetric alpha-stable tail index of gradient noise.

  The per-instance losses are split into `n // batchsize` groups and a
  gradient is taken for the mean loss of each group. Subtracting the gradient
  of the overall mean loss from every per-group gradient yields zero-centered
  gradient noise samples, which are stacked per variable and handed to the
  tail index estimator.

  Args:
    model: tf.keras.Model.
    tape: tf.GradientTape(persistent=True) that has been used to compute
      losses. It is re-entered here to record the loss reductions.
    losses: Tensor of shape (n,), one loss element per instance.
    batchsize: int, the number of instances per batch.
    nelem_per_piece: int, number of elements to group per block in the tail
      index estimator. Ideally this is around sqrt(n//batchsize).
    aggregate_variable_estimates: bool, if True all estimates in a
      tf.Variable are mean-reduced. If False individual estimates for each
      parameter are computed.

  Returns:
    stability_estimate: list of tf.Tensor objects containing the estimates of
      the tail index (stability == alpha).
  """
  num_instances = int(tf.size(losses))

  # The reductions must be recorded on the tape to be differentiable below.
  with tape:
    loss_total = tf.reduce_mean(losses)
    loss_groups = tf.split(losses, num_instances // batchsize)
    group_losses = [tf.reduce_mean(group) for group in loss_groups]

  total_grads = tape.gradient(loss_total, model.trainable_variables)
  total_grads = _filter_gradient_tensors(total_grads)

  group_grads = [
      tape.gradient(group_loss, model.trainable_variables)
      for group_loss in group_losses
  ]
  group_grads = [_filter_gradient_tensors(grads) for grads in group_grads]

  # Noise sample = per-group gradient minus the full gradient, per variable.
  noise_per_group = [
      [group_grad - total_grad
       for total_grad, group_grad in zip(total_grads, grads)]
      for grads in group_grads
  ]

  # Regroup from [group][variable] to [variable], stacking the groups along a
  # new leading sample axis.
  noise_per_variable = [
      tf.stack(samples) for samples in zip(*noise_per_group)
  ]

  sample_axis = 0
  inverse_alphas = [
      symmetric_alpha_stable_invstability_estimator(
          noise, sample_axis, nelem_per_piece)
      for noise in noise_per_variable
  ]

  if aggregate_variable_estimates:
    return [1.0 / tf.reduce_mean(inv_alpha) for inv_alpha in inverse_alphas]
  return [1.0 / inv_alpha for inv_alpha in inverse_alphas]
def num_unmatched_columns(self):
  """Returns the element count of `self.unmatched_column_indices()`."""
  unmatched_indices = self.unmatched_column_indices()
  return tf.size(unmatched_indices)
def dropblock(net, is_training, keep_prob, dropblock_size,
              data_format='channels_first'):
  """DropBlock: a regularization method for convolutional neural networks.

  DropBlock is a form of structured dropout, where units in a contiguous
  region of a feature map are dropped together. DropBlock works better than
  dropout on convolutional layers due to the fact that activation units in
  convolutional layers are spatially correlated.
  See https://arxiv.org/pdf/1810.12890.pdf for details.

  Args:
    net: `Tensor` input tensor.
    is_training: `bool` for whether the model is training.
    keep_prob: `float` or `Tensor` keep_prob parameter of DropBlock. "None"
      means no DropBlock.
    dropblock_size: `int` size of blocks to be dropped by DropBlock. It is
      capped at the spatial size of `net`.
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last" for `[batch, height, width, channels]`.

  Returns:
    A version of input tensor with DropBlock applied. `net` is returned
    unchanged when `is_training` is false or `keep_prob` is None.

  Raises:
    ValueError: if width and height of the input tensor are not equal.
  """
  if keep_prob is None or not is_training:
    return net

  tf.logging.info(
      'Applying DropBlock: dropblock_size {}, net.shape {}'.format(
          dropblock_size, net.shape))

  channels_last = data_format == 'channels_last'
  static_shape = net.get_shape().as_list()
  if channels_last:
    _, width, height, _ = static_shape
  else:
    _, _, width, height = static_shape
  if width != height:
    raise ValueError('Input tensor with width!=height is not supported.')

  dropblock_size = min(dropblock_size, width)
  # gamma in the DropBlock paper: the rate at which block centers are seeded.
  gamma = (1.0 - keep_prob) * width**2 / dropblock_size**2 / (
      width - dropblock_size + 1)**2

  # A block center is valid only if the whole block fits inside the map.
  lowest_center = int(dropblock_size // 2)
  center_limit = width - (dropblock_size - 1) // 2
  cols, rows = tf.meshgrid(tf.range(width), tf.range(width))
  cols_ok = tf.logical_and(cols >= lowest_center, cols < center_limit)
  rows_ok = tf.logical_and(rows >= lowest_center, rows < center_limit)
  valid_center = tf.logical_and(cols_ok, rows_ok)

  # Add the batch axis and the channel axis so the mask broadcasts over net.
  valid_center = tf.expand_dims(valid_center, 0)
  valid_center = tf.expand_dims(valid_center, -1 if channels_last else 0)

  noise = tf.random_uniform(net.shape, dtype=tf.float32)
  # 1.0 where the unit is kept. Invalid centers get an extra +1 and are
  # therefore always kept; valid centers are kept with probability 1 - gamma.
  not_center = 1 - tf.cast(valid_center, dtype=tf.float32)
  keep_bias = tf.cast((1 - gamma), dtype=tf.float32)
  keep_mask = (not_center + keep_bias + noise) >= 1
  keep_mask = tf.cast(keep_mask, dtype=tf.float32)

  if dropblock_size == width:
    # The block covers the whole map: drop a channel entirely if any of its
    # positions was seeded.
    spatial_axes = [1, 2] if channels_last else [2, 3]
    keep_mask = tf.reduce_min(keep_mask, axis=spatial_axes, keepdims=True)
  else:
    if channels_last:
      window = [1, dropblock_size, dropblock_size, 1]
    else:
      window = [1, 1, dropblock_size, dropblock_size]
    # Min-pooling (negated max-pooling) grows every seeded zero into a block.
    keep_mask = -tf.nn.max_pool(
        -keep_mask,
        ksize=window,
        strides=[1, 1, 1, 1],
        padding='SAME',
        data_format='NHWC' if channels_last else 'NCHW')

  kept_fraction = tf.cast(tf.reduce_sum(keep_mask), tf.float32) / tf.cast(
      tf.size(keep_mask), tf.float32)

  # Rescale by the kept fraction so the expected activation is preserved.
  rescaled = net / tf.cast(kept_fraction, net.dtype)
  return rescaled * tf.cast(keep_mask, net.dtype)
def project_distribution(supports, weights, target_support,
                         validate_args=False):
  """Projects a batch of (support, weights) onto target_support.

  Based on equation (7) in (Bellemare et al., 2017):
    https://arxiv.org/abs/1707.06887
  In the rest of the comments we will refer to this equation simply as Eq7.

  This code is not easy to digest, so we will use a running example to clarify
  what is going on, with the following sample inputs:

    * supports =       [[0, 2, 4, 6, 8],
                        [1, 3, 4, 5, 6]]
    * weights =        [[0.1, 0.6, 0.1, 0.1, 0.1],
                        [0.1, 0.2, 0.5, 0.1, 0.1]]
    * target_support = [4, 5, 6, 7, 8]

  In the code below, comments preceded with 'Ex:' will be referencing the above
  values.

  Args:
    supports: Tensor of shape (batch_size, num_dims) defining supports for the
      distribution.
    weights: Tensor of shape (batch_size, num_dims) defining weights on the
      original support points. Although for the CategoricalDQN agent these
      weights are probabilities, it is not required that they are.
    target_support: Tensor of shape (num_dims) defining support of the
      projected distribution. The values must be monotonically increasing.
      Vmin and Vmax will be inferred from the first and last elements of this
      tensor, respectively. The values in this tensor must be equally spaced.
    validate_args: Whether we will verify the contents of the
      target_support parameter. When True, run-time assertions on shapes,
      monotonicity and equal spacing are added as control dependencies.

  Returns:
    A Tensor of shape (batch_size, num_dims) with the projection of a batch of
    (support, weights) onto target_support.

  Raises:
    ValueError: If target_support has no dimensions, or if shapes of supports,
      weights, and target_support are incompatible.
  """
  target_support_deltas = target_support[1:] - target_support[:-1]
  # delta_z = `\Delta z` in Eq7.
  delta_z = target_support_deltas[0]
  validate_deps = []
  # Static shape checks; these run regardless of `validate_args`.
  supports.shape.assert_is_compatible_with(weights.shape)
  supports[0].shape.assert_is_compatible_with(target_support.shape)
  target_support.shape.assert_has_rank(1)
  if validate_args:
    # Assert that supports and weights have the same shapes.
    validate_deps.append(
        tf.Assert(
            tf.reduce_all(tf.equal(tf.shape(supports), tf.shape(weights))),
            [supports, weights]))
    # Assert that elements of supports and target_support have the same shape.
    validate_deps.append(
        tf.Assert(
            tf.reduce_all(
                tf.equal(tf.shape(supports)[1], tf.shape(target_support))),
            [supports, target_support]))
    # Assert that target_support has a single dimension.
    validate_deps.append(
        tf.Assert(
            tf.equal(tf.size(tf.shape(target_support)), 1), [target_support]))
    # Assert that the target_support is monotonically increasing.
    validate_deps.append(
        tf.Assert(tf.reduce_all(target_support_deltas > 0), [target_support]))
    # Assert that the values in target_support are equally spaced.
    validate_deps.append(
        tf.Assert(
            tf.reduce_all(tf.equal(target_support_deltas, delta_z)),
            [target_support]))

  with tf.control_dependencies(validate_deps):
    # Ex: `v_min, v_max = 4, 8`.
    v_min, v_max = target_support[0], target_support[-1]
    # Ex: `batch_size = 2`.
    batch_size = tf.shape(supports)[0]
    # `N` in Eq7.
    # Ex: `num_dims = 5`.
    num_dims = tf.shape(target_support)[0]
    # clipped_support = `[\hat{T}_{z_j}]^{V_max}_{V_min}` in Eq7.
    # Ex: `clipped_support = [[[ 4.  4.  4.  6.  8.]]
    #                         [[ 4.  4.  4.  5.  6.]]]`.
    clipped_support = tf.clip_by_value(supports, v_min, v_max)[:, None, :]
    # Wrapping `clipped_support` in a list adds a leading axis of size 1, so
    # the tiled tensor is 4-D; that axis is removed by the final reshape.
    # Ex: `tiled_support = [[[[ 4.  4.  4.  6.  8.]
    #                         [ 4.  4.  4.  6.  8.]
    #                         [ 4.  4.  4.  6.  8.]
    #                         [ 4.  4.  4.  6.  8.]
    #                         [ 4.  4.  4.  6.  8.]]
    #                        [[ 4.  4.  4.  5.  6.]
    #                         [ 4.  4.  4.  5.  6.]
    #                         [ 4.  4.  4.  5.  6.]
    #                         [ 4.  4.  4.  5.  6.]
    #                         [ 4.  4.  4.  5.  6.]]]]`.
    tiled_support = tf.tile([clipped_support], [1, 1, num_dims, 1])
    # Ex: `reshaped_target_support = [[[ 4.]
    #                                  [ 5.]
    #                                  [ 6.]
    #                                  [ 7.]
    #                                  [ 8.]]
    #                                 [[ 4.]
    #                                  [ 5.]
    #                                  [ 6.]
    #                                  [ 7.]
    #                                  [ 8.]]]`.
    reshaped_target_support = tf.tile(target_support[:, None], [batch_size, 1])
    reshaped_target_support = tf.reshape(reshaped_target_support,
                                         [batch_size, num_dims, 1])
    # numerator = `|clipped_support - z_i|` in Eq7.
    # Ex: `numerator = [[[[ 0.  0.  0.  2.  4.]
    #                     [ 1.  1.  1.  1.  3.]
    #                     [ 2.  2.  2.  0.  2.]
    #                     [ 3.  3.  3.  1.  1.]
    #                     [ 4.  4.  4.  2.  0.]]
    #                    [[ 0.  0.  0.  1.  2.]
    #                     [ 1.  1.  1.  0.  1.]
    #                     [ 2.  2.  2.  1.  0.]
    #                     [ 3.  3.  3.  2.  1.]
    #                     [ 4.  4.  4.  3.  2.]]]]`.
    numerator = tf.abs(tiled_support - reshaped_target_support)
    quotient = 1 - (numerator / delta_z)
    # clipped_quotient = `[1 - numerator / (\Delta z)]_0^1` in Eq7.
    # Ex: `clipped_quotient = [[[[ 1.  1.  1.  0.  0.]
    #                            [ 0.  0.  0.  0.  0.]
    #                            [ 0.  0.  0.  1.  0.]
    #                            [ 0.  0.  0.  0.  0.]
    #                            [ 0.  0.  0.  0.  1.]]
    #                           [[ 1.  1.  1.  0.  0.]
    #                            [ 0.  0.  0.  1.  0.]
    #                            [ 0.  0.  0.  0.  1.]
    #                            [ 0.  0.  0.  0.  0.]
    #                            [ 0.  0.  0.  0.  0.]]]]`.
    clipped_quotient = tf.clip_by_value(quotient, 0, 1)
    # Ex: `weights = [[ 0.1  0.6  0.1  0.1  0.1]
    #                 [ 0.1  0.2  0.5  0.1  0.1]]`.
    weights = weights[:, None, :]
    # inner_prod = `\sum_{j=0}^{N-1} clipped_quotient * p_j(x', \pi(x'))`
    # in Eq7.
    # Ex: `inner_prod = [[[[ 0.1  0.6  0.1  0.   0. ]
    #                      [ 0.   0.   0.   0.   0. ]
    #                      [ 0.   0.   0.   0.1  0. ]
    #                      [ 0.   0.   0.   0.   0. ]
    #                      [ 0.   0.   0.   0.   0.1]]
    #                     [[ 0.1  0.2  0.5  0.   0. ]
    #                      [ 0.   0.   0.   0.1  0. ]
    #                      [ 0.   0.   0.   0.   0.1]
    #                      [ 0.   0.   0.   0.   0. ]
    #                      [ 0.   0.   0.   0.   0. ]]]]`.
    inner_prod = clipped_quotient * weights
    # Sums over the original support points (last axis), leaving one mass per
    # target atom.
    # Ex: `projection = [[ 0.8  0.0  0.1  0.0  0.1]
    #                    [ 0.8  0.1  0.1  0.0  0.0]]`.
    projection = tf.reduce_sum(inner_prod, 3)
    projection = tf.reshape(projection, [batch_size, num_dims])
    return projection
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            min_score_thresh=MIN_SCORE_THRESH,
                            max_boxes_to_draw=MAX_DETECTIONS_PER_IMAGE,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=True):
  """Generates detections with model outputs and anchors.

  Args:
    cls_outputs: class logits of the N selected top-k anchors over all
      feature levels (k being MAX_DETECTION_POINTS). Both NMS paths below
      treat the sigmoid of this input as a rank-1 score vector of length N.
    box_outputs: array/tensor with shape [N, 4], which stacks box regression
      outputs on all feature levels for the selected anchors.
    anchor_boxes: array/tensor with four columns stacking the anchors of all
      feature levels; rows are selected with `indices`.
    indices: array/tensor with shape [N], the indices from top-k selection.
    classes: array/tensor with shape [N], the class prediction on all
      selected anchors from top-k selection.
    image_id: 1-D tensor holding the image id; it is tiled once per kept
      detection.
    image_scale: a float tensor representing the scale between original image
      and input image for the detector. It is used to rescale detections for
      evaluating with the original groundtruth annotations.
    min_score_thresh: A float representing the threshold for deciding when to
      remove boxes based on score. Only used by the native NMS path.
    max_boxes_to_draw: Max number of boxes to draw. Only used by the native
      NMS path.
    soft_nms_sigma: A scalar float representing the Soft NMS sigma parameter;
      See Bodla et al, https://arxiv.org/abs/1704.04503). When
      `soft_nms_sigma=0.0` (which is default), we fall back to standard (hard)
      NMS. Only used by the native NMS path.
    iou_threshold: A float representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
    use_native_nms: a bool that indicates whether to use native nms.

  Returns:
    detections: detection results in a tensor with each row representing
      [image_id, y, x, height, width, score, class]. Box values are scaled by
      `image_scale` and class ids are shifted by +1.
  """
  logging.info('Using tf version of post-processing.')
  selected_anchors = tf.gather(anchor_boxes, indices)
  probabilities = tf.math.sigmoid(cls_outputs)

  # apply bounding box regression to anchors
  decoded_boxes = decode_box_outputs_tf(
      tf.transpose(box_outputs, [1, 0]),
      tf.transpose(selected_anchors, [1, 0]))

  if not use_native_nms:
    logging.info('Using customized nms.')
    # Pack boxes and scores into [N, 5] rows, as consumed by `nms_tf`.
    score_column = tf.expand_dims(probabilities, axis=1)
    candidates = tf.concat([decoded_boxes, score_column], axis=1)
    top_detection_idx = nms_tf(candidates, iou_threshold)
    survivors = tf.gather(candidates, top_detection_idx)
    kept_scores = survivors[:, 4]
    kept_boxes = survivors[:, :4]
  else:
    logging.info('Using native nms.')
    top_detection_idx, kept_scores = (
        tf.image.non_max_suppression_with_scores(
            decoded_boxes,
            probabilities,
            max_boxes_to_draw,
            iou_threshold=iou_threshold,
            score_threshold=min_score_thresh,
            soft_nms_sigma=soft_nms_sigma))
    kept_boxes = tf.gather(decoded_boxes, top_detection_idx)

  # Columns 0/1 are reported as y/x; columns 2/3 are turned into sizes.
  height = kept_boxes[:, 2] - kept_boxes[:, 0]
  width = kept_boxes[:, 3] - kept_boxes[:, 1]

  num_kept = tf.size(top_detection_idx)
  id_column = tf.cast(tf.tile(image_id, [num_kept]), tf.float32)
  y_column = kept_boxes[:, 0] * image_scale
  x_column = kept_boxes[:, 1] * image_scale
  height_column = height * image_scale
  width_column = width * image_scale
  class_column = tf.cast(tf.gather(classes, top_detection_idx) + 1, tf.float32)

  return tf.stack([
      id_column, y_column, x_column, height_column, width_column,
      kept_scores, class_column
  ], axis=1)
def input_producer(raw_data, batch_size, num_steps, shuffle=False,
                   randomize=False, random_len=False):
  """Produces graph-based input for Penn Treebank.

  The corpus is truncated to a multiple of `batch_size`, reshaped to
  `[batch_size, batch_len]`, and sliced along the second axis into
  `(x, y)` pairs where `y` is `x` shifted by one position.

  Args:
    raw_data: np tensor of size [num_words].
    batch_size: self-explained.
    num_steps: number of BPTT steps.
    shuffle: whether to shuffle sentences. Only used when neither
      `random_len` nor `randomize` is set.
    randomize: use random segments instead of the continuous corpus.
    random_len: random sequence len. Takes precedence over `randomize`.

  Returns:
    If `random_len` is set, the tuple
    `(x, y, num_batches_per_epoch, reset_start_idx, should_reset, base_bptt)`
    where `reset_start_idx` is an op that rewinds the read position to 0,
    `should_reset` tells whether the end of the sequence has been reached and
    `base_bptt` is the mean of the sampled sequence length.
    Otherwise, the tuple `(x, y, num_batches_per_epoch)` with `x` and `y` of
    static shape `[batch_size, num_steps]`.
  """
  # Computed with numpy on the raw input so that it is a plain Python number.
  num_batches_per_epoch = ((np.size(raw_data) // batch_size) - 1) // num_steps
  raw_data = tf.convert_to_tensor(raw_data, name='raw_data', dtype=tf.int32)

  data_len = tf.size(raw_data)
  batch_len = data_len // batch_size
  data = tf.reshape(raw_data[0:batch_size * batch_len],
                    [batch_size, batch_len])

  epoch_size = (batch_len - 1) // num_steps
  with tf.device('/cpu:0'):
    epoch_size = tf.identity(epoch_size, name='epoch_size')

    if random_len:
      # Read position inside each row; advanced on every fetch of x/y.
      start_idx = tf.Variable(0, name='start_idx', dtype=tf.int32,
                              trainable=False)
      # Use the full `num_steps` with probability 0.95, half of it otherwise.
      base_bptt = tf.cond(
          tf.random_uniform(shape=(), minval=0., maxval=1.) < 0.95,
          lambda: tf.cast(num_steps, dtype=tf.float32),
          lambda: tf.cast(num_steps, dtype=tf.float32) / 2.)
      seq_len = tf.random.truncated_normal(shape=(), mean=base_bptt, stddev=5.,
                                           dtype=tf.float32)
      seq_len = tf.cast(seq_len, dtype=tf.int32)
      seq_len = tf.minimum(seq_len, num_steps + 20)  # seq_len <= num_steps + 20
      # Leave room for the one-step-shifted targets.
      seq_len = tf.minimum(seq_len, batch_len - start_idx - 1)
      end_idx = start_idx + seq_len

      x = data[:, start_idx:end_idx]
      y = data[:, start_idx + 1:end_idx + 1]

      # Advance the read position only after x and y have been sliced, and
      # evaluate `should_reset` only after the position has been advanced.
      with tf.control_dependencies([x, y]):
        with tf.control_dependencies([tf.assign(start_idx, end_idx)]):
          should_reset = tf.greater_equal(end_idx, batch_len - 3)

      reset_start_idx = tf.assign(start_idx, 0)

      return (x, y, num_batches_per_epoch, reset_start_idx, should_reset,
              base_bptt)

    if randomize:
      # The offset must be a scalar: it is packed together with the scalar
      # row bounds into the 1-D `begin`/`end` vectors of `tf.strided_slice`,
      # and a shape-[1] tensor cannot be packed with scalars.
      i = tf.random_uniform([], minval=0, maxval=batch_len - num_steps,
                            dtype=tf.int32)
      x = tf.strided_slice(data, [0, i], [batch_size, i + num_steps])
      y = tf.strided_slice(data, [0, i + 1], [batch_size, i + num_steps + 1])
    else:
      i = tf.train.range_input_producer(epoch_size, shuffle=shuffle).dequeue()
      x = tf.strided_slice(
          data, [0, i * num_steps], [batch_size, (i + 1) * num_steps])
      y = tf.strided_slice(
          data, [0, i * num_steps + 1], [batch_size, (i + 1) * num_steps + 1])

    x.set_shape([batch_size, num_steps])
    y.set_shape([batch_size, num_steps])

    return x, y, num_batches_per_epoch
def flatten(samples):
  """Reshapes `samples` into a rank-1 tensor holding all of its elements."""
  num_elements = tf.size(samples)
  return tf.reshape(samples, (num_elements,))
def _count_all_pp(x):
  """Builds an object-counting example from `x`.

  Args:
    x: nested dict providing `x["image"]` and `x["objects"]["type"]`.

  Returns:
    A dict with the unchanged `"image"` and a `"label"` equal to the number of
    entries in `x["objects"]["type"]` minus one, capped at 8.
  """
  num_objects = tf.size(x["objects"]["type"])
  # Count distribution: zero-based count, with everything above 8 merged
  # into the last bucket.
  label = tf.math.minimum(num_objects - 1, 8)
  return {"image": x["image"], "label": label}
def train_step(self):
  """Builds the distributed training step and its summaries.

  Runs `step_fn` through `self.strategy.experimental_run_v2` on the next
  elements of `self.train_input_iterator` and `self.probe_input_iterator`,
  then combines the per-replica results and registers TensorBoard summaries.

  Side effects: `step_fn` stores the current inputs and logits on `self`
  (`probe_images`, `probe_labels`, `aug_images`, `images`, `labels`,
  `logits`), and this method sets `self.epoch_var`.

  Returns:
    The list `[net_loss, meta_loss, xe_loss, cs_loss, mean_acc, mean_metaacc,
    summary, weights]`, where the losses and accuracies are means over
    replicas, `summary` is the merged summary op and `weights` is the
    concatenation of the per-replica example weights.
  """

  def step_fn(inputs):
    """Step function.

    Args:
      inputs: inputs from data iterator, unpacked as
        `((all_images, labels), (probe_images, probe_labels))`.

    Returns:
      a set of variables want to observe in Tensorboard
    """
    net = self.net
    (all_images, labels), (self.probe_images, self.probe_labels) = inputs
    # `all_images` carries two views per example along axis 1: index 0 is
    # used as the training image, index 1 is stored as the augmented image.
    assert len(all_images.shape) == 5
    images, self.aug_images = all_images[:, 0], all_images[:, 1]
    self.images, self.labels = images, labels
    # Per-replica batch size.
    batch_size = int(self.batch_size / self.strategy.num_replicas_in_sync)

    logits = net(images, name='model', reuse=tf.AUTO_REUSE, training=True)
    self.logits = logits

    # other losses
    # initialized first to use self.guessed_label for meta step
    xe_loss, cs_loss = self.unsupervised_loss()

    # meta optimization
    weight, eps, meta_loss, meta_acc = self.meta_optimize()

    ## losses w.r.t new weight and loss
    onehot_labels = tf.one_hot(labels, self.dataset.num_classes)
    onehot_labels = tf.cast(onehot_labels, tf.float32)
    eps_k = tf.reshape(eps, [batch_size, 1])

    # Per-example convex combination of the given label and the guessed
    # label, mixed by `eps`.
    mixed_labels = tf.math.add(
        eps_k * onehot_labels, (1 - eps_k) * self.guessed_label,
        name='mixed_labels')
    net_cost = tf.losses.softmax_cross_entropy(
        mixed_labels, logits, reduction=tf.losses.Reduction.NONE)
    # loss with initial weight
    net_loss1 = tf.reduce_mean(net_cost)

    # loss with initial eps
    init_eps = tf.constant(
        [FLAGS.grad_eps_init] * batch_size, dtype=tf.float32)
    init_eps = tf.reshape(init_eps, (-1, 1))
    init_mixed_labels = tf.math.add(
        init_eps * onehot_labels, (1 - init_eps) * self.guessed_label,
        name='init_mixed_labels')

    net_cost2 = tf.losses.softmax_cross_entropy(
        init_mixed_labels, logits, reduction=tf.losses.Reduction.NONE)
    # Weighted sum (not mean) of the per-example costs.
    net_loss2 = tf.reduce_sum(tf.math.multiply(net_cost2, weight))

    net_loss = (net_loss1 + net_loss2) / 2

    net_loss = net_loss + tf.add_n([xe_loss, cs_loss])
    net_loss += net.regularization_loss
    net_loss /= self.strategy.num_replicas_in_sync  # rescale by gpus

    # Gradients are applied only after the ops in `net.updates` have run.
    with tf.control_dependencies(net.updates):
      net_grads = tf.gradients(net_loss, net.trainable_variables)
      minimizer_op = self.optimizer.apply_gradients(
          zip(net_grads, net.trainable_variables),
          global_step=self.global_step)

    # The moving averages are refreshed after the optimizer step.
    with tf.control_dependencies([minimizer_op]):
      train_op = self.ema.apply(net.trainable_variables)

    acc_op, acc_update_op = self.acc_func(labels, tf.argmax(logits, axis=1))

    # `tf.identity` ties every returned tensor to the training and metric
    # update ops, so fetching any of them runs the whole step.
    with tf.control_dependencies([train_op, acc_update_op]):
      return (tf.identity(net_loss), tf.identity(xe_loss),
              tf.identity(cs_loss), tf.identity(meta_loss),
              tf.identity(meta_acc), tf.identity(acc_op), tf.identity(weight),
              tf.identity(labels))

  # end of parallel
  (pr_net_loss, pr_xe_loss, pr_cs_loss, pr_metaloss, pr_metaacc, pr_acc,
   pr_weight, pr_labels) = self.strategy.experimental_run_v2(
       step_fn,
       args=((next(self.train_input_iterator),
              next(self.probe_input_iterator)),))

  # collect device variables
  weights = self.strategy.unwrap(pr_weight)
  weights = tf.concat(weights, axis=0)
  # NOTE(review): `labels` is gathered here but not used further below.
  labels = self.strategy.unwrap(pr_labels)
  labels = tf.concat(labels, axis=0)

  mean_acc = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_acc)
  mean_metaacc = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_metaacc)
  net_loss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_net_loss)
  xe_loss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_xe_loss)
  cs_loss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_cs_loss)
  meta_loss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_metaloss)

  # The following add variables for tensorboard visualization
  merges = []
  merges.append(tf.summary.scalar('acc/train', mean_acc))
  merges.append(tf.summary.scalar('loss/xemin', xe_loss))
  merges.append(tf.summary.scalar('loss/consistency', cs_loss))
  merges.append(tf.summary.scalar('loss/net', net_loss))
  merges.append(tf.summary.scalar('loss/meta', meta_loss))
  merges.append(tf.summary.scalar('acc/meta', mean_metaacc))

  # Fraction of examples whose weight is less than or equal to zero.
  zw_inds = tf.squeeze(
      tf.where(tf.less_equal(weights, 0), name='zero_weight_index'))
  merges.append(
      tf.summary.scalar(
          'weights/zeroratio',
          tf.math.divide(
              tf.cast(tf.size(zw_inds), tf.float32),
              tf.cast(tf.size(weights), tf.float32))))

  self.epoch_var = tf.cast(
      self.global_step / self.iter_epoch, tf.float32, name='epoch')
  merges.append(tf.summary.scalar('epoch', self.epoch_var))
  merges.append(tf.summary.scalar('learningrate', self.learning_rate))
  summary = tf.summary.merge(merges)

  return [
      net_loss, meta_loss, xe_loss, cs_loss, mean_acc, mean_metaacc, summary,
      weights
  ]
def _parse_train_data(self, data):
    """Builds the `(image, labels)` pair for one decoded example.

    Args:
        data: dict of decoded tensors. This method reads the keys
            'groundtruth_classes', 'groundtruth_boxes',
            'groundtruth_is_crowd' and 'image'.

    Returns:
        A tuple `(image, labels)`. `labels` is a dict with the keys
        'cls_targets', 'box_targets', 'anchor_boxes', 'num_positives' and
        'image_info'.

    Raises:
        ImportError: if autoaugment is enabled but `autoaugment_utils`
            cannot be imported.
    """
    gt_classes = data['groundtruth_classes']
    gt_boxes = data['groundtruth_boxes']
    crowd_flags = data['groundtruth_is_crowd']

    # Drop every annotation whose `is_crowd` flag is set.
    if self._skip_crowd_during_training and self._is_training:
        num_gt = tf.shape(gt_classes)[0]

        def _non_crowd_indices():
            return tf.where(tf.logical_not(crowd_flags))[:, 0]

        def _all_indices():
            # No crowd flags were provided at all: keep everything.
            return tf.cast(tf.range(num_gt), tf.int64)

        with tf.control_dependencies([num_gt, crowd_flags]):
            keep = tf.cond(
                tf.greater(tf.size(crowd_flags), 0),
                _non_crowd_indices,
                _all_indices)
        gt_classes = tf.gather(gt_classes, keep)
        gt_boxes = tf.gather(gt_boxes, keep)

    # Original image; its size is read after the optional autoaugment step.
    image = data['image']

    # NOTE: The autoaugment method works best when used alongside the
    # standard horizontal flipping of images along with size jittering and
    # normalization.
    if self._use_autoaugment:
        try:
            from utils import autoaugment_utils  # pylint: disable=g-import-not-at-top
        except ImportError as err:
            logging.exception('Autoaugment is not supported in TF 2.x.')
            raise err

        image, gt_boxes = autoaugment_utils.distort_image_with_autoaugment(
            image, gt_boxes, self._autoaugment_policy_name)

    original_hw = tf.shape(image)[0:2]

    # Normalize pixel values with mean and std.
    image = input_utils.normalize_image(image)

    # Optional random horizontal flip.
    if self._aug_rand_hflip:
        image, gt_boxes = input_utils.random_horizontal_flip(image, gt_boxes)

    # Normalized -> pixel coordinates, still relative to the original image.
    gt_boxes = box_utils.denormalize_boxes(gt_boxes, original_hw)

    # Resize and crop the image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(
            self._output_size, 2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    scaled_height, scaled_width, _ = image.get_shape().as_list()
    scaled_hw = (scaled_height, scaled_width)

    # Apply the same resize/crop to the boxes; they are now relative to the
    # scaled image. Rows 2 and 3 of `image_info` carry scale and offset.
    gt_boxes = input_utils.resize_and_crop_boxes(
        gt_boxes, image_info[2, :], scaled_hw, image_info[3, :])

    # Remove ground-truth boxes that are all zeros.
    non_empty = input_utils.get_non_empty_box_indices(gt_boxes)
    gt_boxes = tf.gather(gt_boxes, non_empty)
    gt_classes = tf.gather(gt_classes, non_empty)

    # Assign anchor targets. After assignment the box targets are absolute
    # pixel offsets w.r.t. the scaled image.
    input_anchor = anchor.Anchor(
        self._min_level, self._max_level, self._num_scales,
        self._aspect_ratios, self._anchor_size, scaled_hw)
    labeler = anchor.AnchorLabeler(
        input_anchor, self._match_threshold, self._unmatched_threshold)
    cls_targets, box_targets, num_positives = labeler.label_anchors(
        gt_boxes, tf.cast(tf.expand_dims(gt_classes, axis=1), tf.float32))

    # Cast the input image when bfloat16 is requested.
    if self._use_bfloat16:
        image = tf.cast(image, dtype=tf.bfloat16)

    # Pack labels for model_fn outputs.
    return image, {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': input_anchor.multilevel_boxes,
        'num_positives': num_positives,
        'image_info': image_info,
    }
def GetEmbeddingLookupList(signals_list,
                           embedding_vars,
                           sparse_ids,
                           sparse_weights=None,
                           combiners='sqrtn',
                           partition_strategies='mod'):
    """Get a list of weighted embedding lookup tensors, one per signal.

    Args:
        signals_list: A list of strings, representing names of features.
        embedding_vars: Dict mapping feature names to full embedding
            variables.
        sparse_ids: Dict mapping feature names to SparseTensors of their ids.
        sparse_weights: Either None, or a dict mapping feature names to
            SparseTensors of their weights (which can also be None).
        combiners: Either a common combiner type for all features ('mean',
            'sqrtn' or 'sum') or a dict mapping each feature name to a
            combiner type.
        partition_strategies: Either a common partition strategy for all
            features ('mod' or 'div') or a dict mapping feature names to
            partition strategies.

    Returns:
        embedding_lookup_list: A list of embedding lookup tensors used for
        bag of words attribution, aligned with signals_list.
    """
    assert isinstance(embedding_vars, dict) and isinstance(sparse_ids, dict)
    assert sparse_weights is None or isinstance(sparse_weights, dict)
    assert combiners in ('mean', 'sqrtn', 'sum') or isinstance(combiners, dict)
    assert (partition_strategies in ('mod', 'div') or
            isinstance(partition_strategies, dict))

    lookups = []
    for signal in signals_list:
        if isinstance(combiners, dict):
            combiner = combiners[signal]
        else:
            combiner = combiners
        if isinstance(partition_strategies, dict):
            strategy = partition_strategies[signal]
        else:
            strategy = partition_strategies

        ids = sparse_ids[signal]
        # Batch dimension should be 1 for attribution.
        ids_batch_is_one = tf.assert_equal(tf.shape(ids)[0], 1)
        with tf.control_dependencies([ids_batch_is_one]):
            lookup = tf.nn.embedding_lookup(
                params=embedding_vars[signal],
                ids=tf.sparse_tensor_to_dense(ids),
                partition_strategy=strategy)

        if sparse_weights is None or sparse_weights[signal] is None:
            # No explicit weights: every id gets the same weight, chosen so
            # that the combiner's normalization is reproduced.
            count = tf.size(ids.values)
            if combiner == 'mean':
                weights = tf.fill([1, count], 1.0 / tf.to_float(count))
            elif combiner == 'sqrtn':
                weights = tf.fill([1, count],
                                  1.0 / tf.sqrt(tf.to_float(count)))
            else:
                weights = tf.ones([1, count], dtype=tf.float32)
        else:
            signal_weights = sparse_weights[signal]
            # Batch dimension should be 1 for attribution.
            weights_batch_is_one = tf.assert_equal(
                tf.shape(signal_weights)[0], 1)
            with tf.control_dependencies([weights_batch_is_one]):
                dense_weights = tf.sparse_tensor_to_dense(signal_weights)
            if combiner == 'mean':
                weights = dense_weights / tf.reduce_sum(dense_weights)
            elif combiner == 'sqrtn':
                squared_norm = tf.reduce_sum(tf.pow(dense_weights, 2))
                weights = dense_weights / tf.sqrt(squared_norm)
            else:
                weights = dense_weights

        # Broadcast the per-id weight over the embedding dimension.
        lookups.append(lookup * tf.expand_dims(weights, -1))
    return lookups
def _update_mask(self, weights, threshold, gradients):  # pylint: disable=unused-argument
    """Builds a new 0/1 mask that keeps the top-scoring entries of `weights`.

    Each entry is scored by `|weights|` (option 'weight') or by
    `|weights| * gradients` (options 'first_order_gradient' and
    'second_order_gradient'). The `round(size * (1 - sparsity))` highest
    scores are kept. Entries are randomly permuted before sorting so that
    ties are broken uniformly at random.

    Args:
        weights: The weight tensor that needs to be masked.
        threshold: The current threshold value (not used by this method).
        gradients: The gradient tensor that is used for salience
            calculation. Required for the gradient-based prune options.

    Returns:
        new_threshold: Always a float32 constant 0.
        new_mask: A float32 tensor with the shape of `weights`, holding 1
            for kept entries and 0 for pruned ones.

    Raises:
        ValueError: if sparsity is not defined, if the prune option is
            unknown, or if a gradient-based option is selected while
            `gradients` is None.
    """
    if self._sparsity is None:
        raise ValueError('Sparsity variable undefined')

    sparsity = self._get_sparsity(weights.op.name)
    with tf.name_scope(weights.op.name + '_pruning_ops'):
        tf.logging.info('Applying option %s pruning', self._spec.prune_option)
        if self._spec.prune_option == 'weight':
            scores = tf.abs(weights)
        elif self._spec.prune_option in ('first_order_gradient',
                                         'second_order_gradient'):
            if gradients is None:
                raise ValueError('gradient tensor cannot be None.')
            # gradient variable stores absolute value already
            scores = tf.multiply(tf.abs(weights), gradients)
        else:
            raise ValueError('undefined option')

        # Number of entries that stay non-zero.
        num_kept = tf.cast(
            tf.round(tf.cast(tf.size(scores), tf.float32) * (1 - sparsity)),
            tf.int32)

        # Random permutation, so the tie-breaker on equal scores is uniform.
        permutation = tf.reshape(
            tf.random_shuffle(tf.range(tf.size(scores))), [-1, 1])

        # Flatten the scores and scatter them to their permuted positions.
        flat_scores = tf.reshape(scores, [-1])
        permuted_scores = tf.scatter_nd(
            permutation, flat_scores, tf.shape(flat_scores))

        # Full descending sort of the permuted scores.
        _, descending = tf.nn.top_k(permuted_scores,
                                    k=tf.size(permuted_scores))

        # Rank-ordered mask: the first `num_kept` ranks are ones.
        ranks = tf.range(tf.size(permuted_scores))
        rank_mask = tf.cast(tf.less(ranks, num_kept), tf.float32)

        # Move each rank's flag to the (permuted) position it came from.
        permuted_mask = tf.scatter_nd(
            tf.reshape(descending, [-1, 1]), rank_mask, tf.shape(rank_mask))

        # Undo the permutation and restore the weight tensor's shape.
        new_mask = tf.reshape(
            tf.gather_nd(permuted_mask, permutation), tf.shape(weights))
    return tf.constant(0, tf.float32), new_mask
def decode(self, tf_seq_example_string_tensor):
    """Decodes a serialized `tf.SequenceExample` into a tensor dictionary.

    Args:
        tf_seq_example_string_tensor: a string tensor holding a serialized
            `tf.SequenceExample`.

    Returns:
        A dictionary with (at least) the following tensors:
        fields.InputDataFields.source_id: a [num_frames] string tensor with a
            unique ID for each frame.
        fields.InputDataFields.num_groundtruth_boxes: a [num_frames] int32
            tensor specifying the number of boxes in each frame.
        fields.InputDataFields.groundtruth_boxes: a [num_frames, num_boxes, 4]
            float32 tensor with bounding boxes for each frame. Note that
            num_boxes is the maximum boxes seen in any individual frame. Any
            frames with fewer boxes are padded with 0.0.
        fields.InputDataFields.groundtruth_classes: a [num_frames, num_boxes]
            int32 tensor with class indices for each box in each frame.
        fields.InputDataFields.groundtruth_weights: a [num_frames, num_boxes]
            float32 tensor with weights of the groundtruth boxes.
        fields.InputDataFields.is_annotated: a [num_frames] bool tensor
            specifying whether the image was annotated or not. If False, the
            corresponding entries in the groundtruth tensor will be ignored.
        fields.InputDataFields.context_features - 1D float32 tensor of shape
            [context_feature_length * num_context_features]
        fields.InputDataFields.context_feature_length - int32 tensor
            specifying the length of each feature in context_features
        fields.InputDataFields.image: a [num_frames] string tensor with
            the encoded images.
    """
    idf = fields.InputDataFields

    serialized_example = tf.reshape(tf_seq_example_string_tensor, shape=[])
    decoder = slim_example_decoder.TFSequenceExampleDecoder(
        self._context_keys_to_features,
        self._sequence_keys_to_feature_lists,
        self._items_to_handlers)
    keys = decoder.list_items()
    tensor_dict = dict(zip(keys, decoder.decode(serialized_example,
                                                items=keys)))

    tensor_dict[idf.groundtruth_boxes].set_shape([None, None, 4])
    for int_key in (idf.num_groundtruth_boxes, idf.groundtruth_classes):
        tensor_dict[int_key] = tf.cast(tensor_dict[int_key], dtype=tf.int32)

    # Replace the separate height/width entries with one stacked shape entry.
    height = tensor_dict.pop(idf.image_height)
    width = tensor_dict.pop(idf.image_width)
    tensor_dict[idf.original_image_spatial_shape] = tf.cast(
        tf.stack([height, width]), dtype=tf.int32)

    def default_groundtruth_weights():
        """Produces weights of 1.0 for each valid box, and 0.0 otherwise."""
        boxes_per_frame = tensor_dict[idf.num_groundtruth_boxes]
        max_boxes = tf.reduce_max(boxes_per_frame)
        # [num_frames, max_boxes] grid of per-frame box counts ...
        counts_grid = tf.tile(
            tf.expand_dims(boxes_per_frame, axis=-1),
            multiples=tf.stack([1, max_boxes]))
        # ... compared against a grid of box positions 0..max_boxes-1.
        positions_grid = tf.tile(
            tf.expand_dims(tf.range(max_boxes), axis=0),
            multiples=tf.stack([tf.shape(boxes_per_frame)[0], 1]))
        return tf.cast(tf.greater(counts_grid, positions_grid), tf.float32)

    # Use the decoded weights when present, otherwise derive them from the
    # per-frame box counts.
    decoded_weights = tensor_dict[idf.groundtruth_weights]
    tensor_dict[idf.groundtruth_weights] = tf.cond(
        tf.greater(tf.size(decoded_weights), 0),
        lambda: decoded_weights,
        default_groundtruth_weights)

    if self._fully_annotated:
        annotated = tf.ones_like(
            tensor_dict[idf.num_groundtruth_boxes], dtype=tf.bool)
    else:
        annotated = tf.cast(tensor_dict[idf.is_annotated], dtype=tf.bool)
    tensor_dict[idf.is_annotated] = annotated

    return tensor_dict
def _buckets(data, bucket_count=None):
    """Create a TensorFlow op to group data into histogram buckets.

    Arguments:
      data: A `Tensor` of any shape. Must be castable to `float64`.
      bucket_count: Optional positive `int` or scalar `int32` `Tensor`.
        Defaults to `summary_v2.DEFAULT_BUCKET_COUNT`.

    Returns:
      A `Tensor` of shape `[k, 3]` and type `float64`. The `i`th row is
      a triple `[left_edge, right_edge, count]` for a single bucket.
      The value of `k` is `bucket_count` in the general case, `1` when all
      values are equal, and `0` when `data` is empty.
    """
    # TODO(nickfelt): remove on-demand imports once dep situation is fixed.
    import tensorflow.compat.v1 as tf

    if bucket_count is None:
        bucket_count = summary_v2.DEFAULT_BUCKET_COUNT

    with tf.name_scope("buckets", values=[data, bucket_count]):
        preconditions = [
            tf.assert_scalar(bucket_count),
            tf.assert_type(bucket_count, tf.int32),
        ]
        with tf.control_dependencies(preconditions):
            # Flatten, then work in float64 throughout.
            flat = tf.cast(tf.reshape(data, shape=[-1]), tf.float64)
            no_values = tf.equal(tf.size(input=flat), 0)

            def _empty_histogram():
                return tf.constant([], shape=(0, 3), dtype=tf.float64)

            def _nonempty_histogram():
                lo = tf.reduce_min(input_tensor=flat)
                hi = tf.reduce_max(input_tensor=flat)
                span = hi - lo
                all_equal = tf.equal(span, 0)

                def _spread_buckets():
                    width = span / tf.cast(bucket_count, tf.float64)
                    raw_index = tf.cast(
                        tf.floor((flat - lo) / width), dtype=tf.int32)
                    # The maximum value lands exactly on the last right edge;
                    # fold it into the last bucket.
                    index = tf.minimum(raw_index, bucket_count - 1)
                    # Use float64 instead of float32 to avoid accumulating
                    # floating point error later in tf.reduce_sum when summing
                    # more than 2^24 individual `1.0` values.
                    # See https://github.com/tensorflow/tensorflow/issues/51419
                    # for details.
                    one_hots = tf.one_hot(
                        index, depth=bucket_count, dtype=tf.float64)
                    counts = tf.cast(
                        tf.reduce_sum(input_tensor=one_hots, axis=0),
                        dtype=tf.float64,
                    )
                    edges = tf.linspace(lo, hi, bucket_count + 1)
                    return tf.transpose(
                        a=tf.stack([edges[:-1], edges[1:], counts]))

                def _single_bucket():
                    # All values identical: one unit-width bucket centered on
                    # that value, holding every element.
                    left = tf.stack([lo - 0.5])
                    right = tf.stack([lo + 0.5])
                    count = tf.stack(
                        [tf.cast(tf.size(input=flat), tf.float64)])
                    return tf.transpose(a=tf.stack([left, right, count]))

                return tf.cond(all_equal, _single_bucket, _spread_buckets)

            return tf.cond(no_values, _empty_histogram, _nonempty_histogram)
def get_iterator(src_dataset,
                 tgt_dataset,
                 src_vocab_table,
                 tgt_vocab_table,
                 batch_size,
                 global_batch_size,
                 sos,
                 eos,
                 random_seed,
                 num_buckets,
                 src_max_len=None,
                 tgt_max_len=None,
                 num_parallel_calls=4,
                 output_buffer_size=None,
                 skip_count=None,
                 num_shards=1,
                 shard_index=0,
                 reshuffle_each_iteration=True,
                 filter_oversized_sequences=False,
                 return_raw=False):
    """Function that returns input dataset.

    Zips the source and target datasets, tokenizes each line with
    `tf.string_split`, maps tokens to vocabulary ids, adds sos/eos markers
    and sequence lengths, then shuffles, repeats and batches the result.

    Args:
      src_dataset: Dataset of source lines (zipped element-wise with
        `tgt_dataset`).
      tgt_dataset: Dataset of target lines.
      src_vocab_table: Lookup table mapping source tokens (and `eos`) to ids.
      tgt_vocab_table: Lookup table mapping target tokens (and `sos`/`eos`)
        to ids.
      batch_size: Batch size passed to `padded_batch`.
      global_batch_size: Window size for `group_by_window`; also used to
        derive the default `output_buffer_size` (times 100).
      sos: Start-of-sentence token string.
      eos: End-of-sentence token string.
      random_seed: Seed passed to `shuffle`.
      num_buckets: When greater than 1, examples are grouped into length
        buckets before batching.
      src_max_len: Optional maximum source length. Sources are truncated to
        `src_max_len - 2` tokens before the two marker ids are added. Must
        be set when `filter_oversized_sequences` or `return_raw` is used,
        since both do arithmetic on it.
      tgt_max_len: Optional maximum target length. Same requirement as
        `src_max_len` for `filter_oversized_sequences` and `return_raw`.
      num_parallel_calls: Parallelism for the `map` stages.
      output_buffer_size: Prefetch/shuffle buffer size.
      skip_count: Optional number of leading examples to skip.
      num_shards: Number of shards passed to `Dataset.shard`.
      shard_index: Index of this shard.
      reshuffle_each_iteration: Forwarded to `shuffle`.
      filter_oversized_sequences: When true, drops pairs longer than
        `src_max_len - 2` / `tgt_max_len - 1` instead of only truncating.
      return_raw: When true, returns early with each example flattened into
        a single 1-D tensor, batched in groups of 1024.

    Returns:
      If `return_raw`, a dataset of batched flat int tensors. Otherwise a
      dataset of dicts with keys "source", "target_input", "target_output",
      "source_sequence_length" and "target_sequence_length".
    """
    # Total number of examples in src_dataset/tgt_dataset
    if not output_buffer_size:
        output_buffer_size = global_batch_size * 100

    src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32)
    tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32)
    tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32)

    src_tgt_dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset))

    src_tgt_dataset = src_tgt_dataset.shard(num_shards, shard_index)
    if skip_count is not None:
        src_tgt_dataset = src_tgt_dataset.skip(skip_count)

    # Tokenize both lines.
    src_tgt_dataset = src_tgt_dataset.map(
        lambda src, tgt: (tf.string_split([src]).values,
                          tf.string_split([tgt]).values),
        num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)

    # Filter zero length input sequences.
    src_tgt_dataset = src_tgt_dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))

    # Filter oversized input sequences (542 examples are filtered).
    # NOTE(review): this path does arithmetic on src_max_len / tgt_max_len,
    # so both must be set when filter_oversized_sequences is true.
    if filter_oversized_sequences:
        src_tgt_dataset = src_tgt_dataset.filter(
            lambda src, tgt: tf.logical_and(
                tf.size(src) <= src_max_len - 2,
                tf.size(tgt) <= tgt_max_len - 1))

    # Truncate; the source leaves room for the two marker ids added below.
    if src_max_len:
        src_tgt_dataset = src_tgt_dataset.map(
            lambda src, tgt: (src[:src_max_len - 2], tgt),
            num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
    if tgt_max_len:
        src_tgt_dataset = src_tgt_dataset.map(
            lambda src, tgt: (src, tgt[:tgt_max_len]),
            num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
    # Convert the word strings to ids.  Word strings that are not in the
    # vocab get the lookup table's default_value integer.
    src_tgt_dataset = src_tgt_dataset.map(
        lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32),
                          tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)),
        num_parallel_calls=num_parallel_calls)

    src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size)
    # Create a tgt_input prefixed with <sos> and a tgt_output suffixed with <eos>.
    # The source is wrapped as well: tgt_sos_id in front, src_eos_id behind.
    src_tgt_dataset = src_tgt_dataset.map(lambda src, tgt: (tf.concat(
        ([tgt_sos_id], src, [src_eos_id]), 0), tf.concat(
            ([tgt_sos_id], tgt), 0), tf.concat((tgt, [tgt_eos_id]), 0)),
                                          num_parallel_calls=num_parallel_calls
                                         ).prefetch(output_buffer_size)
    # Add in sequence lengths.
    src_tgt_dataset = src_tgt_dataset.map(
        lambda src, tgt_in, tgt_out: (src, tgt_in, tgt_out, tf.size(src),
                                      tf.size(tgt_in)),
        num_parallel_calls=num_parallel_calls)

    if return_raw:

        def map_fn(src, tgt_in, tgt_out, src_len, tgt_len):
            """Pad the dataset and emit the bucket id as key."""
            # Pad all three sequences to their fixed maximum length.
            src = tf.pad(src, [[0, src_max_len - tf.size(src)]],
                         constant_values=src_eos_id)
            tgt_in = tf.pad(tgt_in, [[0, tgt_max_len - tf.size(tgt_in)]],
                            constant_values=tgt_eos_id)
            tgt_out = tf.pad(tgt_out, [[0, tgt_max_len - tf.size(tgt_out)]],
                             constant_values=tgt_eos_id)
            bucket_width = (src_max_len + num_buckets - 1) // num_buckets
            bucket_id = tf.cast(
                tf.minimum(
                    num_buckets,
                    tf.maximum(src_len // bucket_width,
                               tgt_len // bucket_width)), tf.int32)
            # One flat row: src | tgt_in | tgt_out | src_len | tgt_len | bucket.
            return tf.concat([
                src, tgt_in, tgt_out,
                tf.reshape(src_len, [1]),
                tf.reshape(tgt_len, [1]),
                tf.reshape(bucket_id, [1])
            ], 0)

        src_tgt_dataset = src_tgt_dataset.map(
            map_fn, num_parallel_calls=num_parallel_calls)
        return src_tgt_dataset.batch(1024)

    src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size)
    # Cache the tokenized examples before the shuffle/repeat stage.
    src_tgt_dataset = src_tgt_dataset.cache()
    # TODO(saeta): investigate shuffle_and_repeat.
    src_tgt_dataset = src_tgt_dataset.shuffle(
        output_buffer_size, random_seed, reshuffle_each_iteration).repeat()

    # Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...)
    def batching_func(x):
        return x.padded_batch(
            batch_size,
            # The first three entries are the source and target line rows;
            # they are padded to src_max_len / tgt_max_len. The last two
            # entries are the source and target row sizes; these are scalars.
            padded_shapes=(
                tf.TensorShape([src_max_len]),  # src
                tf.TensorShape([tgt_max_len]),  # tgt_input
                tf.TensorShape([tgt_max_len]),  # tgt_output
                tf.TensorShape([]),  # src_len
                tf.TensorShape([])),  # tgt_len
            # Pad the source and target sequences with eos tokens.
            # (Though notice we don't generally need to do this since
            # later on we will be masking out calculations past the true
            # sequence.)
            padding_values=(
                src_eos_id,  # src
                tgt_eos_id,  # tgt_input
                tgt_eos_id,  # tgt_output
                0,  # src_len -- unused
                0),  # tgt_len -- unused
            # For TPU, must set drop_remainder to True or batch size will be
            # None
            drop_remainder=True)

    if num_buckets > 1:

        def key_func(unused_1, unused_2, unused_3, src_len, tgt_len):
            """Calculate bucket_width by maximum source sequence length."""
            # Pairs with length [0, bucket_width) go to bucket 0, length
            # [bucket_width, 2 * bucket_width) go to bucket 1, etc.  The
            # bucket id is capped at num_buckets by the tf.minimum below.
            if src_max_len:
                bucket_width = (src_max_len + num_buckets - 1) // num_buckets
            else:
                bucket_width = 10
            # Bucket sentence pairs by the length of their source sentence
            # and target sentence.
            bucket_id = tf.maximum(src_len // bucket_width,
                                   tgt_len // bucket_width)
            return tf.to_int64(tf.minimum(num_buckets, bucket_id))

        def reduce_func(unused_key, windowed_data):
            return batching_func(windowed_data)

        # NOTE(review): windows hold global_batch_size elements while
        # batching_func batches by batch_size -- confirm this is intended.
        batched_dataset = src_tgt_dataset.apply(
            tf.data.experimental.group_by_window(
                key_func=key_func,
                reduce_func=reduce_func,
                window_size=global_batch_size))
    else:
        batched_dataset = batching_func(src_tgt_dataset)

    # Make_one_shot_iterator is not applicable here since we have lookup table.
    # Instead return a tf.data.dataset and let TpuEstimator to initialize and
    # make iterator out of it.
    batched_dataset = batched_dataset.map(
        lambda src, tgt_in, tgt_out, source_size, tgt_in_size: ({
            "source": src,
            "target_input": tgt_in,
            "target_output": tgt_out,
            "source_sequence_length": source_size,
            "target_sequence_length": tgt_in_size
        }))
    return batched_dataset
def train_step(self):
    """Builds one distributed training step and its summaries.

    Runs `step_fn` on every replica through `self.strategy.run`, then
    aggregates the per-replica results.

    Returns:
        A list `[net_loss, meta_loss, mean_acc, mean_metaacc, summary,
        weights]`. The losses and accuracies are replica means, `summary`
        is the merged summary op, and `weights` is the concatenation of the
        per-replica example weights.
    """

    def step_fn(inputs):
        """Step function."""
        net = self.net
        train_batch, probe_batch = inputs
        images, labels = train_batch
        self.probe_images, self.probe_labels = probe_batch
        self.images = images
        self.labels = labels

        logits = net(images, name='model', reuse=tf.AUTO_REUSE, training=True)
        self.logits = logits

        # Per-example loss; the example weights come from meta optimization.
        per_example_cost = tf.losses.sparse_softmax_cross_entropy(
            labels, logits, reduction=tf.losses.Reduction.NONE)
        weight, meta_loss, meta_acc = self.meta_optimize(per_example_cost)

        weighted_cost = tf.reduce_sum(
            tf.math.multiply(per_example_cost, weight))
        total_loss = weighted_cost + net.regularization_loss
        # rescale by gpus
        net_loss = total_loss / self.strategy.num_replicas_in_sync

        grads = tf.gradients(net_loss, net.trainable_variables)
        minimizer_op = self.optimizer.apply_gradients(
            zip(grads, net.trainable_variables),
            global_step=self.global_step)

        # The EMA update is only part of the step when the flag is set.
        if FLAGS.use_ema:
            ema_ops = [self.ema.apply(net.trainable_variables)]
        else:
            ema_ops = []
        optimizer_op = tf.group([net.updates, minimizer_op] + ema_ops)

        acc_op, acc_update_op = self.acc_func(
            labels, tf.argmax(logits, axis=1))

        # Fetching any output forces the update ops to run first.
        with tf.control_dependencies([optimizer_op, acc_update_op]):
            outputs = (net_loss, meta_loss, meta_acc, acc_op, weight, labels)
            return tuple(tf.identity(tensor) for tensor in outputs)

    # end of parallel
    per_replica = self.strategy.run(
        step_fn, args=(next(self.train_input_iterator),))
    (pr_net_loss, pr_metaloss, pr_metaacc, pr_acc, pr_weight,
     pr_labels) = per_replica

    # collect device variables
    weights = tf.concat(self.strategy.unwrap(pr_weight), axis=0)
    labels = tf.concat(self.strategy.unwrap(pr_labels), axis=0)

    mean_reduce = tf.distribute.ReduceOp.MEAN
    mean_acc = self.strategy.reduce(mean_reduce, pr_acc)
    mean_metaacc = self.strategy.reduce(mean_reduce, pr_metaacc)
    net_loss = self.strategy.reduce(mean_reduce, pr_net_loss)
    meta_loss = self.strategy.reduce(mean_reduce, pr_metaloss)

    merges = [
        tf.summary.scalar('acc/train', mean_acc),
        tf.summary.scalar('loss/net', net_loss),
        tf.summary.scalar('loss/meta', meta_loss),
        tf.summary.scalar('acc/meta', mean_metaacc),
    ]

    # Fraction of examples whose weight is <= 0.
    zw_inds = tf.squeeze(
        tf.where(tf.less_equal(weights, 0), name='zero_weight_index'))
    zero_ratio = tf.math.divide(
        tf.cast(tf.size(zw_inds), tf.float32),
        tf.cast(tf.size(weights), tf.float32))
    merges.append(tf.summary.scalar('weights/zeroratio', zero_ratio))

    self.epoch_var = tf.cast(
        self.global_step / self.iter_epoch, tf.float32, name='epoch')
    merges.append(tf.summary.scalar('epoch', self.epoch_var))
    merges.append(tf.summary.scalar('learningrate', self.learning_rate))
    summary = tf.summary.merge(merges)

    return [net_loss, meta_loss, mean_acc, mean_metaacc, summary, weights]
def get_infer_iterator(dataset_data,
                       dataset_kb,
                       vocab_table,
                       batch_size,
                       eod,
                       len_action,
                       output_buffer_size=None,
                       skip_count=None,
                       num_shards=1,
                       shard_index=0,
                       self_play=False):
    """can be used to generate inference or self play iterators.

    Zips the dialogue and knowledge-base datasets, drops empty pairs, runs
    the mode-specific extraction and processing functions and pads every
    field into fixed-size batches.

    Args:
        dataset_data: dataset of dialogue entries.
        dataset_kb: dataset of knowledge-base entries, zipped with
            `dataset_data`.
        vocab_table: lookup table; used to resolve `eod` and forwarded to
            the entry-processing function.
        batch_size: batch size passed to `padded_batch`.
        eod: token string whose id is used as the padding value.
        len_action: fixed length of the action-related fields.
        output_buffer_size: accepted for interface parity; this function
            only normalizes it and does not otherwise use it.
        skip_count: optional number of leading examples to skip.
        num_shards: number of shards passed to `Dataset.shard`.
        shard_index: index of this shard.
        self_play: selects the self-play helpers instead of the inference
            helpers.

    Returns:
        An initializable iterator over the padded batches.
    """
    if not output_buffer_size:
        output_buffer_size = batch_size * 1000

    # for padding
    eod_id = tf.cast(vocab_table.lookup(tf.constant(eod)), tf.int32)

    def _both_non_empty(data, kb):
        return tf.logical_and(tf.size(data) > 0, tf.size(kb) > 0)

    dataset = tf.data.Dataset.zip((dataset_data, dataset_kb)).shard(
        num_shards, shard_index)
    if skip_count is not None:
        dataset = dataset.skip(skip_count)

    # do not shuffle iterate on inference and self play mode
    # data is shuffled outside of iterator
    dataset = dataset.filter(_both_non_empty)

    if self_play:
        extract_fn = get_sub_items_self_play
        process_fn = partial(process_entry_self_play, vocab_table=vocab_table)
    else:
        extract_fn = get_sub_items_infer
        process_fn = partial(process_entry_infer, vocab_table=vocab_table)
    dataset = dataset.map(extract_fn).map(process_fn)

    padded_shapes = (
        tf.TensorShape([None]),  # intent
        tf.TensorShape([]),  # intent_len
        tf.TensorShape([None]),  # source dialogue
        tf.TensorShape([None]),  # target dialogue
        tf.TensorShape([]),  # dialogue_len
        tf.TensorShape([len_action]),  # predicted action
        tf.TensorShape([]),  # action_len
        tf.TensorShape([len_action]),  # truth action
        tf.TensorShape([None]),  # reward diag
        tf.TensorShape([len_action]),  # reward action
        tf.TensorShape([None]),  # kb
        tf.TensorShape([]),  # kb_len
        tf.TensorShape([None]),  # mask1
        tf.TensorShape([None]),  # mask2
        tf.TensorShape([None]),  # turn_point
    )
    # One padding value per entry of `padded_shapes`, in the same order.
    padding_values = (
        eod_id,  # intent
        0,  # intent_len
        eod_id,  # source dialogue
        eod_id,  # target dialogue
        0,  # dialogue_len
        eod_id,  # predicted action
        0,  # action_len
        eod_id,  # truth action
        0.0,  # reward diag
        0.0,  # reward action
        eod_id,  # kb
        0,  # kb_len
        False,  # mask1
        False,  # mask2
        0.0,  # turn_point
    )
    batched_dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=padded_shapes,
        padding_values=padding_values)

    return tf.data.make_initializable_iterator(batched_dataset)
def run_box_to_gaussian(logdir, verbose=False):
    """Run a box-blur-to-Gaussian-blur demonstration.

    See the summary description for more details.

    Resets the default graph, builds a graph that repeatedly convolves the
    demo image with a normalized box filter, and writes one image summary
    (plus run metadata) per blur radius 0..7 under
    `<logdir>/box_to_gaussian`.

    Arguments:
      logdir: Directory into which to write event logs.
      verbose: Boolean; whether to log any output.
    """
    if verbose:
        logger.info("--- Starting run: box_to_gaussian")

    tf.reset_default_graph()
    tf.set_random_seed(0)

    image = get_image(verbose=verbose)
    blur_radius = tf.placeholder(shape=(), dtype=tf.int32)
    with tf.name_scope("filter"):
        blur_side_length = blur_radius * 2 + 1
        pixel_filter = tf.ones((blur_side_length, blur_side_length))
        pixel_filter = pixel_filter / tf.cast(
            tf.size(input=pixel_filter), tf.float32)  # normalize

    # images[0] is the original; images[i] has the box filter applied i times.
    iterations = 4
    images = [tf.cast(image, tf.float32) / 255.0]
    for _ in range(iterations):
        images.append(convolve(images[-1], pixel_filter))
    with tf.name_scope("convert_to_uint8"):
        images = tf.stack([
            tf.cast(255 * tf.clip_by_value(image_, 0.0, 1.0), tf.uint8)
            for image_ in images
        ])

    summ = image_summary.op(
        "box_to_gaussian",
        images,
        max_outputs=iterations,
        display_name="Gaussian blur as a limit process of box blurs",
        description=(
            "Demonstration of forming a Gaussian blur by "
            "composing box blurs, each of which can be expressed "
            "as a 2D convolution.\n\n"
            "A Gaussian blur is formed by convolving a Gaussian "
            "kernel over an image. But a Gaussian kernel is "
            "itself the limit of convolving a constant kernel "
            "with itself many times. Thus, while applying "
            "a box-filter convolution just once produces "
            "results that are noticeably different from those "
            "of a Gaussian blur, repeating the same convolution "
            "just a few times causes the result to rapidly "
            "converge to an actual Gaussian blur.\n\n"
            "Here, the step value controls the blur radius, "
            "and the image sample controls the number of times "
            "that the convolution is applied (plus one). "
            "So, when *sample*=1, the original image is shown; "
            "*sample*=2 shows a box blur; and a hypothetical "
            "*sample*=∞ would show a true Gaussian blur.\n\n"
            "This is one ingredient in a recipe to compute very "
            "fast Gaussian blurs. The other pieces require "
            "special treatment for the box blurs themselves "
            "(decomposition to dual one-dimensional box blurs, "
            "each of which is computed with a sliding window); "
            "we don’t perform those optimizations here.\n\n"
            "[Here are some slides describing the full process.]"
            "(%s)\n\n"
            "%s" % (
                "http://elynxsdk.free.fr/ext-docs/Blur/Fast_box_blur.pdf",
                IMAGE_CREDIT,
            )),
    )

    with tf.Session() as sess:
        sess.run(image.initializer)
        writer = tf.summary.FileWriter(os.path.join(logdir, "box_to_gaussian"))
        # Close (and thereby flush) the writer even if a step fails, so the
        # event file handle is not leaked and earlier steps are persisted.
        try:
            writer.add_graph(sess.graph)
            # The trace options are identical for every step; only the
            # metadata proto must be fresh per run.
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            for step in range(8):
                if verbose:
                    logger.info("--- box_to_gaussian: step: %s", step)
                run_metadata = config_pb2.RunMetadata()
                s = sess.run(
                    summ,
                    feed_dict={blur_radius: step},
                    options=run_options,
                    run_metadata=run_metadata,
                )
                writer.add_summary(s, global_step=step)
                writer.add_run_metadata(run_metadata, "step_%04d" % step)
        finally:
            writer.close()
def get_iterator(dataset_data,
                 dataset_kb,
                 vocab_table,
                 batch_size,
                 t1,
                 t2,
                 eod,
                 len_action,
                 random_seed,
                 num_buckets,
                 max_dialogue_len=None,
                 output_buffer_size=None,
                 skip_count=None,
                 num_shards=1,
                 shard_index=0):
    """can be used to generate supervised learning iterators.

    Zips the dialogue and knowledge-base datasets, shuffles, drops empty
    pairs, runs the supervised extraction/processing functions and pads
    every field into batches, optionally bucketed by dialogue length.

    Args:
        dataset_data: dataset of dialogue entries.
        dataset_kb: dataset of knowledge-base entries, zipped with
            `dataset_data`.
        vocab_table: lookup table used to resolve `eod`, `t1`, `t2` and
            forwarded to the entry-processing function.
        batch_size: batch size for `padded_batch`; also the bucketing
            window size.
        t1: token string whose id is forwarded as `t1_id`.
        t2: token string whose id is forwarded as `t2_id`.
        eod: token string whose id is used as the padding value.
        len_action: fixed length of the action-related fields.
        random_seed: seed passed to `shuffle`.
        num_buckets: when greater than 1, examples are grouped into
            dialogue-length buckets before batching.
        max_dialogue_len: optional maximum dialogue length used to derive
            the bucket width. When unset, a bucket width of 10 is used.
        output_buffer_size: shuffle buffer size; defaults to
            `batch_size * 1000`.
        skip_count: optional number of leading examples to skip.
        num_shards: number of shards passed to `Dataset.shard`.
        shard_index: index of this shard.

    Returns:
        An initializable iterator over the padded batches.
    """
    if not output_buffer_size:
        output_buffer_size = batch_size * 1000

    eod_id = tf.cast(vocab_table.lookup(tf.constant(eod)), tf.int32)
    t1_id = tf.cast(vocab_table.lookup(tf.constant(t1)), tf.int32)
    t2_id = tf.cast(vocab_table.lookup(tf.constant(t2)), tf.int32)

    combined_dataset = tf.data.Dataset.zip((dataset_data, dataset_kb))
    combined_dataset = combined_dataset.shard(num_shards, shard_index)
    if skip_count is not None:
        combined_dataset = combined_dataset.skip(skip_count)

    combined_dataset = combined_dataset.shuffle(output_buffer_size,
                                                random_seed)

    combined_dataset = combined_dataset.filter(
        lambda data, kb: tf.logical_and(tf.size(data) > 0, tf.size(kb) > 0))

    combined_dataset = combined_dataset.map(get_sub_items_supervised)
    combined_dataset = combined_dataset.map(
        partial(process_entry_supervised,
                vocab_table=vocab_table,
                t1_id=t1_id,
                t2_id=t2_id))

    def batching_func(x):
        return x.padded_batch(
            batch_size,
            padded_shapes=(
                tf.TensorShape([None]),  # intent
                tf.TensorShape([]),  # intent_len
                tf.TensorShape([None]),  # source dialogue
                tf.TensorShape([None]),  # target dialogue
                tf.TensorShape([]),  # dialogue_len
                tf.TensorShape([len_action]),  # action
                tf.TensorShape([]),  # action_len
                tf.TensorShape([len_action]),  # pred_action
                tf.TensorShape([None]),  # reward_diag
                tf.TensorShape([len_action]),  # reward_action
                tf.TensorShape([None]),  # kb
                tf.TensorShape([]),  # kb_len
                tf.TensorShape([None]),  # mask1
                tf.TensorShape([None]),  # mask2
                tf.TensorShape([None]),  # turn_point
            ),
            # One padding value per entry of `padded_shapes`, same order.
            padding_values=(
                eod_id,  # intent
                0,  # intent_len
                eod_id,  # source dialogue
                eod_id,  # target dialogue
                0,  # dialogue_len
                eod_id,  # action
                0,  # action_len
                eod_id,  # pred_action
                0.0,  # reward_diag
                0.0,  # reward_action
                eod_id,  # kb
                0,  # kb_len
                False,  # mask1
                False,  # mask2
                0.0))  # turn_point

    if num_buckets > 1:
        # `max_dialogue_len` defaults to None; without a fallback the width
        # computation would raise TypeError. Use a fixed width of 10 in that
        # case, mirroring the bucketing fallback of the translation
        # `get_iterator`.
        if max_dialogue_len:
            bucket_width = (max_dialogue_len + num_buckets - 1) // num_buckets
        else:
            bucket_width = 10

        def key_func(unused_1, unused_2, unused_3, unused_4, dialogue_len,
                     unused_6, unused_7, unused_8, unused_9, unused_10,
                     unused_11, unused_12, unused_13, unused_14, unused_15):
            # Dialogues of length [0, bucket_width) go to bucket 0, and so
            # on; the bucket id is capped at num_buckets.
            bucket_id = dialogue_len // bucket_width
            return tf.to_int64(tf.minimum(num_buckets, bucket_id))

        def reduce_func(unused_key, windowed_data):
            return batching_func(windowed_data)

        batched_dataset = combined_dataset.apply(
            contrib.data.group_by_window(key_func=key_func,
                                         reduce_func=reduce_func,
                                         window_size=batch_size))
    else:
        batched_dataset = batching_func(combined_dataset)

    batched_iter = tf.data.make_initializable_iterator(batched_dataset)
    return batched_iter
def real_svg_top(body_output,
                 unused_targets,
                 model_hparams,
                 unused_vocab_size,
                 hard=False):
    """Run the Mixture Density Network head over the LSTM outputs.

    Args:
      body_output: outputs from LSTM with shape [batch, seqlen, 1, hidden_size]
      unused_targets: ground-truth SVG output; not used here.
      model_hparams: hyper-parameters; `num_mixture` is always read, while
        `mode`, `mix_temperature` and `gauss_temperature` are read on the
        sampling path.
      unused_vocab_size: not used here.
      hard: when True, sample concrete outputs (as in predict mode) instead
        of returning every MDN component.

    Returns:
      The MDN output. Shape [batch, seqlen, 1, 10] when sampling (predict
      mode or hard=True), otherwise [batch, seqlen, 1, 4 + 6 * num_mix * 3].
    """
    # 4 command states, plus a mixture of gaussians (3 values per mixture
    # component) for each of the 6 arguments.
    mixture_count = model_hparams.num_mixture
    output_width = 4 + 6 * mixture_count * 3

    # The 'hard' option exists so this top can be invoked from within a body.
    with tf.variable_scope('real_top', reuse=tf.AUTO_REUSE):
        ret = tf.layers.dense(body_output, output_width, name='top')
        batch_size = common_layers.shape_list(ret)[0]

        sampling = (hard or
                    model_hparams.mode == tf.estimator.ModeKeys.PREDICT)
        if not sampling:
            return ret

        temperature = model_hparams.mix_temperature

        def tempered_probs(logits):
            """Softmax over the last axis after dividing by `temperature`."""
            scaled = tf.identity(logits) / temperature
            unnormalized = tf.exp(
                scaled - tf.reduce_max(scaled, axis=[-1], keepdims=True))
            return unnormalized / tf.reduce_sum(
                unnormalized, axis=[-1], keepdims=True)

        # Draw one command per position (a hard choice, not soft probs) and
        # turn the resulting [batch, seq, 1] indices into one-hot vectors.
        command_probs = tempered_probs(ret[:, :, :, :4])
        command_idx = tf.distributions.Categorical(
            probs=command_probs).sample()
        command = tf.one_hot(command_idx, 4)

        # The remaining channels are [batch, seq, 1, 6*3*num_mix]; flatten
        # them to [batch * seq * 6, 3*num_mix] before splitting coefficients.
        flat_args = tf.reshape(ret[:, :, :, 4:], [-1, 3 * mixture_count])
        out_logmix, out_mean, out_logstd = _get_mdn_coef(flat_args)

        # Pick one mixture component per row from the tempered mixture
        # weights.
        mix_probs = tempered_probs(out_logmix)
        component = tf.distributions.Categorical(probs=mix_probs).sample()
        component = tf.cast(component, tf.int32)
        component = tf.reshape(component, [-1])

        # (row, component) pairs for gather_nd.
        gather_idx = tf.stack(
            [tf.range(tf.size(component)), component], axis=-1)
        chosen_mean = tf.gather_nd(out_mean, gather_idx)
        chosen_logstd = tf.gather_nd(out_logstd, gather_idx)

        # Sample from the selected gaussian, with temperature-scaled noise.
        noise = (tf.random.normal(tf.shape(chosen_mean)) *
                 tf.sqrt(model_hparams.gauss_temperature))
        sampled_args = chosen_mean + tf.exp(chosen_logstd) * noise
        sampled_args = tf.reshape(sampled_args, [batch_size, -1, 1, 6])

        # Final output: the one-hot command followed by its 6 arguments.
        return tf.concat([command, sampled_args], axis=-1)
def process_data(object_str, vocab_table):
    """Preliminary processing of dialogue data.

    Args:
      object_str: string to be split into tokens by `tf.string_split`.
      vocab_table: table whose `lookup` maps the tokens to ids.

    Returns:
      A pair `(ids, length)`: the looked-up ids cast to int32, and the
      number of ids as given by `tf.size`.
    """
    tokens = tf.string_split([object_str]).values
    token_ids = vocab_table.lookup(tokens)
    token_ids = tf.cast(token_ids, tf.int32)
    return token_ids, tf.size(token_ids)
def num_ignored_columns(self):
    """Return `tf.size` of `self.ignored_column_indices()`."""
    ignored_indices = self.ignored_column_indices()
    return tf.size(ignored_indices)
def _axis_size(x, axis=None):
    """Count the elements of `x` along `axis`, returned with dtype `x.dtype`.

    With `axis=None` the total number of elements is used; otherwise the
    product of the dimensions of `x` selected by `axis`.
    """
    if axis is not None:
        selected_dims = tf.gather(tf.shape(x), axis)
        count = tf.reduce_prod(selected_dims)
    else:
        count = tf.size(x)
    return tf.cast(count, x.dtype)
def decode(self, tf_example_string_tensor):
    """Decode a serialized tensorflow example into a dictionary of tensors.

    Args:
      tf_example_string_tensor: a string tensor holding a serialized
        tensorflow example proto.

    Returns:
      A dictionary keyed by `fields.InputDataFields` names.

      Always present:
        image - 3D uint8 tensor of shape [None, None, 3] containing image.
        original_image_spatial_shape - 1D int32 tensor of shape [2]
          containing shape of the image.
        source_id - string tensor containing original image id.
        key - string tensor with unique sha256 hash key.
        filename - string tensor with original dataset filename.
        groundtruth_boxes - 2D float32 tensor of shape [None, 4] containing
          box corners.
        groundtruth_classes - 1D int64 tensor of shape [None] containing
          classes for the boxes.
        groundtruth_weights - 1D float32 tensor of shape [None] indicating
          the weights of groundtruth boxes.
        groundtruth_area - 1D float32 tensor of shape [None] containing
          object mask area in pixel squared.
        groundtruth_is_crowd - 1D bool tensor of shape [None] indicating if
          the boxes enclose a crowd.

      Optional:
        groundtruth_image_confidences - 1D float tensor of shape [None]
          indicating if a class is present in the image (1.0) or a class is
          not present in the image (0.0).
        image_additional_channels - 3D uint8 tensor of shape
          [None, None, num_additional_channels]. 1st dim is height; 2nd dim
          is width; 3rd dim is the number of additional channels.
        groundtruth_difficult - 1D bool tensor of shape [None] indicating if
          the boxes represent `difficult` instances.
        groundtruth_group_of - 1D bool tensor of shape [None] indicating if
          the boxes represent `group_of` instances.
        groundtruth_keypoints - 3D float32 tensor of shape
          [None, num_keypoints, 2] containing keypoints, where the
          coordinates of the keypoints are ordered (y, x).
        groundtruth_keypoint_visibilities - 2D bool tensor of shape
          [None, num_keypoints] containing keypoint visibilities.
        groundtruth_instance_masks - 3D float32 tensor of shape
          [None, None, None] containing instance masks.
        groundtruth_image_classes - 1D int64 of shape [None] containing
          classes for the boxes.
        multiclass_scores - 1D float32 tensor of shape
          [None * num_classes] containing flattened multiclass scores for
          groundtruth boxes.
        context_features - 1D float32 tensor of shape
          [context_feature_length * num_context_features]
        context_feature_length - int32 tensor specifying the length of each
          feature in context_features
    """
    input_fields = fields.InputDataFields

    serialized_example = tf.reshape(tf_example_string_tensor, shape=[])
    decoder = slim_example_decoder.TFExampleDecoder(
        self.keys_to_features, self.items_to_handlers)
    item_names = decoder.list_items()
    decoded = decoder.decode(serialized_example, items=item_names)
    tensor_dict = dict(zip(item_names, decoded))

    crowd_key = input_fields.groundtruth_is_crowd
    tensor_dict[crowd_key] = tf.cast(tensor_dict[crowd_key], dtype=tf.bool)

    # Pin the channel count, then record the image's first two dimensions.
    image = tensor_dict[input_fields.image]
    image.set_shape([None, None, 3])
    tensor_dict[input_fields.original_image_spatial_shape] = (
        tf.shape(image)[:2])

    extra_channels_key = input_fields.image_additional_channels
    if extra_channels_key in tensor_dict:
        # Drop axis 3 and move the leading axis to the end.
        extra = tf.squeeze(tensor_dict[extra_channels_key], axis=3)
        tensor_dict[extra_channels_key] = tf.transpose(extra, perm=[1, 2, 0])

    weights_key = input_fields.groundtruth_weights

    def default_groundtruth_weights():
        """All-ones weights, one per groundtruth box."""
        num_boxes = tf.shape(tensor_dict[input_fields.groundtruth_boxes])[0]
        return tf.ones([num_boxes], dtype=tf.float32)

    # Keep decoded weights when there are any; otherwise default to ones.
    tensor_dict[weights_key] = tf.cond(
        tf.greater(tf.shape(tensor_dict[weights_key])[0], 0),
        lambda: tensor_dict[weights_key],
        default_groundtruth_weights)

    keypoints_key = input_fields.groundtruth_keypoints
    if keypoints_key in tensor_dict:
        # Set all keypoints that are not labeled to NaN.
        visibility_key = input_fields.groundtruth_keypoint_visibilities
        visible = tf.tile(
            tf.expand_dims(tensor_dict[visibility_key], -1), [1, 1, 2])
        keypoints = tensor_dict[keypoints_key]
        tensor_dict[keypoints_key] = tf.where(
            visible, keypoints, np.nan * tf.ones_like(keypoints))

    if self._expand_hierarchy_labels:
        image_classes, image_confidences = (
            self._expand_image_label_hierarchy(
                tensor_dict[input_fields.groundtruth_image_classes],
                tensor_dict[input_fields.groundtruth_image_confidences]))
        tensor_dict[input_fields.groundtruth_image_classes] = image_classes
        tensor_dict[input_fields.groundtruth_image_confidences] = (
            image_confidences)

        box_fields = (
            input_fields.groundtruth_group_of,
            input_fields.groundtruth_is_crowd,
            input_fields.groundtruth_difficult,
            input_fields.groundtruth_area,
            input_fields.groundtruth_boxes,
            input_fields.groundtruth_weights,
        )

        def expand_field(field_name):
            """Expand one per-box field using the groundtruth classes."""
            return self._expansion_box_field_labels(
                tensor_dict[input_fields.groundtruth_classes],
                tensor_dict[field_name])

        # Per-box fields are expanded only when non-empty. The loop variable
        # is bound through a lambda default so each branch sees its own
        # field.
        for field in box_fields:
            if field not in tensor_dict:
                continue
            tensor_dict[field] = tf.cond(
                tf.size(tensor_dict[field]) > 0,
                lambda name=field: expand_field(name),
                lambda name=field: tensor_dict[name])

        # Classes are expanded last because the per-box expansion above
        # reads the unexpanded classes.
        tensor_dict[input_fields.groundtruth_classes] = (
            self._expansion_box_field_labels(
                tensor_dict[input_fields.groundtruth_classes],
                tensor_dict[input_fields.groundtruth_classes], True))

    group_of_key = input_fields.groundtruth_group_of
    if group_of_key in tensor_dict:
        tensor_dict[group_of_key] = tf.cast(
            tensor_dict[group_of_key], dtype=tf.bool)

    dp_num_points_key = input_fields.groundtruth_dp_num_points
    if dp_num_points_key in tensor_dict:
        tensor_dict[dp_num_points_key] = tf.cast(
            tensor_dict[dp_num_points_key], dtype=tf.int32)
        dp_part_ids_key = input_fields.groundtruth_dp_part_ids
        tensor_dict[dp_part_ids_key] = tf.cast(
            tensor_dict[dp_part_ids_key], dtype=tf.int32)

    track_ids_key = input_fields.groundtruth_track_ids
    if track_ids_key in tensor_dict:
        tensor_dict[track_ids_key] = tf.cast(
            tensor_dict[track_ids_key], dtype=tf.int32)

    return tensor_dict
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            num_classes,
                            min_score_thresh=0.2,
                            max_boxes_to_draw=50,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=False):
    """Generates detections with model outputs and anchors.

    Args:
      cls_outputs: a numpy array with shape [N, 1], which has the highest
        class scores on all feature levels. The N is the number of selected
        top-K total anchors on all levels. (k being MAX_DETECTION_POINTS)
      box_outputs: a numpy array with shape [N, 4], which stacks box
        regression outputs on all feature levels. The N is the number of
        selected top-k total anchors on all levels. (k being
        MAX_DETECTION_POINTS)
      anchor_boxes: a numpy array with shape [N, 4], which stacks anchors on
        all feature levels. The N is the number of selected top-k total
        anchors on all levels.
      indices: a numpy array with shape [N], which is the indices from top-k
        selection.
      classes: a numpy array with shape [N], which represents the class
        prediction on all selected anchors from top-k selection.
      image_id: an integer number to specify the image id.
      image_scale: a float tensor representing the scale between original
        image and input image for the detector. It is used to rescale
        detections for evaluating with the original groundtruth annotations.
      num_classes: an integer that indicates the number of classes.
      min_score_thresh: A float representing the threshold for deciding when
        to remove boxes based on score. Only forwarded to the native NMS op.
      max_boxes_to_draw: Max number of boxes to draw.
      soft_nms_sigma: A scalar float representing the Soft NMS sigma
        parameter; See Bodla et al, https://arxiv.org/abs/1704.04503). When
        `soft_nms_sigma=0.0` (which is default), we fall back to standard
        (hard) NMS. Only forwarded to the native NMS op.
      iou_threshold: A float representing the threshold for deciding whether
        boxes overlap too much with respect to IOU.
      use_native_nms: a bool that indicates whether to use native nms.

    Returns:
      detections: a float32 tensor with 7 columns per row,
        [image_id, b1, b0, b3 - b1, b2 - b0, score, class], where b0..b3 are
        the columns of the decoded boxes and the four box-derived values are
        multiplied by `image_scale`. Classes are 1-based. Rows are sorted by
        descending score and truncated to `max_boxes_to_draw`.
        NOTE(review): which of these is x/y/width/height depends on the
        column order produced by `decode_box_outputs_tf` - confirm there.
    """
    anchor_boxes = tf.gather(anchor_boxes, indices)

    scores = tf.math.sigmoid(cls_outputs)
    # apply bounding box regression to anchors
    boxes = decode_box_outputs_tf(
        tf.transpose(box_outputs, [1, 0]),
        tf.transpose(anchor_boxes, [1, 0]))

    def _else(detections, class_id, cls_indices):
        """Else branch: append the NMS survivors of one class."""
        boxes_cls = tf.gather(boxes, cls_indices)
        scores_cls = tf.gather(scores, cls_indices)
        # Select top-scoring boxes in each class and apply non-maximum
        # suppression (nms) for boxes in the same class. The selected boxes
        # from each class are then concatenated for the final detection
        # outputs.
        if use_native_nms:
            top_detection_idx, scores_cls = (
                tf.image.non_max_suppression_with_scores(
                    boxes_cls,
                    scores_cls,
                    max_boxes_to_draw,
                    iou_threshold=iou_threshold,
                    score_threshold=min_score_thresh,
                    soft_nms_sigma=soft_nms_sigma))
            scores_cls = tf.expand_dims(scores_cls, axis=1)
            boxes_cls = tf.gather(boxes_cls, top_detection_idx)
            top_detections_cls = tf.concat([boxes_cls, scores_cls], axis=1)
        else:
            scores_cls = tf.expand_dims(scores_cls, axis=1)
            all_detections_cls = tf.concat([boxes_cls, scores_cls], axis=1)
            top_detection_idx = nms_tf(all_detections_cls, iou_threshold)
            top_detections_cls = tf.gather(all_detections_cls,
                                           top_detection_idx)

        # Convert the two box corners into a corner plus extents.
        width = top_detections_cls[:, 2] - top_detections_cls[:, 0]
        height = top_detections_cls[:, 3] - top_detections_cls[:, 1]
        top_detections_cls = tf.stack([
            top_detections_cls[:, 1] * image_scale,
            top_detections_cls[:, 0] * image_scale,
            height * image_scale,
            width * image_scale,
            top_detections_cls[:, 4],
        ], axis=-1)

        # Prepend the image id and append the 1-based class id to each row.
        num_kept = tf.size(top_detection_idx)
        top_detections_cls = tf.stack([
            tf.cast(tf.repeat(image_id, num_kept), tf.float32),
            *tf.unstack(top_detections_cls, 5, axis=1),
            tf.repeat(class_id + 1.0, num_kept),
        ], axis=1)

        detections = tf.concat([detections, top_detections_cls], axis=0)
        return detections

    detections = tf.constant([], tf.float32, [0, 7])
    for c in range(num_classes):
        indices_cls = tf.squeeze(tf.where_v2(tf.equal(classes, c)), axis=-1)
        # Skip classes with no candidate anchors. The guard must look at this
        # class's own indices; testing the class-independent top-k `indices`
        # would only ever skip when there are no anchors at all.
        detections = tf.cond(
            tf.equal(tf.size(indices_cls), 0),
            lambda: detections,
            lambda class_id=c, cls_idx=indices_cls: _else(
                detections, class_id, cls_idx))

    # Column -2 holds the score: keep the highest-scoring rows.
    indices_final = tf.argsort(detections[:, -2], direction='DESCENDING')
    detections = tf.gather(
        detections, indices_final[:max_boxes_to_draw], name='detection')
    return detections
def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. Its first two rows are
          [original_height, original_width] and
          [scaled_height, scaled_width]; this method reads row 2 as the
          image scale and row 3 as the crop offset.
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, 4] representing anchor boxes at
          each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, anchors_per_location]. The height_l
          and width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, anchors_per_location * 4]. The
          height_l and width_l represent the dimension of bounding box
          regression output at l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is
          represented in [y1, x1, y2, x2] format. The coordinates are w.r.t
          the scaled image that is fed to the network. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: groundtruth masks cropped by the bounding box and resized
          to a fixed size determined by mask_crop_size. Only present when
          `self._include_mask` is set.
        roi_boxes, gt_visual_feat: the `roi_boxes` and
          `groundtruth_visual_features` inputs after preprocessing, padded
          with -1 to `self._max_num_rois`. Only present when
          `self._visual_feature_distill` is set.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
        masks = data['groundtruth_instance_masks']
    if self._visual_feature_distill:
        roi_boxes = data['roi_boxes']
        distill_features = data['groundtruth_visual_features']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
        num_groundtruths = tf.shape(classes)[0]
        with tf.control_dependencies([num_groundtruths, is_crowds]):
            # When no crowd flags are provided, keep every annotation.
            indices = tf.cond(
                tf.greater(tf.size(is_crowds), 0),
                lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
        classes = tf.gather(classes, indices)
        boxes = tf.gather(boxes, indices)
        if self._include_mask:
            masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Flips image randomly during training. Exactly one flip call is made so
    # that image, boxes, masks and roi_boxes all share the same random
    # decision; a second call would draw a new one and desynchronize
    # roi_boxes from the image.
    if self._aug_rand_hflip:
        if self._visual_feature_distill:
            assert self._include_mask
            image, boxes, masks, roi_boxes = (
                input_utils.random_horizontal_flip(
                    image, boxes, masks, roi_boxes))
        elif self._include_mask:
            image, boxes, masks = input_utils.random_horizontal_flip(
                image, boxes, masks)
        else:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)
    if self._visual_feature_distill:
        roi_boxes = box_utils.denormalize_boxes(roi_boxes, image_shape)
        # filter out roi boxes smaller than given size
        if self._filter_distill_boxes_size > 0:
            roi_indices = box_utils.get_non_empty_box_indices(
                roi_boxes, self._filter_distill_boxes_size)
            roi_boxes = tf.gather(roi_boxes, roi_indices)
            distill_features = tf.gather(distill_features, roi_indices)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(
            self._output_size, 2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)
    if self._visual_feature_distill:
        roi_boxes = input_utils.resize_and_crop_boxes(
            roi_boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
        masks = tf.gather(masks, indices)
        # Transfer boxes to the original image space and do normalization.
        cropped_boxes = boxes + tf.tile(
            tf.expand_dims(offset, axis=0), [1, 2])
        cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
        cropped_boxes = box_utils.normalize_boxes(cropped_boxes, image_shape)
        num_masks = tf.shape(masks)[0]
        masks = tf.image.crop_and_resize(
            tf.expand_dims(masks, axis=-1),
            cropped_boxes,
            box_indices=tf.range(num_masks, dtype=tf.int32),
            crop_size=[self._mask_crop_size, self._mask_crop_size],
            method='bilinear')
        masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.Anchor(
        self._min_level,
        self._max_level,
        self._num_scales,
        self._aspect_ratios,
        self._anchor_size,
        (image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(
        input_anchor,
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        boxes, tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
        image = tf.cast(image, dtype=tf.bfloat16)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
    }
    labels['gt_boxes'] = input_utils.clip_or_pad_to_fixed_size(
        boxes, self._max_num_instances, -1)
    labels['gt_classes'] = input_utils.clip_or_pad_to_fixed_size(
        classes, self._max_num_instances, -1)
    if self._include_mask:
        labels['gt_masks'] = input_utils.clip_or_pad_to_fixed_size(
            masks, self._max_num_instances, -1)
    if self._visual_feature_distill:
        labels['roi_boxes'] = input_utils.clip_or_pad_to_fixed_size(
            roi_boxes, self._max_num_rois, -1)
        labels['gt_visual_feat'] = input_utils.clip_or_pad_to_fixed_size(
            distill_features, self._max_num_rois, -1)

    return image, labels
def _filter_fn(features):
    """Return whether the largest feature has at most `max_encoded_len` elements."""
    feature_sizes = [tf.size(value) for value in features.values()]
    largest_size = tf.reduce_max(tf.stack(feature_sizes, axis=0))
    return tf.less_equal(largest_size, max_encoded_len)