def CreateDenseCoordinates(self, ranges):
  """Builds the coordinates of every point on a dense N-dimensional grid.

  Example: To create (x, y) coordinates over a 10x10 grid with step size 1,
  call CreateDenseCoordinates([(1, 10, 10), (1, 10, 10)]).

  Args:
    ranges: A list of 3-tuples, each containing (min, max, num_steps). Each
      list element corresponds to one dimension, and its values are produced
      by a linspace over [min, max] with num_steps entries.

  Returns:
    tf.float32 tensor of shape [total_points, len(ranges)], where
    total_points = product of all num_steps. The first dimension varies
    slowest and the last dimension varies fastest.
  """
  step_counts = [num_steps for _, _, num_steps in ranges]
  total_points = int(np.prod(step_counts))
  flat_index = tf.range(total_points)

  # `period` is the number of consecutive flat indices that share the same
  # value along the dimension currently being processed.
  period = total_points
  columns = []
  for lower, upper, num_steps in ranges:
    axis_values = tf.lin_space(
        tf.to_float(lower), tf.to_float(upper), tf.to_int32(num_steps))
    period //= num_steps
    axis_index = (flat_index // period) % num_steps
    columns.append(tf.gather(axis_values, axis_index))

  return tf.stack(columns, axis=1)
def FProp(self, theta, current_step):
  """Returns the current learning rate decay.

  The result is min((step + 1) * warmup_steps**-1.5, (step + 1)**-0.5), where
  warmup_steps = warmup_examples / (batch_size * number of replicas).

  Args:
    theta: Unused by this method.
    current_step: The current training step; it is converted to float.

  Returns:
    A float tensor holding the decay factor.
  """
  hparams = self.params
  step_count = tf.to_float(current_step) + 1
  examples_per_step = hparams.batch_size * self._num_replicas
  warmup_steps = tf.to_float(hparams.warmup_examples / examples_per_step)
  linear_warmup = step_count * warmup_steps**-1.5
  inverse_sqrt_decay = step_count**-0.5
  return tf.minimum(linear_warmup, inverse_sqrt_decay)
def FProp(self, theta, current_step):
  """Returns the current learning rate decay.

  The result is model_dim**-0.5 *
  min((step + 1) * warmup_steps**-1.5, (step + 1)**-0.5), where
  warmup_steps = p.warmup_steps * p.worker_replicas and, when p.decay_end is
  set, step is held at p.decay_end once it has been reached.

  Args:
    theta: Unused by this method.
    current_step: The current training step; it is converted to float.

  Returns:
    A float tensor holding the decay factor.
  """
  hparams = self.params
  step = tf.to_float(current_step)
  warmup_steps = tf.to_float(hparams.warmup_steps * hparams.worker_replicas)
  if hparams.decay_end is not None:
    # Freeze the schedule once decay_end has been reached.
    step = tf.where(step < hparams.decay_end, step,
                    tf.to_float(hparams.decay_end))
  step_count = step + 1
  schedule = tf.minimum(step_count * warmup_steps**-1.5, step_count**-0.5)
  return hparams.model_dim**-0.5 * schedule
def FProp(self, theta, current_step):
  """Returns the current learning rate decay.

  The result is model_dim**-0.5 *
  min(step + 1, (step + 1)**-0.5, peak), where
  peak = (decay_start * worker_replicas)**-0.5 and, when decay_end is set,
  step is held at decay_end once it has been reached.

  Args:
    theta: Unused by this method.
    current_step: The current training step; it is converted to float.

  Returns:
    A float tensor holding the decay factor.
  """
  hparams = self.params
  step = tf.to_float(current_step)
  warmup_steps = tf.to_float(hparams.decay_start * hparams.worker_replicas)
  if hparams.decay_end is not None:
    # Freeze the schedule once decay_end has been reached.
    step = tf.where(step < hparams.decay_end, step,
                    tf.to_float(hparams.decay_end))
  peak_learning_rate = warmup_steps**-0.5
  step_count = step + 1
  decayed = tf.minimum(step_count, step_count**-0.5)
  capped = tf.minimum(decayed, peak_learning_rate)
  return (hparams.model_dim**-0.5) * capped
def _Value(self, current_step):
  """Returns the current clipping cap.

  Before p.start_step the cap is p.start_cap. From p.start_step onwards it is
  linearly interpolated from p.start_cap to p.end_cap, and stays at p.end_cap
  once p.end_step has been reached.

  Args:
    current_step: The current step; it is cast to float32.

  Returns:
    A float tensor holding the cap.
  """
  p = self.params
  ramp_begin = tf.cast(p.start_step, tf.float32)
  ramp_end = tf.cast(p.end_step, tf.float32)
  step = tf.cast(current_step, tf.float32)

  ramp_length = ramp_end - ramp_begin
  # Fraction of the ramp completed so far; the minimum caps it at 1.
  progress = tf.minimum(ramp_length, step - ramp_begin) / ramp_length
  interpolated_cap = progress * p.end_cap + (1.0 - progress) * p.start_cap

  def _BeforeRamp():
    return tf.to_float(p.start_cap)

  def _OnOrAfterRampStart():
    return tf.to_float(interpolated_cap)

  return tf.cond(
      tf.less(step, p.start_step), _BeforeRamp, _OnOrAfterRampStart)
def testPointConvParametricConvShapes(self):
  """Checks the output shape of the PointConv parametric conv layer."""
  batch_size = 4
  num_groups = 5
  points_per_group = 6
  num_in_channels = 7
  num_out_channels = 8
  group_shape = (batch_size, num_groups, points_per_group)

  builder = builder_lib.ModelBuilderBase()
  layer_params = builder._PointConvParametricConv(
      'test', [3, 4, 9], num_in_channels, num_out_channels)
  layer = layer_params.Instantiate()

  points = tf.random_uniform(group_shape + (3,), dtype=tf.float32)
  features = tf.random_uniform(
      group_shape + (num_in_channels,), dtype=tf.float32)
  # Random 0/1 padding indicator per point.
  padding = tf.to_float(
      tf.random_uniform(group_shape, minval=0, maxval=2, dtype=tf.int32))
  inputs = py_utils.NestedMap(
      points=points, features=features, padding=padding)

  outputs = layer.FPropDefaultTheta(inputs)

  with self.session() as sess:
    sess.run(tf.global_variables_initializer())
    actual_outputs = sess.run(outputs)

  expected_shape = (batch_size, num_groups, num_out_channels)
  self.assertAllEqual(actual_outputs.shape, expected_shape)
def _BBox2DImage(self, bbox_corners_image, input_images):
  """Compute [xmin, ymin, xmax, ymax] 2D bounding boxes from corners.

  Args:
    bbox_corners_image: Projected box corners whose last axis is (x, y) and
      whose axis 2 enumerates the corners of each box.
    input_images: Object exposing `width` and `height` tensors used as the
      upper clipping bounds.

  Returns:
    Tensor whose last axis holds [xmin, ymin, xmax, ymax] of the corners after
    clipping x to [0, width] and y to [0, height].
  """

  def _ClipToImage(coords, upper_bound):
    """Clips `coords` to [0, upper_bound], broadcasting the bound."""
    upper_bound = tf.broadcast_to(
        upper_bound[..., tf.newaxis, tf.newaxis], tf.shape(coords))
    return tf.clip_by_value(coords, 0.0, tf.to_float(upper_bound))

  # Clip the boundaries of the bounding box to the image width/height.
  clipped_x = _ClipToImage(bbox_corners_image[..., 0:1], input_images.width)
  clipped_y = _ClipToImage(bbox_corners_image[..., 1:2], input_images.height)
  clipped_corners = tf.concat([clipped_x, clipped_y], axis=-1)

  # Reduce over the corner axis to get the (x, y) extrema of every box.
  corner_min = tf.math.reduce_min(clipped_corners, axis=2)
  corner_max = tf.math.reduce_max(clipped_corners, axis=2)
  return tf.concat([corner_min, corner_max], axis=2)
def _Extract(self, features):
  """Extracts per-object label tensors padded to p.max_num_objects.

  Args:
    features: Mapping from feature name to parsed feature; every entry read
      here is densified with `_Dense`.

  Returns:
    A NestedMap with labels, label ids, difficulties, 3D boxes, their masks
    (filtered and unfiltered), per-box point counts, speed and acceleration.
  """
  p = self.params
  max_objects = p.max_num_objects

  # Label values match the proto enum car.open_dataset.Label.Type. The value
  # range is [1..4] for non-background labels.
  labels = py_utils.PadOrTrimTo(
      tf.to_int32(_Dense(features['labels'])), [max_objects])

  label_ids = py_utils.PadOrTrimTo(
      tf.reshape(_Dense(features['label_ids'], ''), [-1]), [max_objects], '')

  raw_bboxes_3d = tf.reshape(_Dense(features['bboxes_3d']), [-1, 7])
  # One entry per real (unpadded) box; padding below extends it with zeros.
  raw_mask = tf.ones([tf.shape(raw_bboxes_3d)[0]])
  raw_num_points = tf.to_int32(_Dense(features['bboxes_3d_num_points']))
  bboxes_3d = py_utils.PadOrTrimTo(raw_bboxes_3d, [max_objects, 7])
  unfiltered_bboxes_3d_mask = py_utils.PadOrTrimTo(raw_mask, [max_objects])
  bboxes_3d_num_points = py_utils.PadOrTrimTo(raw_num_points, [max_objects])

  label_metadata = py_utils.PadOrTrimTo(
      tf.reshape(_Dense(features['label_metadata']), [-1, 4]),
      [max_objects, 4])

  detection_difficulties = py_utils.PadOrTrimTo(
      tf.to_int32(_Dense(features['detection_difficulties'])), [max_objects])
  tracking_difficulties = py_utils.PadOrTrimTo(
      tf.to_int32(_Dense(features['tracking_difficulties'])), [max_objects])

  bboxes_3d_mask = unfiltered_bboxes_3d_mask
  if p.filter_labels:
    # Keep only boxes whose label is one of p.filter_labels.
    valid_labels = tf.constant([p.filter_labels])
    label_is_valid = tf.reduce_any(
        tf.equal(tf.expand_dims(labels, 1), valid_labels), axis=1)
    bboxes_3d_mask = bboxes_3d_mask * tf.to_float(label_is_valid)

  return py_utils.NestedMap({
      'labels': labels,
      'label_ids': label_ids,
      'detection_difficulties': detection_difficulties,
      'tracking_difficulties': tracking_difficulties,
      'bboxes_3d': bboxes_3d,
      'bboxes_3d_mask': bboxes_3d_mask,
      'bboxes_3d_num_points': bboxes_3d_num_points,
      'unfiltered_bboxes_3d_mask': unfiltered_bboxes_3d_mask,
      'speed': label_metadata[:, :2],
      'acceleration': label_metadata[:, 2:],
  })
def SetMetrics(self, metric_dict, step_args):
  """Sets the metrics to evaluate and the per-step output tensors.

  Args:
    metric_dict: dict of (name -> (tensor of values, tensor of weights))
    step_args: the tensors being passed to the training loop body. These share
      the same structure of alternating value and weight scalars as the
      initial values and the output of this function.

  Returns:
    The tensors to return from the training loop body. The leading
    2 * len(metric_dict) entries are [value * weight, weight] pairs computed
    from metric_dict; the remaining entries are passed through unchanged from
    step_args (which has the effect of passing the initial values through
    every iteration of the loop).
  """
  num_metrics = len(metric_dict)
  assert num_metrics <= self._max_metrics, ('Increase _max_metrics to >= %d' %
                                            num_metrics)
  self._metrics = py_utils.NestedMap(metric_dict)

  # Each (metric_value, metric_weight) pair is emitted as
  # [metric_value * metric_weight, metric_weight], which makes it easier to
  # aggregate metric values across steps and TPU replicas.
  weighted_pairs = []
  for metric_value, metric_weight in self._metrics.Flatten():
    assert metric_value.shape.is_fully_defined(), ('%s' % metric_value)
    assert metric_weight.shape.is_fully_defined(), ('%s' % metric_weight)
    float_weight = tf.to_float(metric_weight)
    weighted_pairs.extend([tf.to_float(metric_value) * float_weight,
                           float_weight])

  # Each metric has two tensors: value and weight.
  assert len(weighted_pairs) == 2 * num_metrics

  passthrough = list(step_args)[len(weighted_pairs):]
  return weighted_pairs + passthrough
def _CreateFrustumMask(self, bbox_corners_image, bbox2d_corners_image_clipped,
                       image_height, image_width):
  """Creates a box mask for boxes whose projections fall outside of image.

  Args:
    bbox_corners_image: Projected box corners; leading dims are
      [batch, num_boxes], axis 2 enumerates corners and the last axis is
      (x, y).
    bbox2d_corners_image_clipped: Clipped 2D boxes whose last axis is
      [xmin, ymin, xmax, ymax].
    image_height: Not used by this method.
    image_width: Not used by this method.

  Returns:
    Float tensor of shape [batch, num_boxes]. All ones when
    p.filter_predictions_outside_frustum is false; otherwise 1.0 where
    (clipped area / unclipped area) > p.truncation_threshold, else 0.0.
  """
  p = self.params
  batch_size, num_boxes = py_utils.GetShape(bbox_corners_image, 2)
  if not p.filter_predictions_outside_frustum:
    return tf.ones(shape=(batch_size, num_boxes), dtype=tf.float32)

  box_shape = [batch_size, num_boxes]

  def _Extent(coords):
    """Returns max - min of `coords` over the corner axis."""
    lowest = tf.reduce_min(coords, axis=-1)
    highest = tf.reduce_max(coords, axis=-1)
    lowest = py_utils.HasShape(lowest, box_shape)
    highest = py_utils.HasShape(highest, box_shape)
    return highest - lowest

  # Area of the full (unclipped) 2d projection of each box.
  full_width = _Extent(bbox_corners_image[:, :, :, 0])
  full_height = _Extent(bbox_corners_image[:, :, :, 1])
  original_area = full_width * full_height

  # Area of the projection after clipping to the image.
  clipped_x_min, clipped_y_min, clipped_x_max, clipped_y_max = [
      bbox2d_corners_image_clipped[..., i] for i in range(4)
  ]
  clipped_area = (clipped_x_max - clipped_x_min) * (
      clipped_y_max - clipped_y_min)

  # Fraction of the projected box that remains inside the image.
  visible_fraction = clipped_area / original_area
  inside_frustum = py_utils.HasShape(
      visible_fraction > p.truncation_threshold, box_shape)
  return tf.to_float(inside_frustum)
def _Extract(self, features):
  """Extracts KITTI object labels, padded to p.max_num_objects.

  Args:
    features: Mapping from feature name to parsed feature. Entries read via
      `_Dense` are densified first; 'object/label' is read through its
      `.values` attribute.

  Returns:
    A NestedMap with the following fields:
      source_id: scalar taken from 'image/source_id'.
      bboxes_count: number of 2D boxes before padding.
      bboxes: [max_num_objects, 4] 2D image boxes as [ymin, xmin, ymax, xmax].
      bboxes_padding: [max_num_objects], 1.0 for padded or filtered-out boxes.
      bboxes_3d: [max_num_objects, 7] as [x, y, z, dx, dy, dz, phi].
      bboxes_3d_mask: has_3d_info mask, additionally zeroed for labels not in
        p.filter_labels when that is set.
      unfiltered_bboxes_3d_mask: has_3d_info mask without label filtering.
      bboxes3d_proj_to_image_plane: [max_num_objects, 8, 2] projected corners.
      bboxes_td: [max_num_objects, 4] axis-aligned x-y extents of the 3D boxes
        as [cy - dy/2, cx - dx/2, cy + dy/2, cx + dx/2] (phi is ignored).
      bboxes_td_mask: mask for bboxes_td, label-filtered like bboxes_3d_mask.
      bboxes_3d_num_points: all-zero int32 placeholder.
      labels, texts: integer class ids and the raw label strings.
      box_image_height, occlusion, truncation, difficulties: per-box values
        used for (and produced by) ComputeKITTIDifficulties.
  """
  p = self.params

  source_id = py_utils.HasShape(features['image/source_id'], [])
  xmin = _Dense(features['object/image/bbox/xmin'])
  xmax = _Dense(features['object/image/bbox/xmax'])
  ymin = _Dense(features['object/image/bbox/ymin'])
  ymax = _Dense(features['object/image/bbox/ymax'])

  # 2d bounding box in image coordinates.
  bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=1)
  bboxes_count = tf.shape(bboxes)[0]
  bboxes = py_utils.PadOrTrimTo(bboxes, [p.max_num_objects, 4])

  # 0.0 for the first bboxes_count entries (real boxes), 1.0 for padding.
  bboxes_padding = 1.0 - py_utils.PadOrTrimTo(
      tf.ones([bboxes_count]), [p.max_num_objects])

  dim_xyz = tf.reshape(_Dense(features['object/velo/bbox/dim_xyz']), [-1, 3])
  loc_xyz = tf.reshape(_Dense(features['object/velo/bbox/xyz']), [-1, 3])
  phi = tf.reshape(_Dense(features['object/velo/bbox/phi']), [-1, 1])
  # bboxes_3d is in [x, y, z, dx, dy, dz, phi].
  bboxes_3d = tf.concat([loc_xyz, dim_xyz, phi], axis=1)

  # Axis-aligned extent of each 3D box in the x-y plane; the rotation phi is
  # not taken into account here.
  cx, cy, _, dx, dy, _, _ = tf.unstack(bboxes_3d, num=7, axis=-1)
  bboxes_td = tf.stack([
      cy - dy / 2,
      cx - dx / 2,
      cy + dy / 2,
      cx + dx / 2,
  ], axis=-1)  # pyformat: disable
  bboxes_td = py_utils.PadOrTrimTo(bboxes_td, [p.max_num_objects, 4])

  has_3d_info = tf.to_float(_Dense(features['object/has_3d_info']))
  bboxes_3d_mask = py_utils.PadOrTrimTo(has_3d_info, [p.max_num_objects])
  bboxes_td_mask = bboxes_3d_mask

  # Fill in difficulties from bounding box height, truncation and occlusion.
  bb_height = ymax - ymin
  box_image_height = py_utils.PadOrTrimTo(bb_height, [p.max_num_objects])
  box_image_height *= bboxes_3d_mask

  # 0 to 3 indicating occlusion level. Per the KITTI object devkit readme:
  # 0 means fully visible, 1 means partly occluded, 2 means largely occluded
  # and 3 means unknown.
  occlusion = tf.reshape(_Dense(features['object/occlusion']), [-1])
  occlusion = tf.to_float(occlusion)
  occlusion = py_utils.PadOrTrimTo(occlusion, [p.max_num_objects])
  occlusion *= bboxes_3d_mask

  # Truncation: 0 -> not truncated, 1.0 -> truncated
  truncation = tf.reshape(_Dense(features['object/truncation']), [-1])
  truncation = py_utils.PadOrTrimTo(truncation, [p.max_num_objects])
  truncation *= bboxes_3d_mask

  difficulties = ComputeKITTIDifficulties(box_image_height, occlusion,
                                          truncation)
  difficulties = py_utils.PadOrTrimTo(difficulties, [p.max_num_objects])

  # Make a batch axis to call BBoxCorners, and take the first result back.
  # NOTE: bboxes_3d is still unpadded here; it is only padded further below.
  bbox3d_corners = geometry.BBoxCorners(bboxes_3d[tf.newaxis, ...])[0]

  # Project the 3D bbox to the image plane.
  velo_to_image_plane = features['transform/velo_to_image_plane']
  bboxes3d_proj_to_image_plane = geometry.PointsToImagePlane(
      tf.reshape(bbox3d_corners, [-1, 3]), velo_to_image_plane)

  # Output is [num_objects, 8 corners per object, (x, y)].
  bboxes3d_proj_to_image_plane = tf.reshape(bboxes3d_proj_to_image_plane,
                                            [-1, 8, 2])
  bboxes3d_proj_to_image_plane = py_utils.PadOrTrimTo(
      bboxes3d_proj_to_image_plane, [p.max_num_objects, 8, 2])

  texts = features['object/label'].values
  labels = ops.static_map_string_int(x=texts, keys=self.KITTI_CLASS_NAMES)

  labels = py_utils.PadOrTrimTo(labels, [p.max_num_objects])
  texts = py_utils.PadOrTrimTo(texts, [p.max_num_objects])

  # Filter labels by setting bboxes_padding, bboxes_3d_mask, and
  # bboxes_td_mask appropriately.
  if p.filter_labels is not None:
    valid_labels = tf.constant([p.filter_labels])
    bbox_mask = tf.reduce_any(
        tf.equal(tf.expand_dims(labels, 1), valid_labels), axis=1)
    bbox_mask = tf.to_float(bbox_mask)
    # A box stays unpadded only if it was unpadded AND its label is valid.
    bboxes_padding = 1 - bbox_mask * (1 - bboxes_padding)
    filtered_bboxes_3d_mask = bboxes_3d_mask * bbox_mask
    bboxes_td_mask *= bbox_mask
  else:
    filtered_bboxes_3d_mask = bboxes_3d_mask

  # Placeholder for counting the number of laser points that reside within
  # each 3-d bounding box. This must be filled in outside of this function
  # based on the loaded 3-d laser points.
  bboxes_3d_num_points = tf.zeros([p.max_num_objects], dtype=tf.int32)
  bboxes_3d_num_points = py_utils.PadOrTrimTo(bboxes_3d_num_points,
                                              [p.max_num_objects])

  # Pad bboxes_3d.
  bboxes_3d = py_utils.PadOrTrimTo(bboxes_3d, [p.max_num_objects, 7])

  return py_utils.NestedMap(
      source_id=source_id,
      bboxes_count=bboxes_count,
      bboxes=bboxes,
      bboxes_padding=bboxes_padding,
      bboxes_3d=bboxes_3d,
      bboxes_3d_mask=filtered_bboxes_3d_mask,
      unfiltered_bboxes_3d_mask=bboxes_3d_mask,
      bboxes3d_proj_to_image_plane=bboxes3d_proj_to_image_plane,
      bboxes_td=bboxes_td,
      bboxes_td_mask=bboxes_td_mask,
      bboxes_3d_num_points=bboxes_3d_num_points,
      labels=labels,
      texts=texts,
      box_image_height=box_image_height,
      occlusion=occlusion,
      truncation=truncation,
      difficulties=difficulties)
def _SingleClassDecodeWithNMS(predicted_bboxes,
                              classification_scores,
                              nms_iou_threshold,
                              score_threshold,
                              max_boxes_per_class=None):
  """Runs class-agnostic NMS on predicted boxes and their class scores.

  Args:
    predicted_bboxes: [batch_size, num_boxes, 7] float Tensor containing
      predicted bounding box coordinates.
    classification_scores: [batch_size, num_boxes, num_classes] float Tensor
      containing predicted classification scores for each box.
    nms_iou_threshold: Python float. IoU threshold used to decide whether two
      boxes overlap for purposes of suppression.
    score_threshold: Python float. The score threshold passed to NMS that
      allows NMS to quickly ignore irrelevant boxes.
    max_boxes_per_class: The maximum number of boxes per example to emit. If
      None, this value is set to num_boxes from the shape of predicted_bboxes.

  Returns:
    predicted_bboxes: Filtered bboxes after NMS of shape
      [batch_size, num_classes, max_boxes_per_class, 7].
    bbox_scores: A float32 Tensor with the score for each box of shape
      [batch_size, num_classes, max_boxes_per_class].
    valid_mask: A float32 Tensor with 1/0 values indicating the validity of
      each box. 1 indicates valid, and 0 invalid. Tensor of shape
      [batch_size, num_classes, max_boxes_per_class].

  Raises:
    ValueError: If nms_iou_threshold or score_threshold is not a Python float.
  """
  utils_3d = detection_3d_lib.Utils3D()
  predicted_bboxes = py_utils.HasShape(predicted_bboxes, [-1, -1, 7])
  batch_size, num_predicted_boxes, _ = py_utils.GetShape(predicted_bboxes)
  classification_scores = py_utils.HasShape(
      classification_scores, [batch_size, num_predicted_boxes, -1])
  _, _, num_classes = py_utils.GetShape(classification_scores)

  scalar_args = (
      ('nms_iou_threshold', nms_iou_threshold),
      ('score_threshold', score_threshold),
  )
  for arg_name, arg_value in scalar_args:
    if not isinstance(arg_value, float):
      raise ValueError(
          'Single class NMS only supports a scalar `%s`.' % arg_name)

  if max_boxes_per_class is None:
    max_boxes_per_class = num_predicted_boxes

  # TODO(jngiam): Change to be per-class bboxes, and hence, per-class NMS, and
  # per-class thresholding.
  # Best score over classes for each box: [batch, num_predicted_boxes].
  best_scores = tf.reduce_max(classification_scores, axis=-1)

  # The most likely label is the class with the highest score.
  best_labels = tf.argmax(classification_scores, axis=-1)

  # Zero the score of boxes whose most likely class is background (id 0) so
  # that background boxes don't dominate the NMS.
  is_foreground = tf.to_float(best_labels > 0)
  nms_scores = best_scores * is_foreground

  # Compute NMS for every sample in the batch.
  nms_indices, valid_mask = utils_3d.BatchedNMSIndices(
      predicted_bboxes,
      nms_scores,
      nms_iou_threshold=nms_iou_threshold,
      score_threshold=score_threshold,
      max_num_boxes=max_boxes_per_class)

  # Reorder the box data and scores according to the NMS result.
  kept_bboxes = tf.batch_gather(predicted_bboxes, nms_indices)
  kept_scores = tf.batch_gather(classification_scores, nms_indices)

  # Reformat the outputs to the per-class layout
  # [batch_size, num_classes, max_boxes_per_class, ...] used by
  # MultiClassOrientedDecodeWithNMS. Because this NMS is not class specific,
  # boxes and validity are tiled num_classes times, while the scores only need
  # their class axis moved in front of the box axis.
  tiled_bboxes = tf.tile(kept_bboxes[:, tf.newaxis, :, :],
                         [1, num_classes, 1, 1])
  per_class_scores = py_utils.HasShape(
      tf.transpose(kept_scores, (0, 2, 1)),
      [batch_size, num_classes, max_boxes_per_class])
  tiled_valid_mask = tf.tile(valid_mask[:, tf.newaxis, :],
                             [1, num_classes, 1])
  return tiled_bboxes, per_class_scores, tiled_valid_mask
def add_point_cloud(self, feature, laser_names, range_image_pose):
  """Convert the range images in `feature` to 3D point clouds.

  Adds the point cloud data to the tf.Example feature map. `feature` is
  modified in place: for every laser and every return ('ri1', 'ri2') a
  flattened float list is written to feature['laser_<LASER>_<ri>'].

  Args:
    feature: A tf.Example feature map.
    laser_names: A list of laser names (e.g., 'TOP', 'REAR', 'SIDE_LEFT').
    range_image_pose: A range image pose Tensor for the GBR.
  """
  for laser_name in laser_names:
    beam_inclinations = np.array(feature['%s_beam_inclinations' %
                                         laser_name].float_list.value[:])
    # beam_inclinations will be populated if there is a non-uniform
    # beam configuration (e.g., for the TOP lasers).  Others that have
    # uniform beam inclinations are only parameterized by the min and max.
    # We use these min and max if the beam_inclinations are not present,
    # and turn them into a uniform inclinations array.
    if beam_inclinations.size == 0:
      beam_inclination_min = feature['%s_beam_inclination_min' %
                                     laser_name].float_list.value[:]
      beam_inclination_max = feature['%s_beam_inclination_max' %
                                     laser_name].float_list.value[:]

      # The height is read from the shape of the first-return range image.
      laser_ri_name = '%s_ri1' % laser_name
      range_image_shape = feature[laser_ri_name +
                                  '_shape'].int64_list.value[:]
      height = tf.to_float(range_image_shape[0])

      beam_inclinations = tf.constant(
          [beam_inclination_min[0], beam_inclination_max[0]])
      beam_inclinations = range_image_utils.compute_inclination(
          beam_inclinations, height)

    beam_extrinsics = np.array(
        feature['%s_extrinsics' % laser_name].float_list.value[:]).reshape(
            4, 4)

    for ri_type in ['ri1', 'ri2']:
      laser_ri_name = '%s_%s' % (laser_name, ri_type)
      # For each of the 4 features of the lasers:
      range_image = np.array(feature[laser_ri_name].float_list.value[:])
      range_image_shape = feature[laser_ri_name +
                                  '_shape'].int64_list.value[:]
      range_image = range_image.reshape(range_image_shape)
      # Compute mask.  At the moment, invalid values in the range image
      # representation are indicated via a -1. entry.  Callers are expected
      # to create this mask when passing into the conversion function below.
      range_image_mask = range_image[..., 0] >= 0

      # Get the 'range' feature from the range images.
      range_image_range = range_image[..., 0]

      # Call utility to convert point cloud to cartesian coordinates.
      #
      # API expects a batch dimension for all inputs.
      batched_pixel_pose = None
      batched_frame_pose = None
      # At the moment, only the GBR has per-pixel pose.
      if laser_name == 'TOP':
        batched_pixel_pose = range_image_pose[tf.newaxis, ...]
        batched_frame_pose = self.frame_pose[tf.newaxis, ...]

      batched_range_image_range = tf.convert_to_tensor(
          range_image_range[np.newaxis, ...], dtype=tf.float32)
      batched_extrinsics = tf.convert_to_tensor(
          beam_extrinsics[np.newaxis, ...], dtype=tf.float32)
      batched_inclinations = tf.convert_to_tensor(
          beam_inclinations[np.newaxis, ...], dtype=tf.float32)

      # The inclinations are reversed along the last axis before conversion.
      # NOTE(review): presumably because the conversion utility expects the
      # opposite row order from the stored one -- confirm.
      batched_inclinations = tf.reverse(batched_inclinations, axis=[-1])

      range_image_cartesian = (
          range_image_utils.extract_point_cloud_from_range_image(
              batched_range_image_range,
              batched_extrinsics,
              batched_inclinations,
              pixel_pose=batched_pixel_pose,
              frame_pose=batched_frame_pose))

      # Keep only the pixels that hold a valid return.
      points_xyz = tf.gather_nd(range_image_cartesian[0],
                                tf.where(range_image_mask))

      # Fetch the features corresponding to each xyz coordinate and
      # concatenate them together.
      points_features = tf.to_float(
          tf.gather_nd(range_image[..., 1:], tf.where(range_image_mask)))
      points_data = tf.concat([points_xyz, points_features], axis=-1)

      # Add laser feature to output.
      #
      # Skip embedding shape since we assume that all points have six features
      # and so we can reconstruct the number of points.
      # `.numpy()` is called on the result, so this code requires eager
      # execution.
      points_list = list(points_data.numpy().reshape([-1]))
      feature['laser_%s' % laser_ri_name].float_list.value[:] = points_list
def NeighborhoodIndices(points,
                        query_points,
                        k,
                        points_padding=None,
                        max_distance=None,
                        sample_neighbors_uniformly=False):
  """Get indices to k-neighbors of query_points in points.

  Padding is returned along-side indices. Non-padded points are guaranteed to
  be unique (non-repeated) points from original non-padded points.

  Padded points arise due to either a lack of points (k exceeds the number of
  original non-padded points) or points are too far away (exceeds max
  distance).

  Note: Padded point indices may refer to padded points from the original, or
  may be duplicates of the closest point.

  TODO(weihan,jngiam): PointCNN implementation makes an assumption that padded
  points are repeated points from the original points. This behavior is
  maintained here, but we should update PointCNN to respect indices paddings.

  Args:
    points: tensor of shape [N, P1, dims].
    query_points: tensor of shape [N, P2, dims]
    k: Integer.
    points_padding: optional tensor of shape [N, P1] containing True/1.0 iff
      the point is a padded point. if None, then all points are considered
      real points.
    max_distance: float representing the maximum distance that each neighbor
      can be. If there are no points within the distance, then the closest
      point is returned (regardless of distance). If this is set to None, then
      no filtering by distance is performed.
    sample_neighbors_uniformly: boolean specifying whether to sample neighbors
      uniformly if they are within max distance.

  Returns:
    indices: tensor of shape [N, P2, k].
    padding: tensor of shape [N, P2, k] where 1 represents a padded point, and
      0 represents an unpadded (real) point.

  Raises:
    ValueError: If sample_neighbors_uniformly is set without max_distance.
  """
  n, p1 = py_utils.GetShape(points, 2)
  query_points = py_utils.HasShape(query_points, [n, -1, -1])
  _, p2 = py_utils.GetShape(query_points, 2)

  # Compute pair-wise squared distances.
  # Note that dist_mat contains the squared distance (without sqrt). Thus, when
  # using max_distance, we will need to square max_distance to make sure it's
  # in the same units.
  dist_mat = SquaredDistanceMatrix(query_points, points)
  dist_mat = py_utils.HasShape(dist_mat, [n, p2, p1])

  # Add a large scalar to the distances for padded points.
  # dist_mat[i, j, l] will be:
  #   if points[i, l] is real: distance between points[i, l] and
  #                            query_points[i, j]
  #   otherwise:               that distance plus a large scalar
  if points_padding is not None:
    points_padding = tf.to_float(tf.expand_dims(points_padding, 1))
    points_padding = py_utils.HasShape(points_padding, [n, 1, p1])
    large_scalar = tf.reduce_max(dist_mat) + 1
    if sample_neighbors_uniformly and max_distance is not None:
      # Uniform sampling below overwrites every distance <= max_distance**2
      # with a random value in [0, max_distance**2). The padding offset must
      # therefore exceed max_distance**2; otherwise padded points could be
      # randomized (and picked as real neighbors), and randomized real points
      # could reach large_scalar and be misreported as padding.
      large_scalar = tf.maximum(
          large_scalar,
          tf.cast(tf.square(max_distance), dist_mat.dtype) + 1)
    dist_mat += points_padding * large_scalar

  # To perform sampling neighbors uniformly efficiently, we set all neighbors
  # that are within the distance threshold to have distances be drawn uniformly
  # at random. Using top_k with this enables selecting a random set quickly
  # without replacement.
  if sample_neighbors_uniformly:
    if max_distance is None:
      raise ValueError('Uniform sampling requires specifying max_distance.')
    mask_by_distance = tf.less_equal(dist_mat, max_distance**2)
    dist_mat = tf.where(
        mask_by_distance,
        tf.square(max_distance) * tf.random_uniform(tf.shape(dist_mat)),
        dist_mat)

  # top_k selects the largest values, so negate to get the k smallest
  # distances, sorted from closest to farthest.
  top_k_dist, indices = tf.nn.top_k(-dist_mat, k=k, sorted=True)  # N x P2 x K

  # Set padding using top_k_dist; padded points will have distance exceeding
  # the large_scalar.
  if points_padding is not None:
    paddings = tf.greater_equal(-top_k_dist, large_scalar)
  else:
    paddings = tf.zeros_like(top_k_dist, dtype=tf.bool)

  # Filter by max_distances by setting all indices that exceed the max_distance
  # to the closest point.
  if max_distance is not None:
    # Mask is true for points that are further than max_distance.
    mask_by_distance = tf.greater(-top_k_dist, tf.square(max_distance))
    closest_idx = tf.tile(indices[:, :, :1], [1, 1, k])
    indices = tf.where(mask_by_distance, closest_idx, indices)
    paddings |= mask_by_distance

  indices = tf.reshape(indices, [n, p2, k])
  paddings = tf.to_float(paddings)

  return indices, paddings
def AssignAnchors(self,
                  anchor_bboxes,
                  gt_bboxes,
                  gt_bboxes_labels,
                  gt_bboxes_mask,
                  foreground_assignment_threshold=0.5,
                  background_assignment_threshold=0.35,
                  background_class_id=0,
                  force_match=True,
                  similarity_fn=None):
  """Assigns anchors to bboxes using a similarity function (SSD-based).

  Each anchor box is assigned to the top matching ground truth box.
  Ground truth boxes can be assigned to multiple anchor boxes.

  Assignments can result in 3 outcomes:

    Positive assignment (if score >= foreground_assignment_threshold):
      assigned_gt_labels will reflect the assigned box label and
      assigned_cls_mask will be set to 1.0

    Background assignment (if score <= background_assignment_threshold):
      assigned_gt_labels will be background_class_id and assigned_cls_mask
      will be set to 1.0

    Ignore assignment (otherwise):
      assigned_gt_labels will be background_class_id and assigned_cls_mask
      will be set to 0.0

  The detection loss function would usually:

    Use assigned_cls_mask for weighting the classification loss. The mask
    is set such that the loss applies to foreground and background assignments
    only - ignored anchors will be set to 0.

    Use assigned_reg_mask for weighting the regression loss. The mask is set
    such that the loss applies to foreground assignments only.

  The thresholds (foreground_assignment_threshold and
  background_assignment_threshold) should be tuned per dataset.

  TODO(jngiam): Consider having a separate threshold for regression boxes; a
  separate threshold is used in PointRCNN.

  Args:
    anchor_bboxes: tf.float32. [A, 7], where [..., :] corresponds to box
      parameters (x, y, z, dx, dy, dz, r).
    gt_bboxes: tf.float32. [G, 7], where [..., :] corresponds to ground truth
      box parameters (x, y, z, dx, dy, dz, r).
    gt_bboxes_labels: tensor with shape [G]. Ground truth labels for each
      bounding box.
    gt_bboxes_mask: tensor with shape [G]. Mask for ground truth boxes, 1 iff
      the gt_bbox is a real bbox.
    foreground_assignment_threshold: Similarity score threshold for assigning
      foreground bounding boxes; scores need to be >=
      foreground_assignment_threshold to be assigned to foreground.
    background_assignment_threshold: Similarity score threshold for assigning
      background bounding boxes; scores need to be <=
      background_assignment_threshold to be assigned to background.
    background_class_id: class id to be assigned to anchors_gt_class if no
      anchor boxes match.
    force_match: Boolean specifying if force matching is enabled. If force
      matching is enabled, then matched anchors which are also the highest
      scoring with a ground-truth box are considered foreground matches as
      long as their similarity score > 0.
    similarity_fn: Function that computes a similarity score (e.g., IOU)
      between pairs of bounding boxes. This function should take in two
      tensors corresponding to anchor and ground-truth bboxes, and return a
      matrix [A, G] with the similarity score between each pair of bboxes. The
      score must be non-negative, with greater scores representing more
      similar. The fore/background_assignment_thresholds will be applied to
      this score to determine if an anchor is foreground, background or
      ignored. If set to None, the function will default to
      IOU2DRotatedBoxes.

  Returns:
    NestedMap with the following keys:
      assigned_gt_bbox: shape [A, 7] bbox parameters assigned to each anchor.
      assigned_gt_similarity_score: shape [A] (iou) score between the anchor
        and the gt bbox.
      assigned_gt_labels: shape [A] label assigned to bbox.
      assigned_cls_mask: shape [A] mask for classification loss per anchor.
        This should be 1.0 if the anchor has a foreground or background
        assignment; otherwise, it will be assigned to 0.0.
      assigned_reg_mask: shape [A] mask for regression loss per anchor.
        This should be 1.0 if the anchor has a foreground assignment;
        otherwise, it will be assigned to 0.0.
        Note: background anchors do not have regression targets.
  """
  if similarity_fn is None:
    similarity_fn = self.IOU2DRotatedBoxes

  # Shape validation.
  anchor_bboxes = py_utils.HasShape(anchor_bboxes, [-1, 7])
  num_anchor_bboxes, _ = py_utils.GetShape(anchor_bboxes, 2)
  gt_bboxes = py_utils.HasShape(gt_bboxes, [-1, 7])
  num_gt_bboxes, _ = py_utils.GetShape(gt_bboxes, 2)

  # Compute similarity score and reduce max by anchors and by ground-truth.
  similarity_score = similarity_fn(anchor_bboxes, gt_bboxes)
  similarity_score = py_utils.HasShape(similarity_score,
                                       [num_anchor_bboxes, num_gt_bboxes])

  # Reduce over ground-truth boxes, so we have the max score per anchor.
  anchor_max_score = tf.reduce_max(similarity_score, axis=1)
  anchor_max_idx = tf.argmax(similarity_score, axis=1)

  if force_match:
    # Reduce over anchors, so we have the max score per ground truth box.
    gt_max_score = tf.reduce_max(similarity_score, axis=0, keep_dims=True)

    # Force matches occur when the top matching gt bbox for an anchor is the
    # top matching anchor for the gt bbox. When force matching, we match
    # these boxes as long as their similarity score exceeds 0.
    # The last term excludes padded ground-truth boxes from force matching.
    force_matches = (
        tf.equal(similarity_score, gt_max_score)
        & tf.equal(similarity_score, anchor_max_score[..., tf.newaxis])
        & tf.greater(similarity_score, 0.)
        & tf.cast(gt_bboxes_mask[tf.newaxis, ...], tf.bool))
    force_match_indicator = tf.reduce_any(force_matches, axis=1)
    # argmax over the 0/1 matrix picks the first force-matched gt box of each
    # anchor (and 0 for anchors that have none).
    force_match_idx = tf.argmax(tf.to_int32(force_matches), axis=1)

    # In assigning foreground/background anchors later, force_match_indicator
    # is used to determine which anchors are force foreground, and the index
    # assigned will be taken from anchor_max_idx.

    # Force matchers must also be the max scoring gt bbox per anchor.
    # We overwrite anchor_max_idx to ensure that the right match is done.
    anchor_max_idx = tf.where(force_match_indicator, force_match_idx,
                              anchor_max_idx)

  # Ensure that max score boxes are not padded boxes by setting score to 0
  # for boxes that are padded.
  gathered_mask = tf.batch_gather(gt_bboxes_mask, anchor_max_idx)
  anchor_max_score = tf.where(
      tf.equal(gathered_mask, 1), anchor_max_score,
      tf.zeros_like(anchor_max_score))

  # Boolean tensors corresponding to whether an anchor is background or
  # foreground based on thresholding.
  background_anchors = tf.less_equal(anchor_max_score,
                                     background_assignment_threshold)
  foreground_anchors = tf.greater_equal(anchor_max_score,
                                        foreground_assignment_threshold)
  if force_match:
    # Background anchors are below threshold and not force matches.
    background_anchors &= ~force_match_indicator
    # Foreground anchors are above thresholds or force matches.
    foreground_anchors |= force_match_indicator

  # Add dummy background bbox to gt_boxes to facilitate batch gather.
  dummy_bbox = tf.constant([[0, 0, 0, 1, 1, 1, 0]], dtype=tf.float32)

  # Since we are concatenating the dummy bbox, the index corresponds to the
  # number of boxes.
  dummy_bbox_idx = py_utils.GetShape(gt_bboxes, 1)[0]

  gt_bboxes = tf.concat([gt_bboxes, dummy_bbox], axis=0)
  gt_bboxes_labels = tf.concat([gt_bboxes_labels, [background_class_id]],
                               axis=0)

  # Gather indices so that all foreground boxes are gathered from gt_bboxes,
  # while all background and ignore boxes gather the dummy_bbox.
  anchor_gather_idx = tf.where(
      foreground_anchors, anchor_max_idx,
      tf.constant(
          dummy_bbox_idx,
          shape=py_utils.GetShape(anchor_max_idx),
          dtype=anchor_max_idx.dtype))

  # Gather the bboxes and weights.
  assigned_gt_bbox = tf.batch_gather(gt_bboxes, anchor_gather_idx)
  assigned_gt_labels = tf.batch_gather(gt_bboxes_labels, anchor_gather_idx)

  # Set masks for classification and regression losses.
  assigned_cls_mask = tf.to_float(background_anchors | foreground_anchors)
  assigned_reg_mask = tf.to_float(foreground_anchors)

  return py_utils.NestedMap(
      assigned_gt_bbox=assigned_gt_bbox,
      assigned_gt_similarity_score=anchor_max_score,
      assigned_gt_labels=assigned_gt_labels,
      assigned_cls_mask=assigned_cls_mask,
      assigned_reg_mask=assigned_reg_mask)
def _Extract(self, features):
  """Decodes every laser's range images and attaches xyz to each pixel.

  Args:
    features: Mapping from feature key to a value accepted by `_Dense`. The
      keys read are 'pose', and for each laser '<laser>_extrinsics',
      '<laser>_<returns>' and '<laser>_<returns>_shape'. Lasers listed in
      `p.cbr_laser_names` additionally read '<laser>_beam_inclination_min'
      and '<laser>_beam_inclination_max'; lasers listed in
      `p.gbr_laser_names` additionally read '<laser>_beam_inclinations' and
      '<laser>_pose'.

  Returns:
    A NestedMap containing, per laser, '<laser>_extrinsics' (reshaped to
    [4, 4]) and '<laser>_beam_inclinations' (the stacked [min, max] pair for
    CBR lasers, a length-64 vector for GBR lasers), and per laser/returns
    pair a '<laser>_<returns>' NestedMap with entries 'xyz', 'features' and
    'mask'.
  """
  p = self.params
  lasers = p.cbr_laser_names + p.gbr_laser_names
  range_images = {}
  result = {}

  frame_pose = tf.reshape(_Dense(features['pose']), [4, 4])

  # Pass 1: decode the range images and the per-laser extrinsics.
  for laser in lasers:
    for returns in p.returns:
      image_key = '%s_%s' % (laser, returns)
      # The stored shape is flattened to a vector and used to un-flatten the
      # image, which is then checked against the configured static shape.
      stored_shape = tf.reshape(
          _Dense(features['%s_%s_shape' % (laser, returns)]), [-1])
      image = tf.reshape(_Dense(features[image_key]), stored_shape)
      if laser in p.cbr_laser_names:
        expected_shape = p.cbr_ri_shape
      else:
        expected_shape = p.gbr_ri_shape
      range_images[image_key] = py_utils.HasShape(image, expected_shape)

    extrinsics_key = '%s_extrinsics' % laser
    result[extrinsics_key] = tf.reshape(
        _Dense(features[extrinsics_key]), [4, 4])

  # Pass 2a: CBR lasers carry only a scalar min and max inclination.
  for laser in p.cbr_laser_names:
    lowest = tf.reshape(
        _Dense(features['%s_beam_inclination_min' % laser]), [])
    highest = tf.reshape(
        _Dense(features['%s_beam_inclination_max' % laser]), [])
    result['%s_beam_inclinations' % laser] = tf.stack([lowest, highest],
                                                      axis=0)

  # Pass 2b: GBR lasers carry an explicit vector of 64 inclinations.
  for laser in p.gbr_laser_names:
    inclinations_key = '%s_beam_inclinations' % laser
    result[inclinations_key] = tf.reshape(
        _Dense(features[inclinations_key]), [64])

  # Pass 3: compute xyz for every pixel of every range image.
  for laser in lasers:
    extrinsics = result['%s_extrinsics' % laser]
    inclinations = result['%s_beam_inclinations' % laser]

    if laser in p.cbr_laser_names:
      image_shape = p.cbr_ri_shape
      # Expand the [min, max] pair into one inclination per image row by
      # linear interpolation, sampling each row at its centre (the +0.5).
      # Only this local copy is expanded; the stored output keeps the pair.
      num_rows = image_shape[0]
      lowest = inclinations[0]
      highest = inclinations[1]
      span = highest - lowest
      row_fraction = (
          (.5 + tf.to_float(tf.range(0, num_rows))) / tf.to_float(num_rows))
      inclinations = (row_fraction * span) + lowest
    else:
      image_shape = p.gbr_ri_shape

    # Only GBR lasers provide a per-pixel pose; everything else passes None.
    if laser in p.gbr_laser_names:
      pixel_pose = tf.reshape(
          _Dense(features['%s_pose' % laser]),
          shape=p.gbr_ri_shape[0:2] + [4, 4])
    else:
      pixel_pose = None

    for returns in p.returns:
      image_key = '%s_%s' % (laser, returns)
      image = tf.reshape(range_images[image_key], image_shape)
      # A pixel counts as valid when channel 0 is non-negative.
      # NOTE(review): presumably channel 0 is the range and negative values
      # mark missing returns -- confirm against the data format.
      valid = image[..., 0] >= 0
      xyz = tf.to_float(
          self._XYZFromRangeImage(image, valid, extrinsics, inclinations,
                                  pixel_pose, frame_pose))
      result[image_key] = py_utils.NestedMap({
          'xyz': xyz,
          'features': image,
          'mask': tf.to_float(valid),
      })

  return py_utils.NestedMap(result)
def _InferenceSubgraph_Default(self):
  """Default inference subgraph.

  Returns:
    (fetches, feeds), with:

    - fetches: A dictionary of fetches, containing:

      - log_pplx_per_token: A matrix of shape [batch, time]. [i, j]
        is i-th input text's j-th token's log prob.
      - paddings: A matrix of shape [batch, time]. The padding mask.
      - lengths: A vector of shape [batch]. [i] is the number of non-padded
        positions in the i-th input text.
      - log_pplx_per_sample: A vector of shape [batch]. [i]
        is i-th input text's log prob.
      - num_oovs_per_sample: A vector of shape [batch] counting the total
        number of out-of-vocabulary tokens in each input.
      - tokens_from_labels: A vector of shape [batch] holding, for each
        input, its label ids mapped back to a string using the vocabulary.
      - ids: A matrix of shape [batch, time]. [i, j]
        is i-th input text's j-th token's id.

      For log_pplx_per_token, paddings and ids, `time` is the time dimension
      returned by `_TrimIfPossibleThenTranspose`.

    - feeds: A dictionary of feeds, containing:

      - text: A placeholder for a vector of strings.
  """
  text = tf.placeholder(tf.string, shape=[None])
  # [batch, time]
  ids, labels, paddings = self.input_generator.StringsToIds(text)
  # Computed from the batch-major paddings, before any trimming.
  lengths = tf.reduce_sum(tf.to_int32(1 - paddings), axis=1)
  tokens_from_labels = self.input_generator.IdsToStrings(labels, lengths)
  oovs = tf.equal(labels, self.input_generator.tokenizer.unk_id)
  # Padded positions are masked out so they never count as OOVs.
  num_oovs_per_sample = tf.to_int32(
      tf.reduce_sum(tf.to_float(oovs) * (1 - paddings), axis=1))
  # [time, batch]
  ids, paddings, labels, weights = self._TrimIfPossibleThenTranspose(
      ids, paddings, labels, 1.0 - paddings)
  batch_size = tf.shape(ids)[1]
  xent_output, _ = self.lm.FPropDefaultTheta(
      inputs=ids,
      paddings=paddings,
      state0=self.lm.zero_state(self.theta.lm, batch_size),
      labels=py_utils.NestedMap(class_ids=labels, class_weights=weights))

  per_example_xent = py_utils.HasShape(xent_output.per_example_xent,
                                       tf.shape(ids))
  # Sum over the time axis (axis 0 in the time-major layout) -> [batch].
  log_pplx_per_sample = tf.reduce_sum(
      per_example_xent * (1 - paddings), axis=0)
  fetches = {
      'log_pplx_per_token':  # [batch, time]
          tf.transpose(per_example_xent),
      'paddings':  # [batch, time]
          tf.transpose(paddings),
      'lengths':  # [batch]
          lengths,
      'log_pplx_per_sample':  # [batch]
          log_pplx_per_sample,
      'num_oovs_per_sample':  # [batch], int32
          num_oovs_per_sample,
      'tokens_from_labels':  # [batch], string
          tokens_from_labels,
      # `ids` is time-major at this point; transpose it back so it matches
      # the documented [batch, time] layout of the other per-token fetches.
      'ids':  # [batch, time], int32
          tf.transpose(ids),
  }
  feeds = {'text': text}
  return fetches, feeds
def ReadData():
  """Restores the data and label tensors from a checkpoint as float32.

  Reads `p` from the enclosing scope: `p.ckpt` is passed as the checkpoint
  to restore from, `p.data` / `p.label` as the tensor names, and
  `p.data_dtype` / `p.label_dtype` as the dtypes to restore them with.

  Returns:
    A (data, label) tuple, each cast to float32.
  """
  tensor_names = [p.data, p.label]
  # One empty slice spec per restored tensor.
  slice_specs = [''] * 2
  stored_dtypes = [p.data_dtype, p.label_dtype]
  data, label = io_ops.restore_v2(p.ckpt, tensor_names, slice_specs,
                                  stored_dtypes)
  # Always convert to float32, whatever dtype was stored.
  return tf.to_float(data), tf.to_float(label)