def testTransformerStackV2(self, use_v1_stack=False, stride=1, first_n=None):
  """Checks variable names, output shape and output sum of a 3-layer stack.

  Args:
    use_v1_stack: If True the stack is built with `TransformerStack`;
      otherwise `TransformerStackV2` is used with the two settings below.
    stride: Forwarded as `final_layer_stride` to `TransformerStackV2` and
      used to compute the expected output length.
    first_n: Forwarded as `final_layer_first_n` to `TransformerStackV2`.
      None is treated as the full sequence length when checking the output
      shape.
  """
  with self.session(use_gpu=False) as sess:
    bs = 2
    sl = 21
    d = 16
    num_layers = 3
    tf.random.set_seed(12345)
    atten_builder = self_attention.Builder.Params().Set(
        model_dim=d,
        num_heads=2,
        ff_hidden_dim=5,
        deterministic_dropout=False,
        num_splits=1,
        num_micro_batches=1)
    builder = atten_builder.Instantiate()
    if use_v1_stack:
      p = builder.TransformerStack('atten', num_layers=num_layers)
    else:
      p = builder.TransformerStackV2(
          'atten',
          num_layers=num_layers,
          final_layer_stride=stride,
          final_layer_first_n=first_n)
    p.params_init = py_utils.WeightInit.Xavier(scale=1.0, seed=0)
    l = p.Instantiate()

    # Every layer owns the same variables; only the `iter_XXX` scope differs,
    # so the expected flattened names are expanded layer by layer.
    per_layer_var_names = [
        'ff/feedforward/bias01/b/var',
        'ff/feedforward/bias02/b/var',
        'ff/feedforward/linear01/w/var',
        'ff/feedforward/linear02/w/var',
        'ff/feedforward/ln/bias/var',
        'ff/feedforward/ln/scale/var',
        'self_atten/LN/bias/var',
        'self_atten/LN/scale/var',
        'self_atten/atten/key/b/var',
        'self_atten/atten/key/w/var',
        'self_atten/atten/per_dim_scale/per_dim_scale/var',
        'self_atten/atten/post/b/var',
        'self_atten/atten/post/w/var',
        'self_atten/atten/query/b/var',
        'self_atten/atten/query/w/var',
        'self_atten/atten/value/b/var',
        'self_atten/atten/value/w/var',
    ]
    expected_var_names = [
        'atten/iter_{:03d}/block/{}'.format(layer_idx, var_name)
        for layer_idx in range(num_layers)
        for var_name in per_layer_var_names
    ]
    self.assertAllEqual(expected_var_names,
                        [var.op.name for var in tf.nest.flatten(l.vars)])

    # `np.float` was only an alias of the builtin `float` (which TensorFlow
    # resolves to float32) and no longer exists in NumPy >= 1.24, so the
    # dtype is spelled out explicitly.
    input_embs = tf.constant(
        np.random.random(size=[bs, sl, d]), dtype=np.float32)
    paddings = tf.zeros([bs, sl])
    segment_mask = tf.zeros([bs, 1, sl, sl])
    out = l.FPropDefaultTheta(
        py_utils.NestedMap(
            vec=input_embs, paddings=paddings, segment_mask=segment_mask))
    enc_out = out.vec
    if first_n is None:
      first_n = sl
    # Expected output length is ceil(first_n / stride).
    enc_out = py_utils.HasShape(enc_out,
                                [bs, (first_n + stride - 1) // stride, d])
    # Only test the value of the first token.
    enc_out = enc_out[:, :1, :]
    tf.logging.info('enc_out={}'.format(enc_out.shape))
    enc_out_sum = tf.reduce_sum(enc_out)
    tf.global_variables_initializer().run()
    actual_enc_out, actual_enc_out_sum = sess.run([enc_out, enc_out_sum])
    print('actual_enc_out_sum=', actual_enc_out_sum)
    self.assertAllEqual(actual_enc_out.shape, [bs, 1, d])
    self.assertAllClose(21.429626, actual_enc_out_sum, atol=1e-5)
def ComputePredictions(self, theta, input_batch):
  """Featurizes every cell and predicts box residuals and class logits.

  Args:
    theta: A `.NestedMap` of variable values; its `cell_featurizer`,
      `localization_regressor` and `classifier` entries are used.
    input_batch: A `.NestedMap` from which the following are read:
      - cell_feature: rank-4 tensor; its first three dimensions define
        [batch, num_centers, num_points_per_cell].
      - cell_points_xyz: [batch, num_centers, num_points_per_cell, 3].
      - cell_center_xyz: [batch, num_centers, 3].
      - cell_points_padding: [batch, num_centers, num_points_per_cell].

  Returns:
    A `.NestedMap` with:
    - residuals: [batch, num_centers, num_anchor_bboxes_per_center, 7].
    - classification_logits:
      [batch, num_centers, num_anchor_bboxes_per_center, num_classes].
  """
  params = self.params
  input_batch.Transform(lambda t: (t.shape, t.shape.num_elements())).VLog(
      1, 'input_batch shapes: ')

  # Validate the inputs against the shape implied by cell_feature.
  features = py_utils.HasRank(input_batch.cell_feature, 4)
  bs, ncenters, npoints = py_utils.GetShape(features, 3)
  points_xyz = py_utils.HasShape(input_batch.cell_points_xyz,
                                 [bs, ncenters, npoints, 3])
  centers_xyz = py_utils.HasShape(input_batch.cell_center_xyz,
                                  [bs, ncenters, 3])
  points_padding = py_utils.HasShape(input_batch.cell_points_padding,
                                     [bs, ncenters, npoints])

  # TODO(jngiam): Make concat_feature computation a layer or configurable.
  # Each point is described by its cell center, its offset from that center
  # and the raw per-point feature.
  centers_xyz = tf.reshape(centers_xyz, [bs, ncenters, 1, 3])
  offsets_xyz = points_xyz - centers_xyz
  tiled_centers = tf.tile(centers_xyz, [1, 1, npoints, 1])
  point_features = tf.concat([tiled_centers, offsets_xyz, features], axis=-1)

  # Featurize point clouds at each center.
  featurizer_input = py_utils.NestedMap(
      points=offsets_xyz, features=point_features, padding=points_padding)
  cell_embedding = self.cell_featurizer.FProp(theta.cell_featurizer,
                                              featurizer_input)
  cell_embedding = py_utils.HasShape(cell_embedding, [bs, ncenters, -1])

  # Predict localization residuals.
  residuals = self.localization_regressor.FProp(theta.localization_regressor,
                                                cell_embedding)
  residuals = tf.reshape(
      residuals, [bs, ncenters, params.num_anchor_bboxes_per_center, 7])
  if params.squash_rotation_predictions:
    # Squash the rotation component (last channel) into (-pi, pi).
    squashed_rotation = np.pi * tf.tanh(residuals[..., 6:])
    residuals = tf.concat([residuals[..., :6], squashed_rotation], axis=-1)

  # Predict object classification at each bbox.
  logits = self.classifier.FProp(theta.classifier, cell_embedding)
  logits = tf.reshape(logits, [
      bs, ncenters, params.num_anchor_bboxes_per_center, params.num_classes
  ])

  return py_utils.NestedMap(residuals=residuals, classification_logits=logits)
def Decode(self, input_batch):
  """Turns an input batch into per-class, NMS-filtered box predictions.

  Args:
    input_batch: The batch handed to `self._BBoxesAndLogits` and to the
      output decoder.

  Returns:
    A `.NestedMap` holding the per-class boxes, scores, valid mask and
    visualization weights, whatever `self.output_decoder.ProcessOutputs`
    adds, and `global_step`.
  """
  params = self.params

  boxes_and_logits = self._BBoxesAndLogits(input_batch)
  boxes = boxes_and_logits.predicted_bboxes
  batch_size, num_bboxes, _ = py_utils.GetShape(boxes, 3)
  logits = py_utils.HasShape(boxes_and_logits.classification_logits,
                             [batch_size, num_bboxes, params.num_classes])
  scores = tf.sigmoid(logits)

  with tf.device('/cpu:0'):
    # Decode the predicted bboxes, performing NMS.
    nms_bboxes, nms_scores, nms_valid_mask = detection_decoder.DecodeWithNMS(
        boxes,
        scores,
        nms_iou_threshold=params.nms_iou_threshold,
        score_threshold=params.nms_score_threshold,
        max_boxes_per_class=params.max_nms_boxes,
        use_oriented_per_class_nms=params.use_oriented_per_class_nms)

    # nms_valid_mask is a [batch, num_classes, num_boxes] Tensor marking the
    # boxes NMS selected. Examples keep different numbers of boxes, so the
    # mask lets the result stay a dense batched Tensor. Zeroing the scores of
    # unselected boxes keeps them from being interpreted as valid.
    nms_scores = nms_scores * nms_valid_mask

    vis_weights = py_utils.HasShape(
        nms_scores, [batch_size, params.num_classes, params.max_nms_boxes])

    # For top down visualization, drop boxes whose scores fall below the
    # visualization threshold.
    is_visible = tf.greater_equal(
        vis_weights, params.visualization_classification_threshold)
    vis_weights = tf.where(is_visible, vis_weights,
                           tf.zeros_like(vis_weights))

  model_outputs = py_utils.NestedMap(
      per_class_predicted_bboxes=nms_bboxes,
      per_class_predicted_bbox_scores=nms_scores,
      per_class_valid_mask=nms_valid_mask)

  decoder_outputs = py_utils.NestedMap(
      per_class_predicted_bboxes=nms_bboxes,
      per_class_predicted_bbox_scores=nms_scores,
      per_class_valid_mask=nms_valid_mask,
      visualization_weights=vis_weights)
  extra_outputs = self.output_decoder.ProcessOutputs(input_batch,
                                                     model_outputs)
  decoder_outputs.update(extra_outputs)

  # Produce global step as an output (which is the step
  # of the checkpoint being decoded.)
  decoder_outputs.global_step = py_utils.GetGlobalStep()
  return decoder_outputs
def MaxPool3D(points, point_features, pooling_idx, closest_idx):
  """Apply max pooling to a point cloud with computed sampling indices.

  pooling_idx and closest_idx are the outputs of a sampler such as
  FurthestPointSampler.

  The pooling operation results in a point cloud with fewer points, where the
  pooled points are specified by pooling_idx. Each element of pooling_idx
  contains an integer in the range [0, P1) containing the index of the point in
  points/point_features.

  Max pooling is performed by assigning each point to its closest pooled point,
  and then taking a max over the features of points assigned. We assume that
  this mapping is provided by closest_idx, where each element should contain
  an integer in the range [0, P2) containing the index of the pooled point that
  each point is assigned to.

  Note: This logic for pooling assumes that there will be at least one value
  > 0 per sampled region for each feature, otherwise it will return 0.
  Additionally, it does a reduce over a masked version of the features, so
  mean and min would not work without a change in the logic.

  Args:
    points: a floating point tf.Tensor with shape [N, P1, 3]
    point_features: a floating point tf.Tensor with shape [N, P1, C]
    pooling_idx: A tf.int32 tf.Tensor of shape [N, P2] with the index of which
      points we want to keep. Each value should be in the range [0, P1).
    closest_idx: A tf.int32 tf.Tensor of shape [N, P1] representing which
      sampled point is closest to each original point. Each value should be in
      the range of [0, P2).

  Returns:
    A tuple of tf.Tensors (pooled_points, pooled_features).

    pooled_points has shape [N, P2, 3] representing the locations of each
    selected point. P2 corresponds to num_pooled_points.

    pooled_features has shape [N, P2, C] representing the pooled features at
    each point.
  """
  batch_size, num_points = py_utils.GetShape(points, 2)
  point_features = py_utils.HasShape(point_features,
                                     [batch_size, num_points, -1])
  pooling_idx = py_utils.HasShape(pooling_idx, [batch_size, -1])
  # Validate closest_idx like the other inputs: a transposed or mis-sized
  # tensor with the same element count would otherwise pass through the
  # reshape below unnoticed.
  closest_idx = py_utils.HasShape(closest_idx, [batch_size, num_points])
  _, num_output_points = py_utils.GetShape(pooling_idx)

  # Gather new point locations.
  pooled_points = tf.batch_gather(points, pooling_idx)

  # The mask shares the feature dtype so the product below is well-typed for
  # any floating point input.
  mask = tf.one_hot(
      closest_idx, num_output_points, dtype=point_features.dtype)  # [N, P1, P2]
  mask = tf.transpose(mask, [2, 0, 1])  # [P2, N, P1]

  def _PartialPoolFeaturesFn(partial_mask):
    """Max-pools the features of the points assigned to one pooled point."""
    # [N, P1, 1]; broadcasting over the feature dimension avoids tiling the
    # mask up to [N, P1, C] for every pooled point.
    partial_mask = tf.reshape(partial_mask, [batch_size, num_points, 1])
    # Note: This method of pooling assumes there will be a value > 0
    # And will only work with max under this condition.
    return tf.reduce_max(partial_mask * point_features, axis=1)

  # Performing a map_fn over the pooled points is more memory efficient.
  pooled_point_features = tf.map_fn(_PartialPoolFeaturesFn, mask)  # [P2, N, C]
  pooled_point_features = tf.transpose(pooled_point_features, [1, 0, 2])
  return pooled_points, pooled_point_features
def ProcessOutputs(self, input_batch, model_outputs):
  """Builds the extra decoder outputs used for KITTI post-processing.

  Args:
    input_batch: A .NestedMap of the inputs to the model. The
      `decoder_copy.labels`, `decoder_copy.lasers`, `decoder_copy.images` and
      `labels.bboxes_3d_num_points` entries are read.
    model_outputs: A .NestedMap of the outputs of the model. Only
      `per_class_predicted_bboxes`, a rank-4 tensor reshaped here as
      [batch, num_classes, num_boxes, 7], is read.

  Returns:
    A NestedMap of additional decoder outputs needed for
    PostProcessDecodeOut.
  """
  params = self.params

  boxes_per_class = model_outputs.per_class_predicted_bboxes
  batch_size, num_classes, num_boxes, _ = py_utils.GetShape(boxes_per_class)
  total_boxes = num_classes * num_boxes

  decoder_copy = input_batch.decoder_copy
  labels = decoder_copy.labels
  lasers = decoder_copy.lasers
  images = decoder_copy.images

  with tf.device('/cpu:0'):
    # Convert the predicted bounding boxes to their corners and project them
    # onto the image plane. The result can be used to:
    #
    # A) Visualize bounding boxes (2d or 3d) on the camera image.
    #
    # B) Compute the height of the predicted boxes to filter 'too small' boxes
    #    as is done in the KITTI eval.
    flat_boxes = tf.reshape(boxes_per_class, [batch_size, total_boxes, 7])
    corners_3d = py_utils.HasShape(
        geometry.BBoxCorners(flat_boxes), [batch_size, total_boxes, 8, 3])
    utils_3d = detection_3d_lib.Utils3D()
    corners_image = py_utils.HasShape(
        utils_3d.CornersToImagePlane(corners_3d, images.velo_to_image_plane),
        [batch_size, total_boxes, 8, 2])

    # Clip the bounding box corners so they remain within
    # the image coordinates.
    clipped_box2d = py_utils.HasShape(
        self._BBox2DImage(corners_image, images),
        [batch_size, total_boxes, 4])

    # Compute the frustum mask to filter out bounding boxes that
    # are 'outside the frustum'.
    in_frustum = self._CreateFrustumMask(corners_image, clipped_box2d,
                                         images.height, images.width)

    # Reshape all of these back to [batch_size, num_classes, num_boxes, ...]
    per_class_dims = [batch_size, num_classes, num_boxes]
    corners_image = tf.reshape(corners_image, per_class_dims + [8, 2])
    clipped_box2d = tf.reshape(clipped_box2d, per_class_dims + [4])
    in_frustum = tf.reshape(in_frustum, per_class_dims)

  ret = py_utils.NestedMap(
      # For mAP eval
      source_ids=labels.source_id,
      difficulties=labels.difficulties,
      # NOTE(review): read from input_batch.labels rather than
      # decoder_copy.labels like the neighbouring fields; confirm intended.
      num_points_in_bboxes=input_batch.labels.bboxes_3d_num_points,
      # For exporting.
      velo_to_image_plane=images.velo_to_image_plane,
      velo_to_camera=images.velo_to_camera,
      # Predictions.
      bbox_corners_image=corners_image,
      bbox2d_corners_image=clipped_box2d,
      frustum_mask=in_frustum,
      # Ground truth.
      bboxes_3d=labels.bboxes_3d,
      bboxes_3d_mask=labels.bboxes_3d_mask,
      unfiltered_bboxes_3d_mask=labels.unfiltered_bboxes_3d_mask,
      labels=labels.labels)

  ret.update(
      self._SampleLaserForVisualization(lasers.points_xyz,
                                        lasers.points_padding))

  if params.summarize_boxes_on_image:
    ret.camera_images = images.image
  return ret
def FProp(self, theta, inputs, paddings, state0, labels=None):
  """Forward compute: gate the expert RNNs and compute the softmax loss.

  RNN 0 acts as the expert predictor; its activations produce the gating
  weights used to mix the activations of the remaining expert RNNs.

  Args:
    theta: A `.NestedMap` of variable values for this layer and its children.
    inputs: Rank-2 tensor of ids; its two dimensions are unpacked as
      [time, batch].
    paddings: Tensor with the same shape as `inputs`.
    state0: A non-empty `.NestedMap` with one entry per RNN in `rnns`, plus
      `merge` when `p.add_postgating_rnn` is set.
    labels: Passed to the merge layer when `p.add_postgating_rnn` is set;
      otherwise its `class_weights` and `class_ids` are fed to the output
      softmax, so it is required in that configuration.

  Returns:
    A tuple (xent_loss, state1), where state1 holds the updated RNN states
    (and `merge` state when `p.add_postgating_rnn` is set).

  Raises:
    ValueError: If `labels` is None while `p.add_postgating_rnn` is False.
  """
  p = self.params

  # The output softmax dereferences labels; fail with a clear message rather
  # than an AttributeError on None deep inside graph construction.
  if labels is None and not p.add_postgating_rnn:
    raise ValueError(
        'labels must be provided when add_postgating_rnn is False.')

  ids = py_utils.HasRank(inputs, 2)
  paddings = py_utils.HasShape(paddings, tf.shape(ids))
  seqlen, batch = tf.unstack(tf.shape(inputs), num=2)
  assert state0, 'state0 must be a non-empty NestedMap.'

  paddings_3d = tf.expand_dims(paddings, axis=2)

  # RNNs
  num_rnns = 1 + p.number_of_experts
  if p.shared_emb:
    # A single embedding lookup is shared by the predictor and all experts.
    emb_act = [self.emb.EmbLookup(theta.emb, inputs)] * num_rnns
  else:
    emb_act = [
        self.emb[i].EmbLookup(theta.emb[i], inputs) for i in range(num_rnns)
    ]

  state1 = py_utils.NestedMap(rnns=[])
  rnns_act = []
  for i, act in enumerate(emb_act):
    act, state = self.rnns[i].FProp(theta.rnns[i], act, paddings_3d,
                                    state0.rnns[i])
    rnns_act.append(py_utils.HasRank(act, 3))
    state1.rnns.append(state)

  # [time, batch, experts, dims].
  expert_stacked = tf.stack(rnns_act[1:], axis=2)

  # Compute gating softmax. The 0-th rnns is used as the expert
  # predictor. Because SoftmaxLayer.Logits takes a matrix as input,
  # we reshape rnns_act[0], the domain predictor activation, to a
  # matrix here.
  act = tf.reshape(rnns_act[0], [seqlen * batch, -1])
  logits = self.domain_predictor_softmax.Logits(
      theta.domain_predictor_softmax, act)
  # [time, batch, experts]
  gating = tf.reshape(tf.nn.softmax(logits), [seqlen, batch, -1])

  # Mix the experts.
  # [time, batch, dims]
  combined = tf.squeeze(
      tf.matmul(
          # [time, batch, 1, experts]
          tf.expand_dims(gating, axis=2),
          # [time, batch, experts, dims]
          expert_stacked),
      axis=2)

  if p.add_postgating_rnn:
    # Note that this layer includes 1 or more RNN layers followed
    # by a softmax.
    xent_loss, state1.merge = self.merge.FProp(theta.merge, combined,
                                               paddings, state0.merge, labels)
  else:
    xent_loss = self.output_softmax.FProp(
        theta=theta.output_softmax,
        inputs=combined,
        class_weights=labels.class_weights,
        class_ids=labels.class_ids)

  return xent_loss, state1
def NeighborhoodIndices(points,
                        query_points,
                        k,
                        points_padding=None,
                        max_distance=None,
                        sample_neighbors_uniformly=False):
  """Finds, for every query point, the indices of k neighbors in `points`.

  A padding tensor is returned together with the indices. Entries that are
  not padded refer to unique (non-repeated) real points. An entry is padded
  either because there are not enough real points (k exceeds the number of
  non-padded points) or because the neighbor lies beyond `max_distance`.

  Note: Padded entries may index padded points of the original input, or may
  repeat the closest point.

  TODO(weihan,jngiam): PointCNN implementation makes an assumption that padded
  points are repeated points from the original points. This behavior is
  maintained here, but we should update PointCNN to respect indices paddings.

  Args:
    points: tensor of shape [N, P1, dims].
    query_points: tensor of shape [N, P2, dims]
    k: Integer.
    points_padding: optional tensor of shape [N, P1] containing True/1.0 iff
      the point is a padded point. if None, then all points are considered
      real points.
    max_distance: float giving the largest allowed distance to a neighbor.
      Neighbors further away are replaced by the closest point and marked as
      padded. None disables distance filtering.
    sample_neighbors_uniformly: boolean; when set, neighbors are sampled
      uniformly among the points within `max_distance`.

  Returns:
    A pair of tensors:

    - indices: tensor of shape [N, P2, k].
    - padding: tensor of shape [N, P2, k] where 1 represents a padded point,
      and 0 represents an unpadded (real) point.

  Raises:
    ValueError: If `sample_neighbors_uniformly` is set without
      `max_distance`.
  """
  batch_size, num_points = py_utils.GetShape(points, 2)
  query_points = py_utils.HasShape(query_points, [batch_size, -1, -1])
  _, num_queries = py_utils.GetShape(query_points, 2)

  # Pair-wise *squared* distances (no sqrt is taken), so max_distance has to
  # be squared before it is compared against these values.
  sq_dists = SquaredDistanceMatrix(query_points, points)
  sq_dists = py_utils.HasShape(sq_dists, [batch_size, num_queries, num_points])

  # Push padded points away by adding a scalar larger than any real distance
  # to their column, so real points are always preferred by top_k.
  has_padding = points_padding is not None
  if has_padding:
    padding_row = tf.cast(tf.expand_dims(points_padding, 1), tf.float32)
    padding_row = py_utils.HasShape(padding_row, [batch_size, 1, num_points])
    large_scalar = tf.reduce_max(sq_dists) + 1
    sq_dists = sq_dists + padding_row * large_scalar

  # Uniform sampling without replacement: every point within the distance
  # threshold gets a random distance below the threshold, so top_k picks a
  # random subset of them.
  if sample_neighbors_uniformly:
    if max_distance is None:
      raise ValueError('Uniform sampling requires specifying max_distance.')
    within_range = tf.less_equal(sq_dists, max_distance**2)
    random_dists = (
        tf.square(max_distance) * tf.random_uniform(tf.shape(sq_dists)))
    sq_dists = tf.where(within_range, random_dists, sq_dists)

  # top_k selects the largest values, hence the negation. N x P2 x K
  neg_top_dists, indices = tf.nn.top_k(-sq_dists, k=k, sorted=True)

  # A selected neighbor is padded when its distance reaches large_scalar.
  if has_padding:
    paddings = tf.greater_equal(-neg_top_dists, large_scalar)
  else:
    paddings = tf.zeros_like(neg_top_dists, dtype=tf.bool)

  # Neighbors beyond max_distance are redirected to the closest point and
  # flagged as padded.
  if max_distance is not None:
    too_far = tf.greater(-neg_top_dists, tf.square(max_distance))
    nearest = tf.tile(indices[:, :, :1], [1, 1, k])
    indices = tf.where(too_far, nearest, indices)
    paddings = paddings | too_far

  indices = tf.reshape(indices, [batch_size, num_queries, k])
  return indices, tf.cast(paddings, tf.float32)
def FProp(self, theta, input_batch):
  # pyformat: disable
  """Compute features for the pillars and convert them back to a dense grid.

  Args:
    theta: A `.NestedMap` object containing variable values of this task.
    input_batch: A `.NestedMap` object containing input tensors. Following
      keys are required:

      - grid_num_points: Integer tensor with shape [batch size, nx, ny, nz],
        where nx, ny, nz corresponds to the grid sizes (i.e., number of voxels
        in each axis dimension).
      - pillar_points: Float tensor with shape [batch size, num_pillars,
        num_points_per_pillar, 3 + num_laser_features]
      - pillar_centers: Float tensor with shape
        [batch size, num_pillars, 1, 3]; it is tiled across the points of
        each pillar here.
      - pillar_locations: Float tensor with shape [batch size, num_pillars, 3]

  Returns:
    The dense features with shape [b, nx, ny, nz * fdims].
  """
  # pyformat: enable
  p = self.params
  bs, nx, ny, nz = py_utils.GetShape(input_batch.grid_num_points, 4)

  # Process points to concatenate a set of fixed features (e.g.,
  # add means, centers, normalize points to means).
  num_features = 3 + p.num_laser_features
  pillar_points = py_utils.HasShape(input_batch.pillar_points,
                                    [bs, -1, -1, num_features])
  _, npillars, npoints, _ = py_utils.GetShape(pillar_points, 4)
  pillar_xyz = pillar_points[..., :3]

  # Validate the locations before their first use in the gather below.
  pillar_locations = py_utils.HasShape(input_batch.pillar_locations,
                                       [bs, npillars, 3])

  # Compute number of points per pillar and prepare for broadcasting.
  pillar_num_points = tf.gather_nd(
      input_batch.grid_num_points, pillar_locations, batch_dims=1)
  pillar_num_points = pillar_num_points[..., tf.newaxis, tf.newaxis]

  # Compute mean by computing sum and dividing by number of points. Clip the
  # denominator by 1.0 to gracefully handle empty pillars.
  # `keepdims` is the supported keyword; the TF2 API rejects the deprecated
  # `keep_dims` spelling.
  pillar_sum = tf.reduce_sum(pillar_xyz, axis=2, keepdims=True)
  pillar_means = pillar_sum / tf.maximum(
      tf.cast(pillar_num_points, tf.float32), 1.0)

  pillar_feats = pillar_points[..., 3:]
  pillar_centers = py_utils.HasShape(input_batch.pillar_centers,
                                     [bs, -1, 1, 3])
  # Per point: offset from the pillar mean, laser features, pillar mean and
  # pillar center.
  pillar_concat = tf.concat(
      axis=3,
      values=[
          pillar_xyz - pillar_means,
          pillar_feats,
          tf.tile(pillar_means, [1, 1, npoints, 1]),
          tf.tile(pillar_centers, [1, 1, npoints, 1])
      ])

  # Featurize pillars.
  pillar_features = self.featurizer.FProp(theta.featurizer, pillar_concat)

  # Convert back to the dense grid.
  dense_features = SparseToDense(
      grid_shape=(nx, ny, nz),
      locations=pillar_locations,
      feats=pillar_features)
  return dense_features
def Decode(self, input_batch):
  """Runs the model on `input_batch` and returns NMS-filtered predictions.

  Args:
    input_batch: The batch passed to `ComputePredictions`, `ComputeLoss`,
      `_BBoxesAndLogits` and the output decoder.

  Returns:
    A `.NestedMap` with the per-class boxes, scores, valid mask and
    visualization weights; the gathered residual/label/anchor tensors when
    `p.decode_include_residuals` is set; the outputs of
    `self.output_decoder.ProcessOutputs`; and `global_step`.
  """
  params = self.params

  predictions = self.ComputePredictions(self.theta, input_batch)
  boxes_and_logits = self._BBoxesAndLogits(input_batch, predictions)
  boxes = boxes_and_logits.predicted_bboxes
  batch_size, num_bboxes, _ = py_utils.GetShape(boxes, 3)
  logits = py_utils.HasShape(boxes_and_logits.classification_logits,
                             [batch_size, num_bboxes, params.num_classes])
  scores = tf.sigmoid(logits)

  # The loss may supply a per-example scaler for the classification scores.
  _, per_example = self.ComputeLoss(self.theta, predictions, input_batch)
  if 'score_scaler' in per_example:
    scores = scores * per_example['score_scaler']

  with tf.device('/cpu:0'):
    # Decode the predicted bboxes, performing NMS.
    nms_idxs, nms_bboxes, nms_scores, nms_valid_mask = (
        detection_decoder.DecodeWithNMS(
            boxes,
            scores,
            nms_iou_threshold=params.nms_iou_threshold,
            score_threshold=params.nms_score_threshold,
            max_boxes_per_class=params.max_nms_boxes,
            use_oriented_per_class_nms=params.use_oriented_per_class_nms))

    # nms_valid_mask is a [batch, num_classes, num_boxes] Tensor marking the
    # boxes NMS selected. Examples keep different numbers of boxes, so the
    # mask lets the result stay a dense batched Tensor. Zeroing the scores of
    # unselected boxes keeps them from being interpreted as valid.
    nms_scores = nms_scores * nms_valid_mask

    vis_weights = py_utils.HasShape(
        nms_scores, [batch_size, params.num_classes, params.max_nms_boxes])

    # For top down visualization, drop boxes whose scores fall below the
    # visualization threshold.
    is_visible = tf.greater_equal(
        vis_weights, params.visualization_classification_threshold)
    vis_weights = tf.where(is_visible, vis_weights,
                           tf.zeros_like(vis_weights))

  model_outputs = py_utils.NestedMap(
      per_class_predicted_bboxes=nms_bboxes,
      per_class_predicted_bbox_scores=nms_scores,
      per_class_valid_mask=nms_valid_mask)

  decoder_outputs = py_utils.NestedMap(
      per_class_predicted_bboxes=nms_bboxes,
      per_class_predicted_bbox_scores=nms_scores,
      per_class_valid_mask=nms_valid_mask,
      visualization_weights=vis_weights)

  if params.decode_include_residuals:
    # Exporting the residuals makes it possible to save the outputs for
    # further analysis. The tensors are brought into the per-class NMS output
    # layout of [batch, num_classes, ...].

    def _GatherNmsSelected(tensor):
      """Flattens `tensor` per example and gathers the NMS-selected rows."""
      flat = tf.reshape(tensor, [batch_size, num_bboxes, -1])
      selected = tf.gather(flat, nms_idxs, batch_dims=1)
      if params.use_oriented_per_class_nms:
        return selected
      # Without oriented NMS the indices carry no num_classes dimension, so
      # tile the result to the expected [batch_size, num_classes, ...] shape.
      return tf.tile(selected[:, tf.newaxis, :, :],
                     [1, params.num_classes, 1, 1])

    decoder_outputs.update(
        per_class_gt_residuals=_GatherNmsSelected(
            input_batch.anchor_localization_residuals),
        per_class_gt_labels=_GatherNmsSelected(input_batch.assigned_gt_labels),
        per_class_residuals=_GatherNmsSelected(predictions.residuals),
        per_class_logits=_GatherNmsSelected(predictions.classification_logits),
        per_class_anchor_boxes=_GatherNmsSelected(input_batch.anchor_bboxes))

  extra_outputs = self.output_decoder.ProcessOutputs(input_batch,
                                                     model_outputs)
  decoder_outputs.update(extra_outputs)

  # Produce global step as an output (which is the step
  # of the checkpoint being decoded.)
  decoder_outputs.global_step = py_utils.GetGlobalStep()
  return decoder_outputs
def ComputePredictions(self, theta, input_batch):
  """Runs featurizer, backbone and detection heads on `input_batch`.

  Args:
    theta: A `.NestedMap` object containing variable values of this task.
    input_batch: A `.NestedMap` object containing input tensors to this tower.
      `anchor_localization_residuals` is additionally read when any of the
      `oracle_*` params is set.

  Returns:
    A `.NestedMap` with:
    - residuals: [b, nx, ny, grid_size_z, num_anchors, 7].
    - classification_logits:
      [b, nx, ny, grid_size_z, num_anchors, num_classes].
    - predicted_dir: [b, nx, ny, grid_size_z, num_anchors, 2]; only present
      when `p.direction_classifier_weight > 0`.
  """
  params = self.params
  input_batch.Transform(lambda t: (t.shape, t.shape.num_elements())).VLog(
      0, 'input_batch shapes: ')

  # Make pillars representation from input_batch.
  grid_features = self.input_featurizer.FProp(theta.input_featurizer,
                                              input_batch)

  # Backbone
  tf.logging.vlog(1, 'dense_features.shape = %s', grid_features.shape)
  backbone_out = self.backbone.FProp(theta.backbone, grid_features)
  tf.logging.vlog(1, 'act.shape = %s', backbone_out.shape)

  # Two separate heads turn the backbone output into class logits and
  # regression residuals.
  class_head_out = self.class_detector.FProp(theta.class_detector,
                                             backbone_out)
  reg_head_out = self.regression_detector.FProp(theta.regression_detector,
                                                backbone_out)
  bs, nx, ny, _ = py_utils.GetShape(class_head_out, 4)
  anchor_dims = [bs, nx, ny, params.grid_size_z, params.num_anchors]
  logits = tf.reshape(class_head_out, anchor_dims + [params.num_classes])
  residuals = tf.reshape(reg_head_out, anchor_dims + [7])

  if params.squash_rotation_predictions:
    # Squash the rotation component (last channel) into (-pi, pi).
    squashed_rotation = np.pi * tf.tanh(residuals[..., 6:])
    residuals = tf.concat([residuals[..., :6], squashed_rotation], axis=-1)

  oracle_flags = (params.oracle_location, params.oracle_dimension,
                  params.oracle_rotation)
  if any(oracle_flags):
    gt_residuals = py_utils.HasShape(
        input_batch.anchor_localization_residuals, anchor_dims + [7])
    # Replace the predicted components with the ground truth where requested.
    location_src = gt_residuals if params.oracle_location else residuals
    location = location_src[..., 0:3]
    dimension_src = gt_residuals if params.oracle_dimension else residuals
    dimension = dimension_src[..., 3:6]
    rotation_src = gt_residuals if params.oracle_rotation else residuals
    rotation = rotation_src[..., 6:]
    residuals = tf.concat([location, dimension, rotation], axis=-1)

  ret = py_utils.NestedMap(residuals=residuals, classification_logits=logits)

  if params.direction_classifier_weight > 0.0:
    direction_out = self.direction_classifier.FProp(
        theta.direction_classifier, backbone_out)
    ret.predicted_dir = tf.reshape(direction_out, anchor_dims + [2])

  return ret
def ComputeLoss(self, theta, predictions, input_batch):
  """Computes loss and other metrics for the given predictions.

  Args:
    theta: A `.NestedMap` object containing variable values of this task.
    predictions: The output of `ComputePredictions`. This method reads:

      - residuals: [b, nx, ny, nz, na, 7], where na is the number of anchor
        boxes per cell and the last axis is (dx, dy, dz, dw, dl, dh, dt).
      - classification_logits: [b, nx, ny, nz, na, num_classes].
      - predicted_dir: only read when p.direction_classifier_weight > 0.
    input_batch: The input batch from which we accesses the groundtruth.

  Returns:
    Two dicts defined as BaseTask.ComputeLoss: a metrics dict mapping metric
    names to (value, weight) pairs, and a per-example dict holding the
    predicted residuals and classification logits.
  """
  p = self.params
  predicted_residuals = py_utils.HasShape(
      predictions.residuals, [-1, -1, -1, -1, p.num_anchors, 7])
  predicted_class_logits = py_utils.HasShape(
      predictions.classification_logits,
      [-1, -1, -1, -1, p.num_anchors, p.num_classes])
  bs, nx, ny, nz, na, _ = py_utils.GetShape(predicted_class_logits, 6)

  # Compute class and regression weights.
  class_weights = py_utils.HasShape(input_batch.assigned_cls_mask,
                                    [bs, nx, ny, nz, na])
  reg_mask = py_utils.HasShape(input_batch.assigned_reg_mask,
                               [bs, nx, ny, nz, na])
  # Regression weights carry a trailing unit axis so they broadcast over the
  # residual components.
  reg_weights = tf.expand_dims(reg_mask, -1)

  if p.loss_norm_type == LossNormType.NORM_BY_NUM_POSITIVES:
    # Sum to get the number of foreground anchors for each example; clamp to
    # at least one so examples without positives do not divide by zero.
    loss_normalization = tf.reduce_sum(reg_mask, axis=[1, 2, 3, 4])
    loss_normalization = tf.maximum(loss_normalization,
                                    tf.ones_like(loss_normalization))
    # The normalizer must have the same rank as the tensor it divides.
    # class_weights is rank 5 and reg_weights is rank 6; dividing the rank-5
    # tensor by a rank-6 [bs, 1, 1, 1, 1, 1] normalizer would broadcast to
    # [bs, bs, nx, ny, nz, na] and mix normalizers across examples.
    class_weights /= tf.reshape(loss_normalization, [bs, 1, 1, 1, 1])
    reg_weights /= tf.reshape(loss_normalization, [bs, 1, 1, 1, 1, 1])

  # Classification loss.
  assigned_gt_labels = py_utils.HasShape(input_batch.assigned_gt_labels,
                                         [bs, nx, ny, nz, na])
  class_loss = py_utils.SigmoidCrossEntropyFocalLoss(
      logits=predicted_class_logits,
      labels=tf.one_hot(assigned_gt_labels, p.num_classes),
      alpha=p.focal_loss_alpha,
      gamma=p.focal_loss_gamma)
  class_loss *= class_weights[..., tf.newaxis]
  class_loss_sum = tf.reduce_sum(class_loss)

  # Regression loss.
  anchor_localization_residuals = py_utils.HasShape(
      input_batch.anchor_localization_residuals, [bs, nx, ny, nz, na, 7])

  # Location and dimensions loss.
  reg_loc_and_dims_loss = self._utils.ScaledHuberLoss(
      predictions=py_utils.HasShape(predicted_residuals[..., :6],
                                    [bs, nx, ny, nz, na, 6]),
      labels=anchor_localization_residuals[..., :6],
      delta=1 / (3.**2))

  # Rotation loss. Uses the shape-checked ground truth, consistent with the
  # location/dimension loss above.
  rot_delta = (
      predicted_residuals[..., 6:] - anchor_localization_residuals[..., 6:])
  if p.use_atan2_heading_loss:
    # atan2(sin, cos) wraps the angular difference before the Huber loss.
    atan2_of_delta = tf.atan2(tf.sin(rot_delta), tf.cos(rot_delta))
    reg_rot_loss = self._utils.ScaledHuberLoss(
        predictions=atan2_of_delta,
        labels=tf.zeros_like(atan2_of_delta),
        delta=1 / (3.**2))
  else:
    # Rotation loss with SmoothL1(sin(delta)).
    reg_rot_loss = self._utils.ScaledHuberLoss(
        predictions=tf.sin(rot_delta),
        labels=tf.zeros_like(rot_delta),
        delta=1 / (3.**2))

  # Direction loss.
  if p.direction_classifier_weight > 0.0:
    # The target rotations are in the assigned_gt_bbox tensor,
    # which already has assigned a gt bounding box to every anchor.
    rot_target = input_batch.assigned_gt_bbox[..., 6]
    # If rotation is > 0, the class is 1, else it is 0.
    rot_dir = tf.cast(rot_target > 0., tf.int32)
    # Compute one-hot labels as a target.
    rot_dir_onehot = tf.one_hot(rot_dir, 2)
    # Manually handle loss reduction.
    dir_loss = tf.losses.softmax_cross_entropy(
        onehot_labels=rot_dir_onehot,
        logits=predictions.predicted_dir,
        weights=tf.squeeze(reg_weights, axis=-1),
        reduction=tf.losses.Reduction.NONE)
    # Reduce across all dimensions (we'll divide by the batch size below).
    dir_loss_sum = tf.reduce_sum(dir_loss)
  else:
    dir_loss_sum = 0.0

  # Compute loss contribution from location and dimension separately.
  reg_loc_loss = reg_loc_and_dims_loss[..., :3] * reg_weights
  reg_loc_loss_sum = tf.reduce_sum(reg_loc_loss)

  reg_dim_loss = reg_loc_and_dims_loss[..., 3:6] * reg_weights
  reg_dim_loss_sum = tf.reduce_sum(reg_dim_loss)

  # Compute rotation loss contribution.
  reg_rot_loss *= reg_weights
  reg_rot_loss_sum = tf.reduce_sum(reg_rot_loss)

  # Num. predictions.
  # TODO(zhifengc): Consider other normalization factors. E.g., # of bboxes.
  preds = tf.cast(bs, class_loss_sum.dtype)

  # Normalize all of the components by batch size.
  reg_loc_loss = reg_loc_loss_sum / preds
  reg_dim_loss = reg_dim_loss_sum / preds
  reg_rot_loss = reg_rot_loss_sum / preds
  class_loss = class_loss_sum / preds
  dir_loss = dir_loss_sum / preds

  # Compute total localization regression loss.
  reg_loss = (
      p.location_loss_weight * reg_loc_loss +
      p.dimension_loss_weight * reg_dim_loss +
      p.rotation_loss_weight * reg_rot_loss)

  # Apply weights to normalized class losses.
  loss = (
      class_loss * p.classification_loss_weight +
      reg_loss * p.localization_loss_weight +
      dir_loss * p.direction_classifier_weight)

  metrics_dict = {
      'loss': (loss, preds),
      'loss/class': (class_loss, preds),
      'loss/reg': (reg_loss, preds),
      'loss/reg/rot': (reg_rot_loss, preds),
      'loss/reg/loc': (reg_loc_loss, preds),
      'loss/reg/dim': (reg_dim_loss, preds),
      'loss/dir': (dir_loss, preds),
  }

  # Calculate dimension errors.
  min_angle_rad = -np.pi if p.use_atan2_heading_loss else 0
  gt_bboxes = self._utils_3d.ResidualsToBBoxes(
      input_batch.anchor_bboxes,
      anchor_localization_residuals,
      min_angle_rad=min_angle_rad,
      max_angle_rad=np.pi)
  predicted_bboxes = self._utils_3d.ResidualsToBBoxes(
      input_batch.anchor_bboxes,
      predicted_residuals,
      min_angle_rad=min_angle_rad,
      max_angle_rad=np.pi)
  dimension_errors_dict = self._BBoxDimensionErrors(gt_bboxes,
                                                    predicted_bboxes,
                                                    reg_weights)
  metrics_dict.update(dimension_errors_dict)

  per_example_dict = {
      'residuals': predicted_residuals,
      'classification_logits': predicted_class_logits,
  }

  return metrics_dict, per_example_dict
def _FProp(self, theta, source_encs, source_paddings, targets, src_segment_id):
  """Decodes `targets` given encoded source.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    source_encs: source encoding. When `p.is_transparent` is False, it is a
      tensor of shape [time, batch, depth]. When `p.is_transparent` is True,
      it is a tensor of shape [time, batch, depth, num_trans_layers] if
      `p.is_eval` is True, and a list of `num_trans_layers` tensors of shape
      [time, batch, depth] if `p.is_eval` is False. A list passed in is not
      modified.
    source_paddings: source encoding's padding, of shape [time, batch].
    targets: A dict of string to tensors representing the targets one try to
      predict. Each tensor in targets is of shape [batch, time].
    src_segment_id: source segment id, of shape [time, batch].

  Returns:
    Output of last decoder layer, [target_time, target_batch, source_dim].
  """
  p = self.params
  time, batch = py_utils.GetShape(source_paddings, 2)

  # Normalize source_encs into a list holding one encoding per decoder layer.
  if p.is_transparent:
    if p.is_eval:
      source_encs = py_utils.HasShape(
          source_encs, [time, batch, p.source_dim, p.num_trans_layers])
      source_encs = tf.unstack(source_encs, axis=3)
    else:
      assert isinstance(source_encs, list)
      assert len(source_encs) == p.num_trans_layers
      # Build a new list rather than overwriting elements of the caller's
      # list in place.
      source_encs = [
          py_utils.HasShape(enc, [time, batch, p.source_dim])
          for enc in source_encs
      ]
  else:
    source_encs = py_utils.HasShape(source_encs,
                                    [time, batch, p.source_dim])
    source_encs = [source_encs] * p.num_trans_layers

  with tf.name_scope(p.name):
    # [batch, time]
    target_ids = targets.ids
    # [time, batch]
    target_paddings = tf.transpose(targets.paddings)
    target_segment_pos = None
    target_segment_id = None
    if p.packed_input:
      target_segment_id = tf.transpose(targets.segment_ids)
      target_segment_pos = targets.segment_pos
      assert src_segment_id is not None, ('Need to provide src_segment_id '
                                          'for packed input.')

    # Embedding layer
    # [batch, time, model_dim]
    token_embs = self.token_emb.EmbLookup(theta.token_emb, target_ids)
    target_time = py_utils.GetShape(target_ids)[1]
    if p.packed_input:
      posit_embs = self.position_emb.FPropWithPosition(
          theta.position_emb, target_segment_pos)
    else:
      # [1, time, model_dim]
      posit_embs = tf.expand_dims(
          self.position_emb.FProp(theta.position_emb, target_time), 0)

    # Still batch-major here; transposed to [time, batch, model_dim] below.
    input_embs = token_embs + posit_embs

    if p.model_dim != p.token_emb.embedding_dim:
      input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

    # [time, batch, model_dim]
    input_embs = tf.transpose(input_embs, [1, 0, 2])
    input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

    atten_probs = []
    layer_in = input_embs
    for i, (layer, layer_theta) in enumerate(zip(self.trans, theta.trans)):
      # [time, batch, model_dim]
      layer_out, probs = layer.FProp(
          layer_theta,
          layer_in,
          target_paddings,
          source_encs[i],
          source_paddings,
          source_segment_id=target_segment_id,
          aux_segment_id=src_segment_id)
      layer_in = layer_out
      atten_probs.append(probs)

    self._AddAttenProbsSummary(source_paddings, targets, atten_probs)

    return layer_out
def ComputePredictions(self, theta, source_encs, source_paddings, targets,
                       src_segment_id):
  """Runs the attention RNN stack over `targets` given the encoded source.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    source_encs: source encoding, of shape [time, batch, depth].
    source_paddings: source encoding's padding, of shape [time, batch].
    targets: A dict of string to tensors representing the targets one try to
      predict. Each tensor in targets is of shape [batch, time].
    src_segment_id: source segment id, of shape [time, batch].

  Returns:
    A Tensor with shape [time, batch, params.softmax.input_dim].
  """
  p = self.params
  time, batch = py_utils.GetShape(source_paddings, 2)
  source_encs = py_utils.HasShape(source_encs, [time, batch, p.source_dim])

  with tf.name_scope(p.name):
    # Targets arrive as [batch, time]; transpose them, and give paddings and
    # segment ids a trailing unit dimension.
    target_ids = tf.transpose(targets.ids)
    checked_paddings = py_utils.HasRank(targets.paddings, 2)
    target_paddings = tf.expand_dims(tf.transpose(checked_paddings), 2)
    if p.packed_input:
      target_segment_id = tf.expand_dims(
          tf.transpose(targets.segment_ids), 2)
    else:
      target_segment_id = tf.zeros_like(target_paddings)

    emb_device = (
        self.cluster.WorkerDeviceInModelSplit(0) if py_utils.use_tpu() else '')
    # NOTE(review): the extent of this device scope was reconstructed from
    # whitespace-collapsed source; it is taken to cover only the embedding
    # ops -- confirm against the original file.
    with tf.device(emb_device):
      inputs = self.emb.EmbLookup(theta.emb, target_ids)
      inputs = self.ApplyClipping(theta, inputs)
      summary_utils.histogram('input_emb', inputs)
      inputs = self.ApplyDropout(inputs)
    self._emb_out = inputs

    # Layer 0 interwines with attention.
    (atten_ctxs, xs, atten_probs, _) = self.frnn_with_atten.FProp(
        theta.frnn_with_atten,
        source_encs,
        source_paddings,
        inputs,
        target_paddings,
        src_segment_id=src_segment_id,
        segment_id=target_segment_id)
    self._AddAttenProbsSummary(source_paddings, targets, [atten_probs])

    atten_ctxs = self.ApplyClipping(theta, atten_ctxs)
    summary_utils.histogram('atten_ctxs', atten_ctxs)

    # `depth` is the 1-based index of each remaining layer, since layer 0 was
    # handled above together with attention.
    for depth, (layer, layer_theta) in enumerate(
        zip(self.frnn, theta.frnn), start=1):
      ys, _ = layer.FProp(
          layer_theta,
          tf.concat([xs, atten_ctxs], 2),
          target_paddings,
          segment_id=target_segment_id)
      ys = self.ApplyDropout(ys)
      if depth >= p.residual_start:
        # Residual skip connection followed by clipping.
        xs = self.ApplyClipping(theta, xs + ys)
      else:
        xs = ys
      summary_utils.histogram('layer_out_%s' % (depth - 1), xs)

    if p.feed_attention_context_vec_to_softmax:
      xs = tf.concat([xs, atten_ctxs], 2)

    return xs
def FProp(self, theta, *args):
  """Runs p.repeat copies of self.body.FProp independently.

  Args:
    theta: Layer model parameters. The shape of each variable in theta is
      always [p.repeat, ...]. And the i-th slice theta[i] becomes theta of the
      i-th copy of self.body.
    *args: Input arguments. The shape of each tensor in args is always
      [p.repeat, ....]. And the list [arg[i] for arg in args] becomes inputs
      to the i-th copy of self.body.FProp. Entries may be None.

  Returns:
    The accumulated output_tensors. Each tensor t in the return has the shape
    [p.repeat, ....] and the tuple (t[i] for i in output_tensors) is the
    return tuple of the i-th self.body.FProp. When exactly one input argument
    is given, the first output tensor is returned instead of a tuple.
  """
  p = self.params

  # Keep the shape-checked tensors. Assigning the HasShape result to a loop
  # variable would discard it, so the leading-dimension check would never be
  # attached to the tensors that are actually used below.
  args = tuple(
      py_utils.HasShape(arg, [p.repeat], ndims=1) if arg is not None else None
      for arg in args)

  theta_stack = _MaybeStackExtraTheta(theta.body, self.body.vars, p.repeat)
  inputs = py_utils.NestedMap(theta=theta_stack, args=list(args))

  # Infer out_shapes from FPropMeta.
  out_shapes = self._InferOutShapes(args)

  def _CellFn(unused_theta, unused_state0, inputs):
    """Recurrent cell function wrapper of body.FProp."""
    # Sets shapes for both theta and inputs to self.body.FProp: each per-step
    # slice has the stacked tensor's shape minus its leading dimension.
    for dst, src in zip(inputs.args + inputs.theta.Flatten(),
                        list(args) + theta_stack.Flatten()):
      if src is not None:
        dst.set_shape(tf.TensorShape(src.shape.as_list()[1:]))

    # Runs the actual body.FProp
    fprop_outputs = self.body.FProp(inputs.theta, *inputs.args)
    fprop_outputs = _ToTuple(fprop_outputs)
    assert len(fprop_outputs) == len(out_shapes)

    # Passes fprop outputs to the next layer through state.
    state1 = py_utils.NestedMap(outputs=list(fprop_outputs))
    return state1, py_utils.NestedMap()

  with tf.name_scope(p.name):
    # Initiate state0 with inferred output shapes.
    state0 = py_utils.NestedMap(
        outputs=[tf.zeros(shape, args[0].dtype) for shape in out_shapes])

    # Runs body.FProp p.repeat times using Recurrent.
    acc_states, _ = recurrent.Recurrent(
        theta=py_utils.NestedMap(),
        state0=state0,
        inputs=inputs,
        cell_fn=_CellFn)

    # Retrieves fprop outputs from state1 and sets shapes.
    output_tensors = tuple(acc_states.outputs)
    for out_tensor, out_shape in zip(output_tensors, out_shapes):
      out_tensor.set_shape(tf.TensorShape([p.repeat] + out_shape.as_list()))

  return output_tensors[0] if len(args) == 1 else tuple(output_tensors)
def FProp(self, theta, inputs, paddings, state0=None, segment_id=None):
  """Runs the LSTM over a sequence with the input projection hoisted out.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: A single tensor or a tuple of tensors with cardinality equal to
      rnn_cell.inputs_arity. For every input tensor, the first dimension is
      assumed to be time, second dimension batch, and third dimension depth.
    paddings: A tensor. First dim is time, second dim is batch, and third dim
      is expected to be 1.
    state0: If not None, the initial rnn state in a `.NestedMap`. Defaults to
      the cell's zero-state.
    segment_id: A tensor to support packed inputs. First dim is time, second
      dim is batch, and third dim is expected to be 1.

  Returns:
    A tuple (act, final_state): `act` is the cell output extracted from the
    accumulated states, described upstream as a tensor of
    [time, batch, dims], and `final_state` is the final recurrent state.
  """
  p = self.params
  cell = self.cell
  assert isinstance(cell, (rnn_cell.RNNCell))

  if not isinstance(inputs, (list, tuple)):
    inputs = [inputs]

  # Slicing wm to wm_{i,h} outside the loop to get 20% speedup over regular
  # LSTM baseline.
  # Keeping slicing within the loop gives only < 3% speedup.
  split_theta = theta.cell.copy()
  n_in = p.cell.num_input_nodes
  full_wm = split_theta.wm
  split_theta['wm_i'] = full_wm[:n_in, :]
  split_theta['wm_h'] = full_wm[n_in:, :]
  tf.logging.vlog(1, 'cell_theta: %r', split_theta)

  if not p.packed_input:
    reset_mask = tf.zeros_like(paddings)
  else:
    assert segment_id is not None
    reset_mask = rnn_layers.GeneratePackedInputResetMask(
        segment_id, is_reverse=False)
    reset_mask = py_utils.HasShape(reset_mask, tf.shape(paddings))

  if not state0:
    # No initial state supplied: start from the cell's zero state.
    state0 = cell.zero_state(split_theta, py_utils.GetShape(paddings)[1])

  # [T, B, H]
  projected = cell.ProjectInputSequence(split_theta,
                                        py_utils.NestedMap(act=inputs))
  step_inputs = py_utils.NestedMap(
      proj_inputs=projected, padding=paddings, reset_mask=reset_mask)

  acc_state, final_state = recurrent.Recurrent(
      theta=split_theta,
      state0=state0,
      inputs=step_inputs,
      cell_fn=cell.FPropWithProjectedInput,
      cell_type=cell.layer_type,
      accumulator_layer=self,
      allow_implicit_capture=p.allow_implicit_capture)

  return cell.GetOutput(acc_state), final_state
def _InferenceSubgraph_Default(self):
  """Default inference subgraph.

  Returns:
    (fetches, feeds), with:

    - fetches: A dictionary of fetches, containing:

      - log_pplx_per_token: A matrix of shape [batch, time]. [i, j]
        is i-th input text's j-th token's log prob.
      - paddings: A matrix of shape [batch, time]. The padding mask.
      - lengths: A vector of shape [batch]. The number of non-padded
        positions of each input.
      - log_pplx_per_sample: A vector of shape [batch]. [i]
        is i-th input text's log prob.
      - num_oovs_per_sample: A vector of shape [batch] counting the total
        number of out-of-vocabulary tokens in each input.
      - tokens_from_labels: A vector of shape [batch] holding the strings
        obtained by mapping the label ids back through the input generator.
      - ids: A matrix of shape [batch, time]. [i, j]
        is i-th input text's j-th token's id.

    - feeds: A dictionary of feeds, containing:

      - text: A placeholder for a vector of strings.
  """
  text = tf.placeholder(tf.string, shape=[None])
  # [batch, time]
  ids, labels, paddings = self.input_generator.StringsToIds(text)
  lengths = tf.reduce_sum(tf.to_int32(1 - paddings), axis=1)
  tokens_from_labels = self.input_generator.IdsToStrings(labels, lengths)
  oovs = tf.equal(labels, self.input_generator.tokenizer.unk_id)
  num_oovs_per_sample = tf.to_int32(
      tf.round(tf.reduce_sum(tf.to_float(oovs) * (1 - paddings), axis=1)))
  # [time, batch]
  ids, paddings, labels, weights = self._TrimIfPossibleThenTranspose(
      ids, paddings, labels, 1.0 - paddings)
  batch_size = tf.shape(ids)[1]
  xent_output, _ = self.lm.FPropDefaultTheta(
      inputs=ids,
      paddings=paddings,
      state0=self.lm.zero_state(self.theta.lm, batch_size),
      labels=py_utils.NestedMap(class_ids=labels, class_weights=weights))

  per_example_xent = py_utils.HasShape(xent_output.per_example_xent,
                                       tf.shape(ids))
  log_pplx_per_sample = tf.reduce_sum(
      per_example_xent * (1 - paddings), axis=0)
  fetches = {
      'log_pplx_per_token':  # [batch, time]
          tf.transpose(per_example_xent),
      'paddings':  # [batch, time]
          tf.transpose(paddings),
      'lengths':  # [batch]
          lengths,
      'log_pplx_per_sample':  # [batch]
          log_pplx_per_sample,
      'num_oovs_per_sample':  # [batch], int32
          num_oovs_per_sample,
      'tokens_from_labels':  # [batch], string
          tokens_from_labels,
      # `ids` is [time, batch] at this point; transpose it back so the fetch
      # matches its documented [batch, time] layout like the per-token
      # fetches above.
      'ids':  # [batch, time], int32
          tf.transpose(ids),
  }
  feeds = {'text': text}
  return fetches, feeds
def FProp(self,
          theta,
          inputs,
          paddings,
          state0,
          labels=None,
          direct_features=None):
  """Runs the RNN stack and the softmax over language model activations.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: input activation. A tensor of shape [time, batch, dims].
    paddings: a 0/1 tensor of shape [time, batch].
    state0: A `.NestedMap` containing the initial recurrent state. Must not
      be None.
    labels: If not None, a `.NestedMap` containing the following fields.

      - class_weights, a tensor with shape [time, batch] containing the
        weights for each target word.
      - class_ids, a tensor with shape [time, batch] of int32 dtype
        containing the target class labels.
      - class_probabilities, a tensor with shape [time, batch, vocab_size] of
        float values indicating class-membership probabilities.
    direct_features: If not None, a tensor of [time, batch,
      direct_feature_dims] that is concatenated to the output of the last RNN
      layer.

  Returns:
    (xent_output, state1), where `state1` is the next recurrent state. If
    `labels` is not None, `xent_output` is a `.NestedMap` as defined by
    `SoftmaxLayer`'s return value. Otherwise, `xent_output` contains the
    softmax logits, probabilities (.probs) and log-probabilities
    (.log_probs). In both cases `.last_hidden` holds the activation that was
    fed to the softmax.
  """
  inputs = py_utils.HasRank(inputs, 3)
  seqlen, batch, _ = tf.unstack(tf.shape(inputs), num=3)
  paddings = py_utils.HasShape(paddings, [seqlen, batch])
  assert state0 is not None

  activation, state1 = self.rnns.FProp(theta.rnns, inputs,
                                       tf.expand_dims(paddings, 2), state0)

  if direct_features is not None:
    direct_features = py_utils.HasRank(direct_features, 3)
    activation = tf.concat([activation, direct_features], axis=2)

  if labels is not None:
    # Either hard class ids or soft class probabilities act as the target.
    if 'class_ids' in labels:
      target_kwargs = dict(class_ids=labels.class_ids)
    else:
      assert 'class_probabilities' in labels
      target_kwargs = dict(class_probabilities=labels.class_probabilities)
    xent_output = self.softmax.FProp(
        theta=theta.softmax,
        inputs=activation,
        class_weights=labels.class_weights,
        **target_kwargs)
  else:
    # We can only compute the logits here. The softmax sees a flattened
    # [time * batch, dims] view; the result is folded back afterwards.
    flat_activation = tf.reshape(activation, [seqlen * batch, -1])
    flat_logits = self.softmax.Logits(
        theta=theta.softmax, inputs=flat_activation)
    xent_output = py_utils.NestedMap(
        logits=tf.reshape(flat_logits, [seqlen, batch, -1]))
    xent_output.probs = tf.nn.softmax(xent_output.logits)
    xent_output.log_probs = tf.nn.log_softmax(xent_output.logits)

  xent_output.last_hidden = activation
  return xent_output, state1
def ExtractBlockContextV2(x,
                          block_size,
                          left_context,
                          right_context,
                          padding_val=0.0,
                          paddings=None):
  """Extracts temporal context for every block (without restrictions).

  This is a generalized implementation of ExtractBlockContext, where
  block_size, left_context, and right_context are 3 free parameters and we
  don't have constraints (other than l>=1, r>=0, block_size>0).

  Args:
    x: a tensor of [batch, time, dim].
    block_size: int. Number of time frames in a block.
    left_context: int. Left context size. Note that the actual left context is
      `left_context - 1` (this is to be compatible with ExtractBlockContext
      implementation).
    right_context: int. Right context size.
    padding_val: float. value on the padded frames.
    paddings: optional. If specified, it must be a tensor of [batch, time],
      and we will return a padding tensor indicating padding info for the
      returned tensor.

  Returns:
    (x_patches, x_paddings) where

    - x_patches: A tensor of [batch, num_blocks, context_size + block_size,
      dim] with necessary paddings, where
      context_size = (left_context - 1) + right_context, and
      output[:, i, ...] are x[:, start-left_context+1:end+right_context, ...],
      where start = i * block_size, end = (i + 1) * block_size.
    - x_paddings: None if paddings = None; else a [batch, num_blocks,
      context_size + block_size] tensor, indicating the padding info for the
      corresponding position in x_patches.

  Raises:
    ValueError: if block_size <= 0, left_context < 1 or right_context < 0.
      The arguments are validated before any tensor op is created.

  Let's define some variables here:
    B: batch size
    T: input tensor length in time axis
    D: input tensor dimension in the last axis
    W: block size
    U: ceil(T/W)
    L: left context size
    R: right context size
    C: L-1+W+R, full block length

  Given a [B, T, D] tensor, the return is a [B, U, C, D] tensor where
  ret[b, u, :] is a 2D tensor of shape (L - 1 + W + R, D), which is the u-th
  block of the input tensor with (L - 1) left context frames and R right
  context frames.

  Implementation note:
  We use the following procedure to get the return tensor

    - pad the time axis at the front and at the end:
      [B, T, D] -> [B, L - 1 + U*W + L - 1 + R, D]
    - build a [U, C] matrix of time indices, one row per block
    - gather along the time axis with that matrix:
      [B, L - 1 + U*W + L - 1 + R, D] -> [B, U, C, D]

  TODO(yqw): after verifying correctness and benchmark, consider replace v1
    implementation?
  """
  w = block_size
  l = left_context
  r = right_context

  # Validate first: the only constraints are block_size > 0, l >= 1, r >= 0.
  # Doing this before computing ceil(t / w) keeps a zero block size from
  # surfacing as a ZeroDivisionError instead of the intended ValueError.
  if w <= 0:
    raise ValueError(f'block size ({w}) must be greater than 0')
  if l < 1:
    raise ValueError(f'Left context ({left_context}) must be >= 1.')
  if r < 0:
    raise ValueError(f'Right context ({right_context}) must be >= 0')

  # 0. basic shapes
  b, t, d = py_utils.GetShape(x, 3)
  u = (t + w - 1) // w  # equivalent to math.ceil(t/w)
  c = l - 1 + r + w

  if paddings is not None:
    paddings = py_utils.HasShape(paddings, [b, t])

  # 1. do front and rear padding
  left_pad = l - 1
  # we need to make sure all u * w elements have enough long context
  right_pad = (u * w - t + l - 1 + r)
  x_padded = _DoPadding(x, b, left_pad, right_pad, d, padding_val=padding_val)
  if paddings is not None:
    # Frames introduced by the padding above are marked as padded (1.0).
    paddings = _DoPadding(
        paddings, b, left_pad, right_pad, d=None, padding_val=1.0)

  # 2. generate gather indices
  # gather_indices is a [u, c] matrix like
  #   [ 0, ........., c-1]
  #   [ w, ........., w + (c-1)]
  #   [2w, .........., 2w + (c-1)]
  #   [(u-1)*w, ...., (u-1)*w + (c-1)]
  gather_indices = (
      tf.tile(tf.expand_dims(tf.range(0, c), axis=0), (u, 1)) +
      tf.tile(tf.expand_dims(tf.range(0, u * w, w), axis=1), (1, c)))

  # 3. generate x_patches, shape [b, u, c, d]
  x_patches = tf.gather(x_padded, gather_indices, axis=1)
  if paddings is not None:
    # gather is now a [b, u, c] tensor
    paddings = tf.gather(paddings, gather_indices, axis=1)
  return x_patches, paddings
def FProp(self, theta, inputs, paddings, state0=None, labels=None):
  """Runs the transformer stack and the softmax over input activations.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: Input activation. A tensor of shape [time, batch, model_dim].
    paddings: A 0/1 tensor of shape [time, batch].
    state0: Not used for Transformer.
    labels: If not None, a `.NestedMap` containing the following fields:

      - class_weights, a tensor with shape [time, batch] containing the
        weights for each target word.
      - class_ids, a tensor with shape [time, batch] of int32 dtype
        containing the target class labels.
      - class_probabilities, a tensor with shape [time, batch, vocab_size] of
        float values indicating class-membership probabilities.

  Returns:
    (xent_output, None). If `labels` is not None, `xent_output` is a
    `.NestedMap` as defined by `SoftmaxLayer`'s return value. Otherwise,
    `xent_output` only contains the softmax logits. In both cases
    `.last_hidden` holds the output of the last transformer layer.
  """
  p = self.params
  inputs = py_utils.HasRank(inputs, 3)
  seqlen, batch, _ = tf.unstack(tf.shape(inputs), num=3)
  inputs = py_utils.HasShape(inputs, [seqlen, batch, p.model_dim])
  paddings = py_utils.HasShape(paddings, [seqlen, batch])

  # [time, 1, model_dim]; the unit axis broadcasts over the batch.
  posit_embs = tf.expand_dims(
      self.position_emb.FProp(theta.position_emb, seqlen), 1)
  # [time, batch, model_dim]
  layer_in = self.input_dropout.FProp(theta.input_dropout,
                                      inputs + posit_embs)

  for layer, layer_theta in zip(self.trans, theta.trans):
    # [time, batch, model_dim]
    layer_out, _ = layer.FProp(layer_theta, layer_in, paddings)
    layer_in = layer_out

  if labels is not None:
    # Either hard class ids or soft class probabilities act as the target.
    if 'class_ids' in labels:
      target_kwargs = dict(class_ids=labels.class_ids)
    else:
      assert 'class_probabilities' in labels
      target_kwargs = dict(class_probabilities=labels.class_probabilities)
    xent_output = self.softmax.FProp(
        theta=theta.softmax,
        inputs=layer_out,
        class_weights=labels.class_weights,
        **target_kwargs)
  else:
    # We can only compute the logits here. The softmax sees a flattened
    # [time * batch, dims] view; the result is folded back afterwards.
    flat_hidden = tf.reshape(layer_out, [seqlen * batch, -1])
    flat_logits = self.softmax.Logits(theta=theta.softmax, inputs=flat_hidden)
    xent_output = py_utils.NestedMap(
        logits=tf.reshape(flat_logits, [seqlen, batch, -1]))

  xent_output.last_hidden = layer_out
  return xent_output, None
def RelPositionBias(self, content, abs_pos_emb, skip_term_b=False):
  """Compute relative position bias.

  This is a subroutine used by variants of self-attentions with relative
  positional embedding.

  output[b][n][i][j] = content[b][i][n] x abs_pos_emb[i-j+T-1][n]

  Padding should be masked by the caller of this function.

  B: batch size
  T: sequence length
  N: num of attention heads.
  H: per-head attention dimension.

  Args:
    tensors of the following shapes:
    content:         [N, H] if skip_term_b else [B, T, N, H]
    abs_pos_emb:     [2T - 1, N, H], the absolute positional embedding.
      abs_pos_emb[i] is the emb of relative distance i - (T-1).
    skip_term_b:     If to skip term_b in section 3.3 equation.

  Returns:
    The attention logits tensor. [N, T, T] if skip_term_b else [B, N, T, T].
  """
  if skip_term_b:
    # `content` has no batch/time axes here; T is recovered from L = 2T - 1.
    n, _ = py_utils.GetShape(content)
    l = py_utils.GetShape(abs_pos_emb)[0]
    t = (l + 1) // 2

    # [N, L=2T-1]
    content, abs_pos_emb = self.ToAqtActActInputs(content, abs_pos_emb)
    logits = tf.einsum('NH,LNH->NL', content, abs_pos_emb)
    logits = self.FromAqtActActMatmul(logits)

    # [N, T, L]: the same row is repeated for every query position.
    logits = tf.tile(tf.expand_dims(logits, axis=1), [1, t, 1], name='tile')
    logits = tf.reshape(logits, [n, t * l])
    # [N, T * (L + 1)].
    logits = tf.pad(logits, ((0, 0), (0, t)))
    # [N, T, L + 1]. Rows are one element longer than before, so row i now
    # starts i positions further into the original data.
    logits = tf.reshape(logits, [n, t, l + 1], name='restore')
    # Keep the first T columns, reversed.
    return logits[:, :, t - 1::-1]

  b, t, n, h = py_utils.GetShape(content)
  l = 2 * t - 1
  abs_pos_emb = py_utils.HasShape(abs_pos_emb, [l, n, h])

  # [B, N, T, L=2T-1]
  content, abs_pos_emb = self.ToAqtActActInputs(content, abs_pos_emb)
  logits = tf.einsum('BTNH,LNH->BNTL', content, abs_pos_emb)
  logits = self.FromAqtActActMatmul(logits)

  logits = tf.reshape(logits, [b, n, t * l], name='flatten')
  # [B, N, T * (L + 1)].
  logits = tf.pad(logits, ((0, 0), (0, 0), (0, t)))
  # [B, N, T, L + 1]. Rows are one element longer than before, so row i now
  # starts i positions further into the original data.
  logits = tf.reshape(logits, [b, n, t, l + 1], name='restore')
  # Keep the first T columns, reversed.
  return logits[:, :, :, t - 1::-1]
def FarthestPointSampler(points,
                         padding,
                         num_sampled_points,
                         precomputed_squared_distance=None,
                         num_seeded_points=0,
                         random_seed=None):
  """Samples num_sampled_points from points using farthest point sampling.

  Algorithm:
    1. Start by selecting a random point and adding to a selected set.
    2. For all remaining points, find the furthest point from those selected.
    3. Add furthest point to selected.
    4. Repeat 2-3 until num_sampled_points are selected.

  More details at https://en.wikipedia.org/wiki/Farthest-first_traversal

  The output of this function can be used with tf.batch_gather to extract the
  desired points, for example: tf.batch_gather(points, sampled_idx)

  Args:
    points: floating point tf.Tensor of shape [N, P1, dims]
    padding: A floating point tf.Tensor of shape [N, P1] with 0 if the point is
      real, and 1 otherwise.
    num_sampled_points: integer number of points to sample. Must not exceed
      P1 (checked with an assertion).
    precomputed_squared_distance: optional tf.Tensor of shape [N, P1, P1] of
      distances between each point. if None, distances will be computed on the
      fly.
    num_seeded_points: If num_seeded_points > 0, then the first
      num_seeded_points in points are considered to be seeded in the FPS
      sampling. Note that we assume that these points are *not* padded, and do
      not check padding when seeding them.
    random_seed: optional integer random seed to use with all the random ops.

  Returns:
    A tuple of tf.Tensors (sampled_idx, closest_idx) of types
    (tf.int32, tf.int32).

    sampled_idx is of shape [N, num_sampled_points] representing the indices
    selected using the sampler. Values are in the range [0, P1).

    closest_idx is of shape [N, P1] representing the indices of the closest
    sampled points for each input point. closest_idx is used in PCNN as part of
    the pooling operation: each point is assigned to the closest sampled point
    and a max is taken over them. Values index into the sampled points, i.e.
    they are in the range [0, num_sampled_points).
  """
  points = py_utils.HasRank(points, 3)
  batch_size, num_points, dims = py_utils.GetShape(points, 3)

  # Fail at run time if more samples are requested than there are points.
  points = py_utils.with_dependencies(
      [py_utils.assert_greater_equal(num_points, num_sampled_points)], points)

  # Add a tiny bit of noise to the distance matrix or points so all
  # points are unique. This will also ensure true repeated points
  # like padded points are only selected after all valid points are selected.
  if precomputed_squared_distance is not None:
    precomputed_squared_distance = py_utils.HasShape(
        precomputed_squared_distance, [batch_size, num_points, num_points])
    # The noise has a trailing dimension of 1, so one value per row is
    # broadcast across that row of the distance matrix.
    precomputed_squared_distance += tf.random.uniform(
        (batch_size, num_points, 1),
        minval=1e-6,
        maxval=1e-5,
        dtype=tf.float32,
        seed=random_seed)
  else:
    points += tf.random.uniform((batch_size, num_points, dims),
                                minval=1e-6,
                                maxval=1e-5,
                                dtype=tf.float32,
                                seed=random_seed)

  # TensorArray to store the sampled indices in the loop.
  sampled_idx = tf.TensorArray(tf.int32, num_sampled_points)

  # Initialize distance_to_selected to inf for all points.
  distance_to_selected = float('inf') * tf.ones((batch_size, num_points))

  # For tracking the index to the closest selected point.
  closest_idx = tf.zeros((batch_size, num_points), dtype=tf.int32)

  # Current loop index counter.
  curr_idx = tf.constant(0, dtype=tf.int32)

  # Get number of valid points (1 is padded, so num_points - num_padded).
  num_valid_points = tf.cast(
      tf.cast(num_points, dtype=tf.float32) - tf.reduce_sum(padding, axis=1),
      dtype=tf.int32)

  def _BodyFn(curr_idx, distance_to_selected, sampled_idx, closest_idx):
    """Loop body for farthest point sampler.

    Selects one new point per batch element, records it in sampled_idx, and
    updates the running min-distance and closest-sample bookkeeping.
    """

    def _GetRandomRealPoint():
      """Select the first point.

      For the first point, we want any random real (non padded) point, so we
      create a random values per point, and then set all padded ones to
      some large value (more than the maxval). We then take the min per batch
      element to get the first points.

      Returns:
        Tensor containing the index of a random point selected for each example
        in the batch.
      """
      random_values = tf.random.uniform((batch_size, num_points),
                                        minval=0,
                                        maxval=1,
                                        dtype=tf.float32,
                                        seed=random_seed)
      # Padded entries become padding * 10 (i.e. 10 when padding == 1), which
      # is above maxval, so argmin never picks them while a real point exists.
      random_values = tf.where(
          tf.equal(padding, 0.0), random_values, padding * 10)
      return tf.argmin(random_values, axis=1, output_type=tf.int32)

    def _GetFurthestPoint():
      """Get point that is furthest from those already selected.

      We also bias the sampling towards real points by setting the distance
      to padded points negative until we are out of real points.

      Returns:
        Tensor containing the index of the next farthest point selected for each
        example in the batch.
      """
      # Set padded points distance to negative so they aren't selected.
      padding_masked_distance_to_selected = tf.where(
          tf.equal(padding, 0.0), distance_to_selected,
          -1.0 * tf.ones((batch_size, num_points), dtype=tf.float32))
      # But only do this when we still have valid points left.
      padding_masked_distance_to_selected = tf.where(
          tf.less(curr_idx, num_valid_points),
          padding_masked_distance_to_selected, distance_to_selected)
      return tf.argmax(
          padding_masked_distance_to_selected, axis=-1, output_type=tf.int32)

    def _GetSeededPoint():
      """Select a seeded point.

      Seeded points are assumed to be at the beginning of the original points.

      Returns:
        Tensor containing the index of the next seeded point to select for each
        example in the batch.
      """
      return tf.ones((batch_size,), dtype=tf.int32) * curr_idx

    # Select indices for this loop iteration.
    def _Seeded():
      # With seeding: take the seeds in order first, then go farthest-first.
      return tf.cond(
          tf.less(curr_idx, num_seeded_points), _GetSeededPoint,
          _GetFurthestPoint)

    def _Real():
      # Without seeding: random real point first, then go farthest-first.
      return tf.cond(
          tf.equal(curr_idx, 0), _GetRandomRealPoint, _GetFurthestPoint)

    new_selected = tf.cond(tf.greater(num_seeded_points, 0), _Seeded, _Real)
    sampled_idx = sampled_idx.write(curr_idx, new_selected)

    # Extract the distance to the latest point selected to update
    # distance_to_selected.
    new_selected_gather_idx = tf.stack(
        [tf.range(batch_size), new_selected], axis=1)
    if precomputed_squared_distance is not None:
      new_distance = tf.gather_nd(precomputed_squared_distance,
                                  new_selected_gather_idx)
    else:
      new_points = tf.reshape(
          tf.gather_nd(points, new_selected_gather_idx), [batch_size, 1, dims])
      new_distance = tf.reshape(
          SquaredDistanceMatrix(points, new_points), [batch_size, num_points])

    is_newly_closest = tf.less(new_distance, distance_to_selected)
    distance_to_selected = tf.minimum(distance_to_selected, new_distance)

    # Track the index to the closest selected point. The value stored is the
    # loop counter, i.e. the position of the sample within sampled_idx, not
    # the index of the point within `points`.
    new_selected_tiled = tf.tile([[curr_idx]], [batch_size, num_points])
    closest_idx = tf.cond(
        tf.equal(curr_idx, 0),
        # At the first loop iteration, the init points are the closest.
        lambda: new_selected_tiled,
        # Otherwise, update with the new points based on the distances.
        lambda: tf.where(is_newly_closest, new_selected_tiled, closest_idx))
    return curr_idx + 1, distance_to_selected, sampled_idx, closest_idx

  _, _, sampled_idx, closest_idx = tf.while_loop(
      lambda curr_idx, *args: tf.less(curr_idx, num_sampled_points),
      _BodyFn,
      loop_vars=(curr_idx, distance_to_selected, sampled_idx, closest_idx),
      back_prop=False,
      maximum_iterations=num_sampled_points)

  sampled_idx = sampled_idx.stack()  # num_sampled_points x n
  sampled_idx = tf.transpose(sampled_idx, [1, 0])

  # Restore the static shape when both dimensions are known Python ints.
  if isinstance(batch_size, int) and isinstance(num_sampled_points, int):
    sampled_idx.set_shape((batch_size, num_sampled_points))

  return sampled_idx, closest_idx
def FProp(self, theta, batch, state0=None):
  """Encodes source as represented by 'inputs' and 'paddings'.

  The input is run through (optional) SpecAugment, the conv stack, the
  conv-lstm stack, and finally the rnn stack with optional residual / highway
  skip connections and projection layers.

  Args:
    theta: A NestedMap object containing weights' values of this layer and its
      children layers.
    batch: A NestedMap with fields:

      - src_inputs - The inputs tensor. It is expected to be of shape [batch,
        time, feature_dim, channels].
      - paddings - The paddings tensor. It is expected to be of shape [batch,
        time].
    state0: Recurrent input state. Not supported/ignored by this encoder.

  Returns:
    A NestedMap containing:

    - 'encoded': a feature tensor of shape [time, batch, depth]
    - 'padding': a 0/1 tensor of shape [time, batch]
    - 'state': the updated recurrent state (always an empty NestedMap here)
    - '${layer_type}_${layer_index}': The per-layer encoder output. Each one
      is a NestedMap containing 'encoded' and 'padding' similar to regular
      final outputs, except that 'encoded' from conv or conv_lstm layers are
      of shape [time, batch, depth, channels]. Only emitted when
      p.extra_per_layer_outputs is set.
  """
  p = self.params
  inputs, paddings = batch.src_inputs, batch.paddings
  outputs = py_utils.NestedMap()
  with tf.name_scope(p.name):
    # Adding specAugmentation.
    if p.use_specaugment and not p.is_eval:
      inputs, paddings = self.specaugment.FProp(theta.specaugment, inputs,
                                                paddings)
    # Add a few extra padded timesteps at the end. This is for ensuring the
    # correctness of the conv-layers at the edges.
    if p.pad_steps > 0:
      # inplace_update() is not supported by TPU for now. Since we have done
      # padding on the input_generator, we may avoid this additional padding.
      assert not py_utils.use_tpu()
      # Same shape as the input except the time dimension (axis 1), which is
      # replaced by p.pad_steps.
      inputs_pad = tf.zeros(
          inplace_ops.inplace_update(tf.shape(inputs), 1, p.pad_steps),
          inputs.dtype)
      paddings_pad = tf.ones(
          inplace_ops.inplace_update(tf.shape(paddings), 1, p.pad_steps),
          paddings.dtype)
      inputs = tf.concat([inputs, inputs_pad], 1, name='inputs')
      paddings = tf.concat([paddings, paddings_pad], 1)

    def ReshapeForPlot(tensor, padding, name):
      """Transposes and flattens channels to [batch, dim, seq_len] shape."""
      # Flatten any dimensions beyond the third into the third.
      batch_size = tf.shape(tensor)[0]
      max_len = tf.shape(tensor)[1]
      plot_tensor = tf.reshape(tensor, [batch_size, max_len, -1])
      plot_tensor = tf.transpose(plot_tensor, [0, 2, 1], name=name)
      return (plot_tensor, summary_utils.SequenceLength(padding))

    # One (tensor, sequence_length) pair per layer, consumed by the summary
    # figure at the end.
    plots = [
        ReshapeForPlot(
            tf.transpose(inputs, [0, 1, 3, 2]), paddings, 'inputs')
    ]

    # The conv part.
    conv_out = inputs
    out_padding = paddings
    for i, conv_layer in enumerate(self.conv):
      conv_out, out_padding = conv_layer.FProp(theta.conv[i], conv_out,
                                               out_padding)
      if p.extra_per_layer_outputs:
        # Zero out padded frames before exposing the per-layer output.
        conv_out *= (1.0 - out_padding[:, :, tf.newaxis, tf.newaxis])
        outputs['conv_%d' % i] = py_utils.NestedMap(
            encoded=tf.transpose(conv_out, [1, 0, 2, 3]),  # to [t, b, d, c]
            padding=tf.transpose(out_padding))
      plots.append(
          ReshapeForPlot(
              tf.transpose(conv_out, [0, 1, 3, 2]), out_padding,
              'conv_%d_out' % i))

    def TransposeFirstTwoDims(t):
      """Swaps the first two dimensions of `t`, keeping the trailing ones."""
      first_dim = tf.shape(t)[0]
      second_dim = tf.shape(t)[1]
      t_new = tf.transpose(
          tf.reshape(t, [first_dim, second_dim, -1]), [1, 0, 2])
      t_shape_new = tf.concat([[second_dim], [first_dim], tf.shape(t)[2:]], 0)
      return tf.reshape(t_new, t_shape_new)

    # Now the conv-lstm part.
    conv_lstm_out = conv_out
    conv_lstm_out_padding = out_padding
    for i, (rnn, cnn) in enumerate(
        zip(self.conv_lstm_rnn, self.conv_lstm_cnn)):
      conv_lstm_in = conv_lstm_out
      # Move time dimension to be the first.
      conv_lstm_in = TransposeFirstTwoDims(conv_lstm_in)
      conv_lstm_in = tf.expand_dims(conv_lstm_in, 2)
      conv_lstm_in_padding = tf.expand_dims(
          tf.transpose(conv_lstm_out_padding), 2)
      lstm_out = rnn.FProp(theta.conv_lstm_rnn[i], conv_lstm_in,
                           conv_lstm_in_padding)
      # Move time dimension to be the second.
      cnn_in = TransposeFirstTwoDims(lstm_out)
      cnn_in = tf.squeeze(cnn_in, 2)
      cnn_in_padding = conv_lstm_out_padding
      cnn_out, cnn_out_padding = cnn.FProp(theta.conv_lstm_cnn[i], cnn_in,
                                           cnn_in_padding)
      conv_lstm_out, conv_lstm_out_padding = cnn_out, cnn_out_padding
      if p.extra_per_layer_outputs:
        conv_lstm_out *= (
            1.0 - conv_lstm_out_padding[:, :, tf.newaxis, tf.newaxis])
        outputs['conv_lstm_%d' % i] = py_utils.NestedMap(
            encoded=tf.transpose(conv_lstm_out,
                                 [1, 0, 2, 3]),  # to [t, b, d, c]
            padding=tf.transpose(conv_lstm_out_padding))
      # NOTE(review): unlike the conv plots above, this tensor is not
      # transposed to swap its last two dimensions before flattening -
      # confirm whether that difference is intended.
      plots.append(
          ReshapeForPlot(conv_lstm_out, conv_lstm_out_padding,
                         'conv_lstm_%d_out' % i))

    # Need to do a reshape before starting the rnn layers.
    conv_lstm_out = py_utils.HasRank(conv_lstm_out, 4)
    conv_lstm_out_shape = tf.shape(conv_lstm_out)
    # Collapse the last two dimensions into one: [batch, time, -1].
    new_shape = tf.concat([conv_lstm_out_shape[:2], [-1]], 0)
    conv_lstm_out = tf.reshape(conv_lstm_out, new_shape)
    if self._first_lstm_input_dim_pad:
      # Zero-pad the feature dimension up to the first LSTM's input size.
      conv_lstm_out = tf.pad(
          conv_lstm_out,
          [[0, 0], [0, 0], [0, self._first_lstm_input_dim_pad]])

    conv_lstm_out = py_utils.HasShape(conv_lstm_out,
                                      [-1, -1, self._first_lstm_input_dim])

    # Transpose to move the time dimension to be the first.
    rnn_in = tf.transpose(conv_lstm_out, [1, 0, 2])
    rnn_padding = tf.expand_dims(tf.transpose(conv_lstm_out_padding), 2)
    # rnn_in is of shape [time, batch, depth]
    # rnn_padding is of shape [time, batch, 1]

    # Now the rnn layers.
    num_skips = 0
    for i in range(p.num_lstm_layers):
      rnn_out = self.rnn[i].FProp(theta.rnn[i], rnn_in, rnn_padding)
      # Position of this layer relative to the first residual block;
      # negative until layer p.residual_start - 1 is reached.
      residual_index = i - p.residual_start + 1
      if p.residual_start > 0 and residual_index >= 0:
        if residual_index % p.residual_stride == 0:
          # First layer of a residual block: remember its input.
          residual_in = rnn_in
        if residual_index % p.residual_stride == p.residual_stride - 1:
          # Last layer of a residual block: combine with the saved input.
          # Highway skip connection.
          if p.highway_skip:
            rnn_out = self.highway_skip[num_skips].FProp(
                theta.highway_skip[num_skips], residual_in, rnn_out)
            num_skips += 1
          else:
            # Residual skip connection.
            rnn_out += py_utils.HasShape(residual_in, tf.shape(rnn_out))
      if p.project_lstm_output and (i < p.num_lstm_layers - 1):
        # Projection layers.
        rnn_out = self.proj[i].FProp(theta.proj[i], rnn_out, rnn_padding)
      if i == p.num_lstm_layers - 1:
        # The final layer output is always zeroed at padded positions.
        rnn_out *= (1.0 - rnn_padding)
      if p.extra_per_layer_outputs:
        rnn_out *= (1.0 - rnn_padding)
        outputs['rnn_%d' % i] = py_utils.NestedMap(
            encoded=rnn_out, padding=tf.squeeze(rnn_padding, [2]))
      plots.append(
          ReshapeForPlot(
              tf.transpose(rnn_out, [1, 0, 2]),
              tf.transpose(rnn_padding, [1, 0, 2]), 'rnn_%d_out' % i))
      rnn_in = rnn_out
    # With zero LSTM layers this is simply the reshaped conv-lstm output.
    final_out = rnn_in

    if self.cluster.add_summary:
      fig = plot.MatplotlibFigureSummary(
          'encoder_example', figsize=(8, len(plots) * 3.5))

      # Order layers from bottom to top.
      plots.reverse()
      for tensor, seq_len in plots:
        fig.AddSubplot(
            [tensor, seq_len],
            summary_utils.TrimPaddingAndPlotSequence,
            title=tensor.name,
            xlabel='Time')
      fig.Finalize()

    outputs['encoded'] = final_out
    outputs['padding'] = tf.squeeze(rnn_padding, [2])
    outputs['state'] = py_utils.NestedMap()
    return outputs
def SegmentPool3D(points,
                  point_features,
                  pooling_idx,
                  closest_idx,
                  pooling_method='max'):
  """Performs {min/max/average} pooling over a pointcloud given indices.

  This should be functionally identical when using max to the above MaxPool3D
  function, except it turns out to be much more memory efficient on a TPU, and
  supports min/max/mean.

  Args:
    points: A float tf.Tensor of shape [N, P1, 3] with point locations.
    point_features: A float tf.Tensor of shape [N, P1, C] with point features.
      Any floating point dtype is accepted; the pooled features keep it.
    pooling_idx: A tf.int32 tf.Tensor of shape [N, P2] with the index of which
      points we want to keep. Each value should be in the range [0, P1).
    closest_idx: A tf.int32 tf.Tensor of shape [N, P1] representing which
      sampled point is closest to each original point. Each value should be in
      the range of [0, P2).
    pooling_method: A string for which pooling function to use. Should be one of
      {'min', 'max', 'mean'}.

  Returns:
    pooled_points: A float tf.Tensor of shape [N, P2, 3] with the pooled
      point locations.
    pooled_features: A float tf.Tensor of shape [N, P2, C] with the pooled
      features, in the same dtype as point_features.

  Raises:
    ValueError: If pooling_method is not one of {min/max/mean}.
  """
  segment_pooling_functions = {
      'min': tf.unsorted_segment_min,
      'max': tf.unsorted_segment_max,
      'mean': tf.unsorted_segment_mean
  }

  if pooling_method not in segment_pooling_functions:
    raise ValueError('`pooling_method` must be one of {}.'.format(
        list(segment_pooling_functions.keys())))
  segment_fn = segment_pooling_functions[pooling_method]

  points = py_utils.HasShape(points, [-1, -1, 3])
  n, p1 = py_utils.GetShape(points, 2)
  point_features = py_utils.HasShape(point_features, [n, p1, -1])
  # No-op for tensors; guarantees a `.dtype` attribute for array-like inputs.
  point_features = tf.convert_to_tensor(point_features)
  _, _, c = py_utils.GetShape(point_features)
  pooling_idx = py_utils.HasShape(pooling_idx, [n, -1])
  _, p2 = py_utils.GetShape(pooling_idx)
  closest_idx = py_utils.HasShape(closest_idx, [n, p1])

  # Subselect our output points
  pooled_points = tf.batch_gather(points, pooling_idx)

  # Loop over batch dimension of our features/indices, as unsorted_segment_X
  # does not currently support a batch dimension.
  def _LoopFn(args):
    example_features, example_closest_idx = args
    return segment_fn(example_features, example_closest_idx, num_segments=p2)

  # The segment ops return the dtype of their data argument, so the declared
  # map_fn output dtype must follow the features rather than being pinned to
  # float32 (which rejected e.g. bfloat16/float16/float64 features).
  pooled_features = tf.map_fn(
      fn=_LoopFn,
      elems=(point_features, closest_idx),
      dtype=point_features.dtype)

  return (py_utils.HasShape(pooled_points, [n, p2, 3]),
          py_utils.HasShape(pooled_features, [n, p2, c]))
def _Extract(self, features):
  """Decodes per-laser range images and embeds xyz onto every pixel.

  Args:
    features: A mapping from feature name to a parsed feature; each value is
      densified with `_Dense` before use. Keys read: 'pose', and per laser
      '<laser>_<returns>', '<laser>_<returns>_shape', '<laser>_extrinsics',
      plus '<laser>_beam_inclination_min/max' for CBR lasers and
      '<laser>_beam_inclinations', '<laser>_pose' for GBR lasers.

  Returns:
    A NestedMap with, per laser, '<laser>_extrinsics' ([4, 4]),
    '<laser>_beam_inclinations', '<laser>_pose' (GBR only), and for each
    return a '<laser>_<returns>' NestedMap with 'xyz', 'features' and 'mask'.
  """
  p = self.params

  def _Feature(key):
    """Reads feature `key` as a dense tensor."""
    return _Dense(features[key])

  all_lasers = p.cbr_laser_names + p.gbr_laser_names
  range_images = {}
  outputs = {}
  frame_pose = tf.reshape(_Feature('pose'), [4, 4])

  for laser in all_lasers:
    expected_shape = (
        p.cbr_ri_shape if laser in p.cbr_laser_names else p.gbr_ri_shape)
    # Extract range images.
    for returns in p.returns:
      image_key = '%s_%s' % (laser, returns)
      stored_shape = tf.reshape(
          _Feature('%s_%s_shape' % (laser, returns)), [-1])
      image = tf.reshape(_Feature(image_key), stored_shape)
      range_images[image_key] = py_utils.HasShape(image, expected_shape)

    # Extract beam inclinations and extrinsics
    extrinsics_key = '%s_extrinsics' % laser
    outputs[extrinsics_key] = tf.reshape(_Feature(extrinsics_key), [4, 4])

  # CBRs have uniform inclination, stored as scalar (min, max).
  for laser in p.cbr_laser_names:
    lowest = tf.reshape(_Feature('%s_beam_inclination_min' % laser), [])
    highest = tf.reshape(_Feature('%s_beam_inclination_max' % laser), [])
    outputs['%s_beam_inclinations' % laser] = tf.stack([lowest, highest],
                                                       axis=0)

  # GBRs have non-uniform inclinations defined by 64 floats.
  for laser in p.gbr_laser_names:
    inclinations_key = '%s_beam_inclinations' % laser
    outputs[inclinations_key] = tf.reshape(_Feature(inclinations_key), [64])

  # Embed xyz onto each range image pixel.
  for laser in all_lasers:
    extrinsics = outputs['%s_extrinsics' % laser]
    inclinations = outputs['%s_beam_inclinations' % laser]

    if laser in p.cbr_laser_names:
      ri_shape = p.cbr_ri_shape
      # CBR lasers currently are always uniform inclinations specified by a
      # length 2 vector; expand it to one inclination per image row by
      # linearly interpolating at the row centers.
      num_rows = ri_shape[0]
      incl_min = inclinations[0]
      incl_max = inclinations[1]
      incl_span = incl_max - incl_min
      row_centers = (.5 + tf.cast(tf.range(0, num_rows), tf.float32)) / tf.cast(
          num_rows, tf.float32)
      inclinations = (row_centers * incl_span) + incl_min
    else:
      ri_shape = p.gbr_ri_shape

    # Only GBR lasers carry a per-pixel pose.
    pixel_pose = None
    if laser in p.gbr_laser_names:
      # NOTE(review): relies on p.gbr_ri_shape being a list so the slice can
      # be concatenated with [4, 4] - confirm against the params definition.
      pixel_pose = tf.reshape(
          _Feature('%s_pose' % laser), shape=p.gbr_ri_shape[0:2] + [4, 4])
      outputs['%s_pose' % laser] = pixel_pose

    for returns in p.returns:
      image_key = '%s_%s' % (laser, returns)
      image = tf.reshape(range_images[image_key], ri_shape)
      # Pixels whose first channel is negative are treated as missing.
      valid = image[..., 0] >= 0
      xyz = tf.cast(
          self._XYZFromRangeImage(image, valid, extrinsics, inclinations,
                                  pixel_pose, frame_pose), tf.float32)

      # Produce the NestedMap of xyz, features, mask.
      outputs[image_key] = py_utils.NestedMap({
          'xyz': xyz,
          'features': image,
          'mask': tf.cast(valid, tf.float32),
      })

  return py_utils.NestedMap(outputs)
def ComputeLoss(self, theta, predictions, input_batch):
  """Compute loss for the sparse detector model v1.

  Args:
    theta: A `.NestedMap` object containing variable values of this task.
    predictions: A `.NestedMap` object containing residuals and
      classification_logits.
    input_batch: A `.NestedMap` expected to contain cell_center_xyz,
      cell_points_xyz, cell_feature, anchor_bboxes,
      anchor_localization_residuals, assigned_gt_labels, assigned_cls_mask
      and assigned_reg_mask. See class doc string for details.

  Returns:
    Two dicts:

    - A dict containing str keys and (metric, weight) pairs as values, where
      one of the keys is expected to be 'loss'.
    - A dict containing arbitrary tensors describing something about each
      training example, where the first dimension of each tensor is the batch
      index.
  """
  p = self.params
  batch_size, num_centers = py_utils.GetShape(input_batch.cell_center_xyz, 2)

  def _AnchorShape(*trailing):
    """Returns a fresh [batch, centers, anchors_per_center, *trailing] list."""
    return [batch_size, num_centers, p.num_anchor_bboxes_per_center
           ] + list(trailing)

  # Assert shapes of inputs.
  anchor_bboxes = py_utils.HasShape(input_batch.anchor_bboxes,
                                    _AnchorShape(7))
  gt_residuals = py_utils.HasShape(input_batch.anchor_localization_residuals,
                                   _AnchorShape(7))
  predicted_residuals = py_utils.HasShape(predictions.residuals,
                                          _AnchorShape(7))
  assigned_gt_labels = py_utils.HasShape(input_batch.assigned_gt_labels,
                                         _AnchorShape())
  predicted_classification_logits = py_utils.HasShape(
      predictions.classification_logits, _AnchorShape(p.num_classes))

  # assigned_cls_mask is for weighting the classification loss.
  # Ignored targets will have their mask = 0; this happens when their IOU is
  # not high enough to be a foreground object and not low enough to be
  # background.
  class_weights = py_utils.HasShape(input_batch.assigned_cls_mask,
                                    _AnchorShape())
  class_weights = tf.reshape(class_weights, _AnchorShape(1))

  # Broadcast per class loss weights. For each anchor, there are num_classes
  # prediction heads, we weight the outputs of these heads by the per class
  # loss weights.
  per_class_loss_weight = py_utils.HasShape(
      tf.constant([[[p.per_class_loss_weight]]], dtype=tf.float32),
      [1, 1, 1, p.num_classes])
  class_weights = class_weights * per_class_loss_weight
  class_weights = py_utils.HasShape(class_weights,
                                    _AnchorShape(p.num_classes))

  # We use assigned_reg_mask for masking the regression loss.
  # Only foreground objects will have assigned_reg_mask = 1.
  reg_weights = py_utils.HasShape(input_batch.assigned_reg_mask,
                                  _AnchorShape())
  reg_weights = tf.reshape(reg_weights, _AnchorShape(1))

  if p.loss_norm_type == LossNormType.NORM_BY_NUM_POS_PER_CENTER:
    foreground_mask = py_utils.HasShape(input_batch.assigned_reg_mask,
                                        _AnchorShape())

    # Number of foreground anchors at each center (summed over the anchor
    # axis), floored at 1 to avoid dividing by zero.
    num_foreground = tf.reduce_sum(foreground_mask, axis=2)
    num_foreground = tf.maximum(num_foreground, tf.ones_like(num_foreground))

    # Reshape for broadcasting.
    loss_normalization = tf.reshape(num_foreground,
                                    [batch_size, num_centers, 1, 1])

    # Normalize so that the loss is independent of # centers.
    loss_normalization = loss_normalization * num_centers
    class_weights = class_weights / loss_normalization
    reg_weights = reg_weights / loss_normalization

  classification_loss = py_utils.SigmoidCrossEntropyFocalLoss(
      logits=predicted_classification_logits,
      labels=tf.one_hot(assigned_gt_labels, p.num_classes),
      alpha=p.focal_loss_alpha,
      gamma=p.focal_loss_gamma)

  # Apply mask.
  classification_loss = classification_loss * class_weights

  # TODO(jngiam): Consider normalizing by num_foreground_anchors for each
  # example instead. This would match the 1/N_positive normalization in
  # point pillars.

  # Reduce sum over centers, boxes and classes, then mean over batch.
  classification_loss = tf.reduce_sum(classification_loss, axis=[1, 2, 3])
  classification_loss = tf.reduce_mean(classification_loss)

  # Localization regression loss with Huber loss (SmoothL1).
  regression_loc_and_dims_loss = self._utils_3d.ScaledHuberLoss(
      labels=gt_residuals[..., :6],
      predictions=predicted_residuals[..., :6],
      delta=p.huber_loss_delta)

  # Rotation loss is computed on a transform on rotation_delta. For a
  # direction aware loss, we simply wrap the angles to -pi to pi; for a loss
  # that is symmetric to direction (i.e., rotating by pi), we use a sin
  # transform.
  rotation_delta_transform = (
      functools.partial(
          geometry.WrapAngleRad, min_val=-np.pi, max_val=np.pi)
      if p.direction_aware_rot_loss else tf.sin)

  rotation_delta = predicted_residuals[..., 6:] - gt_residuals[..., 6:]
  regression_rotation_loss = self._utils_3d.ScaledHuberLoss(
      labels=tf.zeros_like(rotation_delta),
      predictions=rotation_delta_transform(rotation_delta),
      delta=p.huber_loss_delta)

  reg_loc_loss = regression_loc_and_dims_loss[..., :3]
  reg_dim_loss = regression_loc_and_dims_loss[..., 3:6]

  gt_bboxes = self._utils_3d.ResidualsToBBoxes(
      anchor_bboxes, gt_residuals, min_angle_rad=-np.pi, max_angle_rad=np.pi)
  predicted_bboxes = self._utils_3d.ResidualsToBBoxes(
      anchor_bboxes,
      predicted_residuals,
      min_angle_rad=-np.pi,
      max_angle_rad=np.pi)

  # Apply mask to individual losses.
  #
  # And then reduce sum over centers, boxes, residuals, and batch
  # and divide by the batch_size.
  reg_rot_loss = tf.reduce_sum(
      regression_rotation_loss * reg_weights) / batch_size
  reg_loc_loss = tf.reduce_sum(reg_loc_loss * reg_weights) / batch_size
  reg_dim_loss = tf.reduce_sum(reg_dim_loss * reg_weights) / batch_size

  # Do not create corner loss graph if weight is 0.0
  # TODO(bcyang): Remove condition after fixing corner loss NaN issue
  if p.corner_loss_weight != 0.0:
    reg_corner_loss = self._utils_3d.CornerLoss(
        gt_bboxes=gt_bboxes, predicted_bboxes=predicted_bboxes)
    reg_corner_loss = tf.expand_dims(reg_corner_loss, axis=-1)
    reg_corner_loss = tf.reduce_sum(
        reg_corner_loss * reg_weights) / batch_size
  else:
    reg_corner_loss = 0.0

  # Sum components of regression loss.
  regression_loss = (
      p.location_loss_weight * reg_loc_loss +
      p.dimension_loss_weight * reg_dim_loss +
      p.rotation_loss_weight * reg_rot_loss +
      p.corner_loss_weight * reg_corner_loss)

  # Compute total loss.
  total_loss = (
      p.loss_weight_localization * regression_loss +
      p.loss_weight_classification * classification_loss)

  metrics_dict = py_utils.NestedMap({
      'loss': (total_loss, batch_size),
      'loss/regression': (regression_loss, batch_size),
      'loss/regression/loc': (reg_loc_loss, batch_size),
      'loss/regression/dim': (reg_dim_loss, batch_size),
      'loss/regression/rot': (reg_rot_loss, batch_size),
      'loss/regression/corner': (reg_corner_loss, batch_size),
      'loss/classification': (classification_loss, batch_size),
  })

  # Calculate dimension errors
  metrics_dict.update(
      self._BBoxDimensionErrors(gt_bboxes, predicted_bboxes, reg_weights))

  per_example_dict = py_utils.NestedMap({
      'residuals': predicted_residuals,
      'classification_logits': predicted_classification_logits,
      'predicted_bboxes': predicted_bboxes,
      'gt_bboxes': gt_bboxes,
      'reg_weights': reg_weights,
  })

  return metrics_dict, per_example_dict
def _XYZFromRangeImage(self,
                       lidar_image,
                       lidar_image_mask,
                       extrinsics,
                       inclinations,
                       pixel_pose=None,
                       frame_pose=None):
  """Extract the cartesian coordinates from the range image.

  Each pixel's range (channel 0) is converted from spherical coordinates
  (row -> inclination, column -> azimuth) to xyz, and then transformed by
  `extrinsics`. When `pixel_pose` is given, the points are further mapped by
  the per-pixel pose and then by the inverse of `frame_pose`.

  Args:
    lidar_image: [H, W, C] range image Tensor.
    lidar_image_mask: [H, W] boolean indicating which 2d coordinates in the
      lidar image are present.
    extrinsics: [4, 4] float matrix representing transformation matrix to
      world coordinates.
    inclinations: [V] beam inclinations vector.
    pixel_pose: [64, 2650, 4, 4] tensor representing per pixel pose of GBR.
    frame_pose: [4, 4] matrix representing vehicle to world transformation.

  Returns:
    [H, W, 3] range image cartesian coordinates.

  Raises:
    ValueError: If pixel_pose is given without frame_pose.
  """
  height, width, channels = py_utils.GetShape(lidar_image, 3)

  conversion_dtype = tf.float32
  lidar_image = tf.cast(lidar_image, conversion_dtype)
  extrinsics = tf.cast(extrinsics, conversion_dtype)
  # Inclinations are reversed so they line up with the image rows.
  inclinations = tf.reverse(
      tf.cast(inclinations, conversion_dtype), axis=[-1])

  # Yaw of the sensor, read off the rotation part of the extrinsics.
  az_correction = py_utils.HasShape(
      tf.atan2(extrinsics[1, 0], extrinsics[0, 0]), [])
  # Column centers, counted from the last column down to the first.
  column_ratios = (tf.cast(tf.range(width, 0, -1), dtype=conversion_dtype) -
                   .5) / tf.cast(width, conversion_dtype)
  column_ratios = py_utils.HasShape(column_ratios, [width])

  azimuth = (column_ratios * 2. - 1.) * np.pi - az_correction[..., tf.newaxis]
  azimuth = py_utils.HasShape(azimuth, [width])

  # Zero out every channel of pixels that are not present.
  channel_mask = tf.tile(lidar_image_mask[..., tf.newaxis], [1, 1, channels])
  lidar_image = tf.where(channel_mask, lidar_image,
                         tf.zeros_like(lidar_image))
  pixel_range = lidar_image[..., 0]

  # Shape azimuth as a row and inclinations as a column so they broadcast
  # against the [H, W] range.
  azimuth = py_utils.HasShape(azimuth[tf.newaxis, ...], [1, width])
  inclinations = py_utils.HasShape(inclinations[..., tf.newaxis], [height, 1])

  cos_azimuth = tf.cos(azimuth)
  sin_azimuth = tf.sin(azimuth)
  cos_incl = tf.cos(inclinations)
  sin_incl = tf.sin(inclinations)

  x = cos_azimuth * cos_incl * pixel_range
  y = sin_azimuth * cos_incl * pixel_range
  z = sin_incl * pixel_range

  points = py_utils.HasShape(tf.stack([x, y, z], -1), [height, width, 3])
  rotation = extrinsics[0:3, 0:3]
  translation = extrinsics[0:3, 3][tf.newaxis, ...]

  # Transform the image points in cartesian coordinates to
  # the world coordinate system using the extrinsics matrix.
  #
  # We first flatten the points, apply rotation, then
  # reshape to restore the original input and then apply
  # translation.
  flat_points = tf.matmul(
      tf.reshape(points, [-1, 3]), rotation, transpose_b=True)
  points = tf.reshape(flat_points, [height, width, 3])
  points = points + translation
  points = py_utils.HasShape(points, [height, width, 3])

  if pixel_pose is None:
    return points

  # GBR uses per pixel pose.
  pixel_rotation = pixel_pose[..., 0:3, 0:3]
  pixel_translation = pixel_pose[..., 0:3, 3]
  points = tf.einsum('hwij,hwj->hwi', pixel_rotation,
                     points) + pixel_translation
  if frame_pose is None:
    raise ValueError('frame_pose must be set when pixel_pose is set.')

  # To vehicle frame corresponding to the given frame_pose
  # [4, 4]
  world_to_vehicle = tf.matrix_inverse(frame_pose)
  vehicle_rotation = world_to_vehicle[0:3, 0:3]
  vehicle_translation = world_to_vehicle[0:3, 3]
  # [H, W, 3]
  points = tf.einsum('ij,hwj->hwi', vehicle_rotation,
                     points) + vehicle_translation[tf.newaxis, tf.newaxis, :]
  return points
def ComputePredictions(self, theta, input_batch):
  """Computes predictions for `input_batch`.

  Args:
    theta: A `.NestedMap` object containing variable values of this task.
    input_batch: A `.NestedMap` expected to contain lasers.points_xyz,
      lasers.points_feature, lasers.points_padding, cell_center_xyz,
      cell_points_xyz, cell_feature, anchor_bboxes,
      anchor_localization_residuals, assigned_gt_labels, and
      assigned_cls_mask. See class doc string for details.

  Returns:
    A `.NestedMap` object containing residuals and classification_logits.
  """
  p = self.params
  input_batch.Transform(lambda x: (x.shape, x.shape.num_elements())).VLog(
      1, 'input_batch shapes: ')
  cell_feature = py_utils.HasRank(input_batch.cell_feature, 4)
  batch_size, num_centers = py_utils.GetShape(cell_feature, 2)
  per_anchor_shape = [batch_size, num_centers, p.num_anchor_bboxes_per_center]

  featurized_cell = self._CellFeaturizer(theta, input_batch)

  # Project each featurized_cell features to each bbox per center.
  featurized_anchors = self.cell_feature_projector.FProp(
      theta.cell_feature_projector, featurized_cell)

  # Reshape output so that we have features per offset.
  featurized_anchors = tf.reshape(
      featurized_anchors,
      [batch_size, num_centers, p.num_anchor_bboxes_offsets, -1])

  # Predict localization residuals.
  predicted_residuals = tf.reshape(
      self.localization_regressor.FProp(theta.localization_regressor,
                                        featurized_anchors),
      per_anchor_shape + [7])

  if p.oracle_location or p.oracle_dimension or p.oracle_rotation:
    gt_residuals = py_utils.HasShape(
        input_batch.anchor_localization_residuals, per_anchor_shape + [7])
    # Each residual component comes from the ground truth when its oracle
    # flag is set, otherwise from the prediction:
    # location = [0:3], dimension = [3:6], rotation = [6:].
    component_sources = (
        (p.oracle_location, slice(0, 3)),
        (p.oracle_dimension, slice(3, 6)),
        (p.oracle_rotation, slice(6, None)),
    )
    residuals = []
    for use_oracle, component in component_sources:
      source = gt_residuals if use_oracle else predicted_residuals
      residuals.append(source[..., component])
    predicted_residuals = tf.concat(residuals, axis=-1)

  if p.squash_rotation_predictions:
    # Squash the rotation residual into (-pi, pi) with a scaled tanh.
    squashed_rotations = np.pi * tf.tanh(predicted_residuals[..., 6:])
    predicted_residuals = tf.concat(
        [predicted_residuals[..., :6], squashed_rotations], axis=-1)

  # Predict object classification at each bbox.
  predicted_classification_logits = tf.reshape(
      self.classifier.FProp(theta.classifier, featurized_anchors),
      per_anchor_shape + [p.num_classes])

  if p.oracle_classification:
    # Replace the classifier output with the one-hot ground truth labels.
    assigned_gt_labels = py_utils.HasShape(input_batch.assigned_gt_labels,
                                           list(per_anchor_shape))
    predicted_classification_logits = tf.one_hot(assigned_gt_labels,
                                                 p.num_classes)

  return py_utils.NestedMap({
      'residuals': predicted_residuals,
      'classification_logits': predicted_classification_logits,
  })
def FProp(self, theta, inputs, paddings):
  """Applies the convolution to `inputs` and computes the output paddings.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: The inputs tensor. It is expected to be of shape [batch, time,
      frequency, channel]. The time dimension corresponds to the height
      dimension as in images and the frequency dimension corresponds to the
      width dimension as in images.
    paddings: The paddings tensor, expected to be of shape [batch, time].

  Returns:
    outputs, out_paddings pair.
  """
  p = self.params
  with tf.name_scope(p.name):
    # Shape checks: paddings must be rank 2, and inputs must be
    # paddings.shape + [anything, input_channels].
    paddings_check = py_utils.assert_shape_match(tf.shape(paddings), [-1, -1])
    actual_inputs_shape = tf.shape(inputs)
    wanted_inputs_shape = tf.concat(
        [tf.shape(paddings), [-1, symbolic.ToStatic(self.input_channels)]], 0)
    inputs_check = py_utils.assert_shape_match(actual_inputs_shape,
                                               wanted_inputs_shape)
    inputs = py_utils.with_dependencies([paddings_check, inputs_check], inputs)

    # Zero out the padded positions of the inputs: broadcast the paddings
    # over the two trailing input dims and multiply by (1 - padding).
    broadcast_paddings = tf.expand_dims(tf.expand_dims(paddings, -1), -1)
    inputs = inputs * (1.0 - broadcast_paddings)

    # Apply the convolution itself.
    if not p.v2_padding:
      out = self._ApplyConv(theta, inputs)
    else:
      padded_inputs, slice_len = _PadForLengthCompatibleStridesV2(
          inputs, p.filter_stride[0], 'SAME', 0.0)
      out = self._ApplyConv(theta, padded_inputs)
      if p.filter_stride[0] > 1:
        # Trim `slice_len` trailing steps from the time axis of the output.
        keep_until = py_utils.GetShape(out)[1] - slice_len
        out = out[:, :keep_until, :, :]

    if p.partial_conv:
      out = self._RescaleBoundary(out, paddings)

    # NOTE: the output padding may be slightly inaccurate when
    # p.dilation_rate[0] > 1, but that is unlikely to cause real problems.
    # Trying to account for it gives an error: pooling with SAME padding is
    # not implemented for dilation_rate > 1.
    # NOTE(review): the non-v2 branch passes the stride as `window` instead of
    # the filter extent -- presumably kept for compatibility with an older
    # implementation; consider updating it to be the actual shape.
    if not p.v2_padding:
      conv_padding = ComputeConvOutputPadding(
          paddings, window=p.filter_stride[0], stride=p.filter_stride[0])
    else:
      conv_padding = _ComputeConvOutputPaddingV2(
          paddings, window=p.filter_shape[0], stride=p.filter_stride[0])

    # The output is deliberately not re-masked with `conv_padding` here; it is
    # assumed that subsequent layers zero out padded nodes if necessary.
    expected_out_shape = symbolic.ToStatic(self.OutShape(tf.shape(inputs)))
    out = py_utils.HasShape(out, expected_out_shape)
    return out, conv_padding
def PadOne(inp):
  """Appends a constant 1.0 to the last axis of a rank-3 tensor.

  Args:
    inp: A tensor checked with `py_utils.HasShape` against [-1, -1, 3].

  Returns:
    `inp` padded with a single trailing 1.0 on the last axis; the first two
    axes are left untouched.
  """
  checked = py_utils.HasShape(inp, [-1, -1, 3])
  # One [before, after] pair per axis: only the last axis gets one extra slot.
  pad_amounts = [[0, 0], [0, 0], [0, 1]]
  return tf.pad(checked, pad_amounts, constant_values=1.0)
def ResidualsToBBoxes(self,
                      anchor_bboxes,
                      residuals,
                      min_angle_rad=-np.pi,
                      max_angle_rad=np.pi):
  r"""Decodes predicted residuals into bboxes relative to their anchors.

  The decoding applies the following formulae::

    diagonal_xy = sqrt(dx_a^2 + dy_a^2)

    x_predicted = x_a + x_residual * diagonal_xy
    y_predicted = y_a + y_residual * diagonal_xy
    z_predicted = z_a + z_residual * dz_a

    dx_predicted = dx_a * exp(dx_residual)
    dy_predicted = dy_a * exp(dy_residual)
    dz_predicted = dz_a * exp(dz_residual)

    # Add the residual and wrap the result into
    # [min_angle_rad, max_angle_rad].
    phi_predicted = WrapAngleRad(phi_a + phi_residual,
                                 min_angle_rad, max_angle_rad)

  These equations follow from those in LocalizationResiduals, solved for the
  \*_gt variables.

  Args:
    anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
      phi), corresponding to each anchor bbox parameters.
    residuals: tf.float32 of the same shape as anchor_bboxes containing
      predicted residuals at each anchor location.
    min_angle_rad: Scalar with the minimum angle allowed (before wrapping) in
      radians.
    max_angle_rad: Scalar with the maximum angle allowed (before wrapping) in
      radians. This value usually should be pi.

  Returns:
    A tf.float32 tensor of the same shape as anchor_bboxes with predicted
    bboxes.
  """
  # The last dim of the anchors must be exactly 7, and the residuals must
  # match the anchors' shape.
  bbox_shape = py_utils.GetShape(anchor_bboxes)
  last_dim_check = py_utils.assert_equal(bbox_shape[-1], 7)
  anchor_bboxes = py_utils.with_dependencies([last_dim_check], anchor_bboxes)
  residuals = py_utils.HasShape(residuals, bbox_shape)

  # Split both tensors into their 7 per-parameter components.
  ax, ay, az, adx, ady, adz, aphi = tf.unstack(anchor_bboxes, num=7, axis=-1)
  rx, ry, rz, rdx, rdy, rdz, rphi = tf.unstack(residuals, num=7, axis=-1)

  # Center offsets: x/y are scaled by the anchor's xy diagonal, z by the
  # anchor's height.
  xy_diagonal = tf.sqrt(tf.square(adx) + tf.square(ady))
  centers = [
      ax + rx * xy_diagonal,
      ay + ry * xy_diagonal,
      az + rz * adz,
  ]

  # Dimension residuals are log-scale factors applied to the anchor dims.
  dimensions = [
      adx * tf.exp(rdx),
      ady * tf.exp(rdy),
      adz * tf.exp(rdz),
  ]

  # The heading is wrapped into [min_angle_rad, max_angle_rad]; the right
  # range depends on how the calling model handles heading:
  # - If the loss uses a sine(delta_phi) transformation, direction cannot be
  #   distinguished and [0, np.pi] should be used.
  # - If the heading encoding is directional, [-np.pi, np.pi] is most likely
  #   the right choice.
  heading = geometry.WrapAngleRad(aphi + rphi, min_angle_rad, max_angle_rad)

  return tf.stack(centers + dimensions + [heading], axis=-1)