Example #1
  def testTransformerStackV2(self, use_v1_stack=False, stride=1, first_n=None):
    with self.session(use_gpu=False) as sess:
      bs = 2
      sl = 21
      d = 16
      tf.random.set_seed(12345)
      atten_builder = self_attention.Builder.Params().Set(
          model_dim=d,
          num_heads=2,
          ff_hidden_dim=5,
          deterministic_dropout=False,
          num_splits=1,
          num_micro_batches=1)
      builder = atten_builder.Instantiate()
      if use_v1_stack:
        p = builder.TransformerStack('atten', num_layers=3)
      else:
        p = builder.TransformerStackV2(
            'atten',
            num_layers=3,
            final_layer_stride=stride,
            final_layer_first_n=first_n)
      p.params_init = py_utils.WeightInit.Xavier(scale=1.0, seed=0)
      l = p.Instantiate()
      self.assertAllEqual([
          'atten/iter_000/block/ff/feedforward/bias01/b/var',
          'atten/iter_000/block/ff/feedforward/bias02/b/var',
          'atten/iter_000/block/ff/feedforward/linear01/w/var',
          'atten/iter_000/block/ff/feedforward/linear02/w/var',
          'atten/iter_000/block/ff/feedforward/ln/bias/var',
          'atten/iter_000/block/ff/feedforward/ln/scale/var',
          'atten/iter_000/block/self_atten/LN/bias/var',
          'atten/iter_000/block/self_atten/LN/scale/var',
          'atten/iter_000/block/self_atten/atten/key/b/var',
          'atten/iter_000/block/self_atten/atten/key/w/var',
          'atten/iter_000/block/self_atten/atten/per_dim_scale/per_dim_scale/var',
          'atten/iter_000/block/self_atten/atten/post/b/var',
          'atten/iter_000/block/self_atten/atten/post/w/var',
          'atten/iter_000/block/self_atten/atten/query/b/var',
          'atten/iter_000/block/self_atten/atten/query/w/var',
          'atten/iter_000/block/self_atten/atten/value/b/var',
          'atten/iter_000/block/self_atten/atten/value/w/var',
          'atten/iter_001/block/ff/feedforward/bias01/b/var',
          'atten/iter_001/block/ff/feedforward/bias02/b/var',
          'atten/iter_001/block/ff/feedforward/linear01/w/var',
          'atten/iter_001/block/ff/feedforward/linear02/w/var',
          'atten/iter_001/block/ff/feedforward/ln/bias/var',
          'atten/iter_001/block/ff/feedforward/ln/scale/var',
          'atten/iter_001/block/self_atten/LN/bias/var',
          'atten/iter_001/block/self_atten/LN/scale/var',
          'atten/iter_001/block/self_atten/atten/key/b/var',
          'atten/iter_001/block/self_atten/atten/key/w/var',
          'atten/iter_001/block/self_atten/atten/per_dim_scale/per_dim_scale/var',
          'atten/iter_001/block/self_atten/atten/post/b/var',
          'atten/iter_001/block/self_atten/atten/post/w/var',
          'atten/iter_001/block/self_atten/atten/query/b/var',
          'atten/iter_001/block/self_atten/atten/query/w/var',
          'atten/iter_001/block/self_atten/atten/value/b/var',
          'atten/iter_001/block/self_atten/atten/value/w/var',
          'atten/iter_002/block/ff/feedforward/bias01/b/var',
          'atten/iter_002/block/ff/feedforward/bias02/b/var',
          'atten/iter_002/block/ff/feedforward/linear01/w/var',
          'atten/iter_002/block/ff/feedforward/linear02/w/var',
          'atten/iter_002/block/ff/feedforward/ln/bias/var',
          'atten/iter_002/block/ff/feedforward/ln/scale/var',
          'atten/iter_002/block/self_atten/LN/bias/var',
          'atten/iter_002/block/self_atten/LN/scale/var',
          'atten/iter_002/block/self_atten/atten/key/b/var',
          'atten/iter_002/block/self_atten/atten/key/w/var',
          'atten/iter_002/block/self_atten/atten/per_dim_scale/per_dim_scale/var',
          'atten/iter_002/block/self_atten/atten/post/b/var',
          'atten/iter_002/block/self_atten/atten/post/w/var',
          'atten/iter_002/block/self_atten/atten/query/b/var',
          'atten/iter_002/block/self_atten/atten/query/w/var',
          'atten/iter_002/block/self_atten/atten/value/b/var',
          'atten/iter_002/block/self_atten/atten/value/w/var',
      ], [var.op.name for var in tf.nest.flatten(l.vars)])
      input_embs = tf.constant(
          np.random.random(size=[bs, sl, d]), dtype=tf.float32)
      paddings = tf.zeros([bs, sl])
      segment_mask = tf.zeros([bs, 1, sl, sl])

      out = l.FPropDefaultTheta(
          py_utils.NestedMap(
              vec=input_embs, paddings=paddings, segment_mask=segment_mask))
      enc_out = out.vec
      if first_n is None:
        first_n = sl
      enc_out = py_utils.HasShape(enc_out,
                                  [bs, (first_n + stride - 1) // stride, d])
      # Only test the value of the first token.
      enc_out = enc_out[:, :1, :]
      tf.logging.info('enc_out={}'.format(enc_out.shape))
      enc_out_sum = tf.reduce_sum(enc_out)

      tf.global_variables_initializer().run()
      actual_enc_out, actual_enc_out_sum = sess.run([enc_out, enc_out_sum])
      print('actual_enc_out_sum=', actual_enc_out_sum)

      self.assertAllEqual(actual_enc_out.shape, [bs, 1, d])
      self.assertAllClose(21.429626, actual_enc_out_sum, atol=1e-5)
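
The expected sequence length in the HasShape check above is a ceiling division: with a funnel stride on the final layer, first_n tokens are reduced to ceil(first_n / stride) outputs. A minimal check of the formula (plain Python; the helper name is hypothetical, and 21 matches the test's sl):

def _NumOutputTokens(first_n, stride):
  # Ceiling division, as used in the HasShape check above.
  return (first_n + stride - 1) // stride

assert _NumOutputTokens(21, 1) == 21
assert _NumOutputTokens(21, 2) == 11
assert _NumOutputTokens(5, 3) == 2
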
Example #2
    def ComputePredictions(self, theta, input_batch):
        """Computes predictions for `input_batch`.

    Args:
      theta: A `.NestedMap` object containing variable values of this task.
      input_batch: A `.NestedMap` expected to contain cell_center_xyz,
        cell_points_xyz, cell_feature, anchor_bboxes,
        anchor_localization_residuals, assigned_gt_labels, and
        assigned_cls_mask. See class doc string for details.

    Returns:
      A `.NestedMap` object containing residuals and classification_logits.
    """
        p = self.params
        input_batch.Transform(lambda x:
                              (x.shape, x.shape.num_elements())).VLog(
                                  1, 'input_batch shapes: ')

        cell_feature = py_utils.HasRank(input_batch.cell_feature, 4)
        batch_size, num_centers, num_points_per_cell = py_utils.GetShape(
            cell_feature, 3)

        cell_points_xyz = py_utils.HasShape(
            input_batch.cell_points_xyz,
            [batch_size, num_centers, num_points_per_cell, 3])
        cell_center_xyz = py_utils.HasShape(input_batch.cell_center_xyz,
                                            [batch_size, num_centers, 3])

        cell_points_padding = py_utils.HasShape(
            input_batch.cell_points_padding,
            [batch_size, num_centers, num_points_per_cell])

        # TODO(jngiam): Make concat_feature computation a layer or configurable.
        cell_center_xyz = tf.reshape(cell_center_xyz,
                                     [batch_size, num_centers, 1, 3])
        centered_cell_points_xyz = cell_points_xyz - cell_center_xyz
        concat_feature = tf.concat([
            tf.tile(cell_center_xyz, [1, 1, num_points_per_cell, 1]),
            centered_cell_points_xyz, cell_feature
        ],
                                   axis=-1)  # pyformat: disable

        # Featurize point clouds at each center.
        point_input = py_utils.NestedMap({
            'points': centered_cell_points_xyz,
            'features': concat_feature,
            'padding': cell_points_padding,
        })
        featurized_cell = self.cell_featurizer.FProp(theta.cell_featurizer,
                                                     point_input)
        featurized_cell = py_utils.HasShape(featurized_cell,
                                            [batch_size, num_centers, -1])

        # Predict localization residuals.
        predicted_residuals = self.localization_regressor.FProp(
            theta.localization_regressor, featurized_cell)
        predicted_residuals = tf.reshape(
            predicted_residuals,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center, 7])

        if p.squash_rotation_predictions:
            predicted_rotations = predicted_residuals[..., 6:]
            predicted_rotations = np.pi * tf.tanh(predicted_rotations)
            predicted_residuals = tf.concat(
                [predicted_residuals[..., :6], predicted_rotations], axis=-1)

        # Predict object classification at each bbox.
        predicted_classification_logits = self.classifier.FProp(
            theta.classifier, featurized_cell)
        predicted_classification_logits = tf.reshape(
            predicted_classification_logits, [
                batch_size, num_centers, p.num_anchor_bboxes_per_center,
                p.num_classes
            ])

        return py_utils.NestedMap({
            'residuals':
            predicted_residuals,
            'classification_logits':
            predicted_classification_logits,
        })
Example #3
  def Decode(self, input_batch):
    """Decode an input batch, computing predicted bboxes from residuals."""
    p = self.params

    bboxes_and_logits = self._BBoxesAndLogits(input_batch)
    predicted_bboxes = bboxes_and_logits.predicted_bboxes
    batch_size, num_bboxes, _ = py_utils.GetShape(predicted_bboxes, 3)
    classification_logits = bboxes_and_logits.classification_logits
    classification_logits = py_utils.HasShape(
        classification_logits, [batch_size, num_bboxes, p.num_classes])

    classification_scores = tf.sigmoid(classification_logits)

    with tf.device('/cpu:0'):
      # Decode the predicted bboxes, performing NMS.
      per_cls_bboxes, per_cls_bbox_scores, per_cls_valid_mask = (
          detection_decoder.DecodeWithNMS(
              predicted_bboxes,
              classification_scores,
              nms_iou_threshold=p.nms_iou_threshold,
              score_threshold=p.nms_score_threshold,
              max_boxes_per_class=p.max_nms_boxes,
              use_oriented_per_class_nms=p.use_oriented_per_class_nms))

      # per_cls_valid_mask is [batch, num_classes, num_boxes] Tensor that
      # indicates which boxes were selected by NMS. Each example will have a
      # different number of chosen bboxes, so the mask is present to allow us
      # to keep the boxes as a batched dense Tensor.
      #
      # We mask the scores by per_cls_valid_mask so that the scores of boxes not
      # selected by NMS are zeroed out and will not be interpreted as valid.
      per_cls_bbox_scores *= per_cls_valid_mask
      visualization_weights = py_utils.HasShape(
          per_cls_bbox_scores, [batch_size, p.num_classes, p.max_nms_boxes])

      # For top down visualization, filter boxes whose scores are not above the
      # visualization threshold.
      visualization_weights = tf.where(
          tf.greater_equal(visualization_weights,
                           p.visualization_classification_threshold),
          visualization_weights, tf.zeros_like(visualization_weights))

    model_outputs = py_utils.NestedMap()
    model_outputs.per_class_predicted_bboxes = per_cls_bboxes
    model_outputs.per_class_predicted_bbox_scores = per_cls_bbox_scores
    model_outputs.per_class_valid_mask = per_cls_valid_mask

    decoder_outputs = py_utils.NestedMap({
        'per_class_predicted_bboxes': per_cls_bboxes,
        'per_class_predicted_bbox_scores': per_cls_bbox_scores,
        'per_class_valid_mask': per_cls_valid_mask,
        'visualization_weights': visualization_weights,
    })

    decoder_outputs.update(
        self.output_decoder.ProcessOutputs(input_batch, model_outputs))

    # Produce global step as an output (which is the step
    # of the checkpoint being decoded).
    decoder_outputs.global_step = py_utils.GetGlobalStep()

    return decoder_outputs
Example #4
def MaxPool3D(points, point_features, pooling_idx, closest_idx):
    """Apply max pooling to a point cloud with computed sampling indices.

  pooling_idx and closest_idx are the outputs of a sampler such as
  FurthestPointSampler.

  The pooling operation results in a point cloud with fewer points, where the
  pooled points are specified by pooling_idx. Each element of pooling_idx is an
  integer in the range [0, P1) giving the index of the point in
  points/point_features.

  Max pooling is performed by assigning each point to its closest pooled point,
  and then taking a max over the features of points assigned. We assume that
  this mapping is provided by closest_idx, where each element should contain
  an integer in the range [0, P2) containing the index of the pooled point that
  each point is assigned to.

  Note: This logic for pooling assumes that there will be at least
  one value > 0 per sampled region for each feature, otherwise it will return 0.
  Additionally, it does a reduce over a masked version of the features, so
  mean and min would not work without a change in the logic.

  Args:
    points: a floating point tf.Tensor with shape [N, P1, 3]
    point_features: a floating point tf.Tensor with shape [N, P1, C]
    pooling_idx: A tf.int32 tf.Tensor of shape [N, P2] with the index of which
      points we want to keep. Each value should be in the range [0, P1).
    closest_idx: A tf.int32 tf.Tensor of shape [N, P1] representing which
      sampled point is closest to each original point. Each value should be in
      the range [0, P2).

  Returns:
    A tuple of tf.Tensors (pooled_points, pooled_features).

    pooled_points has shape [N, P2, 3] representing the locations of each
    selected point. P2 corresponds to num_pooled_points.

    pooled_features has shape [N, P2, C] representing the pooled features at
    each point.
  """
    batch_size, num_points = py_utils.GetShape(points, 2)
    point_features = py_utils.HasShape(point_features,
                                       [batch_size, num_points, -1])
    pooling_idx = py_utils.HasShape(pooling_idx, [batch_size, -1])
    _, num_output_points = py_utils.GetShape(pooling_idx)
    _, _, feature_dims = py_utils.GetShape(point_features, 3)

    # Gather new point locations.
    pooled_points = tf.batch_gather(points, pooling_idx)

    mask = tf.one_hot(closest_idx, num_output_points)  # [N, P1, P2]
    mask = tf.transpose(mask, [2, 0, 1])  # [P2, N, P1]

    def _PartialPoolFeaturesFn(partial_mask):
        partial_mask = tf.tile(
            tf.reshape(partial_mask, [batch_size, num_points, 1]),
            [1, 1, feature_dims])
        # Note: This method of pooling assumes there will be a value > 0
        # and will only work with max under this condition.
        return tf.reduce_max(partial_mask * point_features, axis=1)

    # Performing a map_fn over the pooled points is more memory efficient.
    pooled_point_features = tf.map_fn(_PartialPoolFeaturesFn,
                                      mask)  # [P2, N, C]
    pooled_point_features = tf.transpose(pooled_point_features, [1, 0, 2])

    return pooled_points, pooled_point_features
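
A minimal usage sketch for MaxPool3D above (hypothetical shapes and index values; assumes the function and the lingvo-style tf/py_utils imports used in this listing are available): one batch of six points with 2-D features is pooled down to two points.

import numpy as np
import tensorflow as tf

points = tf.constant(np.random.rand(1, 6, 3), dtype=tf.float32)
point_features = tf.constant(np.random.rand(1, 6, 2), dtype=tf.float32)

# Keep points 0 and 3 as the pooled centers ...
pooling_idx = tf.constant([[0, 3]], dtype=tf.int32)
# ... and assign the first three points to center 0, the last three to center 1.
closest_idx = tf.constant([[0, 0, 0, 1, 1, 1]], dtype=tf.int32)

pooled_points, pooled_features = MaxPool3D(points, point_features, pooling_idx,
                                           closest_idx)
# pooled_points: [1, 2, 3]; pooled_features: [1, 2, 2], the max over each group
# of assigned points.
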
Example #5
  def ProcessOutputs(self, input_batch, model_outputs):
    """Produce additional decoder outputs for KITTI.

    Args:
      input_batch: A .NestedMap of the inputs to the model.
      model_outputs: A .NestedMap of the outputs of the model, including::
        - per_class_predicted_bboxes: [batch, num_classes, num_boxes, 7] float
          Tensor with per class 3D (7 DOF) bounding boxes.
        - per_class_predicted_bbox_scores: [batch, num_classes, num_boxes] float
          Tensor with per class, per box scores.
        - per_class_valid_mask: [batch, num_classes, num_boxes] masking Tensor
          indicating which boxes were still kept after NMS for each class.

    Returns:
      A NestedMap of additional decoder outputs needed for
      PostProcessDecodeOut.
    """
    p = self.params
    per_class_predicted_bboxes = model_outputs.per_class_predicted_bboxes
    batch_size, num_classes, num_boxes, _ = py_utils.GetShape(
        per_class_predicted_bboxes)
    flattened_num_boxes = num_classes * num_boxes

    input_labels = input_batch.decoder_copy.labels
    input_lasers = input_batch.decoder_copy.lasers
    input_images = input_batch.decoder_copy.images

    with tf.device('/cpu:0'):
      # Convert the predicted bounding box points to their corners
      # and then project them to the image plane.
      #
      # This output can be used to:
      #
      # A) Visualize bounding boxes (2d or 3d) on the camera image.
      #
      # B) Compute the height of the predicted boxes to filter 'too small' boxes
      #    as is done in the KITTI eval.
      predicted_bboxes = tf.reshape(per_class_predicted_bboxes,
                                    [batch_size, flattened_num_boxes, 7])
      bbox_corners = geometry.BBoxCorners(predicted_bboxes)
      bbox_corners = py_utils.HasShape(bbox_corners,
                                       [batch_size, flattened_num_boxes, 8, 3])
      utils_3d = detection_3d_lib.Utils3D()
      bbox_corners_image = utils_3d.CornersToImagePlane(
          bbox_corners, input_images.velo_to_image_plane)
      bbox_corners_image = py_utils.HasShape(
          bbox_corners_image, [batch_size, flattened_num_boxes, 8, 2])

      # Clip the bounding box corners so they remain within
      # the image coordinates.
      bbox2d_corners_image_clipped = self._BBox2DImage(bbox_corners_image,
                                                       input_images)
      bbox2d_corners_image_clipped = py_utils.HasShape(
          bbox2d_corners_image_clipped, [batch_size, flattened_num_boxes, 4])

      # Compute the frustum mask to filter out bounding boxes that
      # are 'outside the frustum'.
      frustum_mask = self._CreateFrustumMask(bbox_corners_image,
                                             bbox2d_corners_image_clipped,
                                             input_images.height,
                                             input_images.width)

      # Reshape all of these back to [batch_size, num_classes, num_boxes, ...]
      bbox_corners_image = tf.reshape(
          bbox_corners_image, [batch_size, num_classes, num_boxes, 8, 2])

      bbox2d_corners_image_clipped = tf.reshape(
          bbox2d_corners_image_clipped, [batch_size, num_classes, num_boxes, 4])
      frustum_mask = tf.reshape(frustum_mask,
                                [batch_size, num_classes, num_boxes])

    ret = py_utils.NestedMap({
        # For mAP eval
        'source_ids': input_labels.source_id,
        'difficulties': input_labels.difficulties,
        'num_points_in_bboxes': input_batch.labels.bboxes_3d_num_points,
        # For exporting.
        'velo_to_image_plane': input_images.velo_to_image_plane,
        'velo_to_camera': input_images.velo_to_camera,
        # Predictions.
        'bbox_corners_image': bbox_corners_image,
        'bbox2d_corners_image': bbox2d_corners_image_clipped,
        'frustum_mask': frustum_mask,
        # Ground truth.
        'bboxes_3d': input_labels.bboxes_3d,
        'bboxes_3d_mask': input_labels.bboxes_3d_mask,
        'unfiltered_bboxes_3d_mask': input_labels.unfiltered_bboxes_3d_mask,
        'labels': input_labels.labels,
    })

    laser_sample = self._SampleLaserForVisualization(
        input_lasers.points_xyz, input_lasers.points_padding)
    ret.update(laser_sample)

    if p.summarize_boxes_on_image:
      ret.camera_images = input_images.image
    return ret
Example #6
    def FProp(self, theta, inputs, paddings, state0, labels=None):
        """Forward compute."""
        p = self.params

        ids = py_utils.HasRank(inputs, 2)
        paddings = py_utils.HasShape(paddings, tf.shape(ids))
        seqlen, batch = tf.unstack(tf.shape(inputs), num=2)
        assert state0

        paddings_3d = tf.expand_dims(paddings, axis=2)

        # RNNs
        if p.shared_emb:
            emb_act = [self.emb.EmbLookup(theta.emb, inputs)
                       ] * (1 + p.number_of_experts)
        else:
            emb_act = [
                self.emb[i].EmbLookup(theta.emb[i], inputs)
                for i in range(1 + p.number_of_experts)
            ]
        state1 = py_utils.NestedMap(rnns=[])
        rnns_act = []
        for i, act in enumerate(emb_act):
            act, state = self.rnns[i].FProp(theta.rnns[i], act, paddings_3d,
                                            state0.rnns[i])
            act = py_utils.HasRank(act, 3)
            rnns_act += [act]
            state1.rnns += [state]

        # [time, batch, experts, dims].
        expert_stacked = tf.stack(rnns_act[1:], axis=2)

        # Compute the gating softmax. The 0-th rnn acts as the domain predictor
        # whose output is used to gate the experts. Because SoftmaxLayer.Logits
        # takes a matrix as input, we reshape rnns_act[0], the domain predictor
        # activation, to a matrix here.
        act = tf.reshape(rnns_act[0], [seqlen * batch, -1])
        logits = self.domain_predictor_softmax.Logits(
            theta.domain_predictor_softmax, act)
        # [time, batch, experts]
        gating = tf.reshape(tf.nn.softmax(logits), [seqlen, batch, -1])

        # Mix the experts.
        # [time, batch, dims]
        combined = tf.squeeze(
            tf.matmul(
                # [time, batch, 1, experts]
                tf.expand_dims(gating, axis=2),
                # [time, batch, experts, dims]
                expert_stacked),
            axis=2)

        if p.add_postgating_rnn:
            # Note that this layer includes 1 or more RNN layers followed
            # by a softmax.
            xent_loss, state1.merge = self.merge.FProp(theta.merge, combined,
                                                       paddings, state0.merge,
                                                       labels)
        else:
            xent_loss = self.output_softmax.FProp(
                theta=theta.output_softmax,
                inputs=combined,
                class_weights=labels.class_weights,
                class_ids=labels.class_ids)

        return xent_loss, state1
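
The mixing step above contracts the experts axis: for each (time, batch) position, a [1, experts] gating row is matrix-multiplied with the [experts, dims] stack and the singleton axis is squeezed away. A small standalone sketch (plain TensorFlow, arbitrary shapes) showing that this is simply a gating-weighted sum of the expert activations:

import tensorflow as tf

time_steps, batch, experts, dims = 3, 2, 4, 5
gating = tf.nn.softmax(tf.random.normal([time_steps, batch, experts]))  # mixture weights
expert_stacked = tf.random.normal([time_steps, batch, experts, dims])   # expert outputs

# The matmul/squeeze pattern used in FProp above ...
combined = tf.squeeze(
    tf.matmul(tf.expand_dims(gating, axis=2), expert_stacked), axis=2)

# ... equals an explicit weighted sum over the experts axis.
weighted_sum = tf.reduce_sum(gating[..., tf.newaxis] * expert_stacked, axis=2)
# tf.debugging.assert_near(combined, weighted_sum) passes up to float tolerance.
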
Example #7
def NeighborhoodIndices(points,
                        query_points,
                        k,
                        points_padding=None,
                        max_distance=None,
                        sample_neighbors_uniformly=False):
    """Get indices to k-neighbors of query_points in points.

  Padding is returned alongside indices. Non-padded points are guaranteed to
  be unique (non-repeated) points from original non-padded points.

  Padded points arise either from a lack of points (k exceeds the number of
  original non-padded points) or from points being too far away (exceeding
  max_distance).

  Note: Padded point indices may refer to padded points from the original, or
  may be duplicates of the closest point.

  TODO(weihan,jngiam): PointCNN implementation makes an assumption that padded
  points are repeated points from the original points. This behavior is
  maintained here, but we should update PointCNN to respect indices paddings.

  Args:
    points: tensor of shape [N, P1, dims].
    query_points: tensor of shape [N, P2, dims]
    k: Integer.
    points_padding: optional tensor of shape [N, P1] containing True/1.0 iff the
      point is a padded point. If None, then all points are considered real
      points.
    max_distance: float representing the maximum distance that each neighbor can
      be. If there are no points within the distance, then the closest point is
      returned (regardless of distance). If this is set to None, then no
      filtering by distance is performed.
    sample_neighbors_uniformly: boolean specifying whether to sample neighbors
      uniformly if they are within max distance.

  Returns:
    A pair of tensors:

    - indices: tensor of shape [N, P2, k].
    - padding: tensor of shape [N, P2, k] where 1 represents a padded point, and
      0 represents an unpadded (real) point.

  """
    n, p1 = py_utils.GetShape(points, 2)
    query_points = py_utils.HasShape(query_points, [n, -1, -1])
    _, p2 = py_utils.GetShape(query_points, 2)

    # Compute pair-wise squared distances.
    # Note that dist_mat contains the squared distance (without sqrt). Thus, when
    # using max_distance, we will need to square max_distance to make sure it's
    # in the same units.
    dist_mat = SquaredDistanceMatrix(query_points, points)
    dist_mat = py_utils.HasShape(dist_mat, [n, p2, p1])

    # Add a large scalar to the distances for padded points.
    # dist_mat[i, j, k] will be:
    #   if k < valid_num[i]: distance between points[i, k] and query_points[i, j]
    #   otherwise:           a large scalar added to dist_mat[i, j, k]
    if points_padding is not None:
        points_padding = tf.cast(tf.expand_dims(points_padding, 1), tf.float32)
        points_padding = py_utils.HasShape(points_padding, [n, 1, p1])
        large_scalar = tf.reduce_max(dist_mat) + 1
        dist_mat += points_padding * large_scalar

    # To perform sampling neighbors uniformly efficiently, we set all neighbors
    # that are within the distance threshold to have distances be drawn uniformly
    # at random. Using top_k with this enables selecting a random set quickly
    # without replacement.
    if sample_neighbors_uniformly:
        if max_distance is not None:
            mask_by_distance = tf.less_equal(dist_mat, max_distance**2)
            dist_mat = tf.where(
                mask_by_distance,
                tf.square(max_distance) *
                tf.random_uniform(tf.shape(dist_mat)), dist_mat)
        else:
            raise ValueError(
                'Uniform sampling requires specifying max_distance.')

    top_k_dist, indices = tf.nn.top_k(-dist_mat, k=k,
                                      sorted=True)  # N x P2 x K

    # Set padding using top_k_dist; padded points will have distance exceeding
    # the large_scalar.
    if points_padding is not None:
        paddings = tf.greater_equal(-top_k_dist, large_scalar)
    else:
        paddings = tf.zeros_like(top_k_dist, dtype=tf.bool)

    # Filter by max_distance by setting all indices whose distance exceeds
    # max_distance to the closest point.
    if max_distance is not None:
        # Mask is true for points that are further than max_distance.
        mask_by_distance = tf.greater(-top_k_dist, tf.square(max_distance))
        closest_idx = tf.tile(indices[:, :, :1], [1, 1, k])
        indices = tf.where(mask_by_distance, closest_idx, indices)
        paddings |= mask_by_distance

    indices = tf.reshape(indices, [n, p2, k])
    paddings = tf.cast(paddings, tf.float32)

    return indices, paddings
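
A minimal usage sketch for NeighborhoodIndices above (hypothetical shapes; assumes the function and its SquaredDistanceMatrix/py_utils dependencies from this listing are importable): two query points look up their 3 nearest neighbors among 5 points in a single batch.

import numpy as np
import tensorflow as tf

points = tf.constant(np.random.rand(1, 5, 3), dtype=tf.float32)
query_points = points[:, :2, :]  # query the first two points against all five

indices, padding = NeighborhoodIndices(points, query_points, k=3)
# indices: [1, 2, 3] int32 neighbor indices into `points`.
# padding: [1, 2, 3] float32, all zeros here because neither points_padding nor
# max_distance was given, so every returned neighbor is a real point.
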
Example #8
    def FProp(self, theta, input_batch):
        # pyformat: disable
        """Compute features for the pillars and convert them back to a dense grid.

    Args:
      theta: A `.NestedMap` object containing variable values of this task.
      input_batch: A `.NestedMap` object containing input tensors. Following
        keys are required:

        - grid_num_points: Integer tensor with shape [batch size, nx, ny, nz],
          where nx, ny, nz correspond to the grid sizes (i.e., number of voxels
          in each axis dimension).
        - pillar_points: Float tensor with shape [batch size, num_pillars,
          num_points_per_pillar, 3 + num_laser_features]
        - pillar_centers: Float tensor with shape [batch size, num_pillars,
          num_points_per_pillar, 3]
        - pillar_locations: Float tensor with shape [batch size, num_pillars, 3]

    Returns:
      The dense features with shape [b, nx, ny, nz * fdims].
    """
        # pyformat: enable
        p = self.params
        bs, nx, ny, nz = py_utils.GetShape(input_batch.grid_num_points, 4)
        # Process points to concatenate a set of fixed features (e.g.,
        # add means, centers, normalize points to means).
        num_features = 3 + p.num_laser_features
        pillar_points = py_utils.HasShape(input_batch.pillar_points,
                                          [bs, -1, -1, num_features])
        _, npillars, npoints, _ = py_utils.GetShape(pillar_points, 4)
        pillar_xyz = pillar_points[..., :3]

        # Compute number of points per pillar and prepare for broadcasting.
        pillar_num_points = tf.gather_nd(input_batch.grid_num_points,
                                         input_batch.pillar_locations,
                                         batch_dims=1)
        pillar_num_points = pillar_num_points[..., tf.newaxis, tf.newaxis]

        # Compute mean by computing sum and dividing by number of points. Clip the
        # denominator by 1.0 to gracefully handle empty pillars.
        pillar_sum = tf.reduce_sum(pillar_xyz, axis=2, keepdims=True)
        pillar_means = pillar_sum / tf.maximum(
            tf.cast(pillar_num_points, tf.float32), 1.0)

        pillar_feats = pillar_points[..., 3:]
        pillar_centers = py_utils.HasShape(input_batch.pillar_centers,
                                           [bs, -1, 1, 3])
        pillar_concat = tf.concat(axis=3,
                                  values=[
                                      pillar_xyz - pillar_means, pillar_feats,
                                      tf.tile(pillar_means,
                                              [1, 1, npoints, 1]),
                                      tf.tile(pillar_centers,
                                              [1, 1, npoints, 1])
                                  ])
        # Featurize pillars.
        pillar_features = self.featurizer.FProp(theta.featurizer,
                                                pillar_concat)

        # Convert back to the dense grid.
        pillar_locations = py_utils.HasShape(input_batch.pillar_locations,
                                             [bs, npillars, 3])
        dense_features = SparseToDense(grid_shape=(nx, ny, nz),
                                       locations=pillar_locations,
                                       feats=pillar_features)
        return dense_features
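
A tiny numeric illustration of the denominator clip above (plain TensorFlow, made-up values): an empty pillar would otherwise divide 0/0 and produce NaNs, whereas the clip yields a zero mean.

import tensorflow as tf

pillar_sum = tf.constant([[0.0, 0.0, 0.0], [3.0, 6.0, 9.0]])  # per-pillar xyz sums
pillar_num_points = tf.constant([[0.0], [3.0]])               # 0 points vs. 3 points
pillar_means = pillar_sum / tf.maximum(pillar_num_points, 1.0)
# -> [[0., 0., 0.], [1., 2., 3.]] instead of [[nan, nan, nan], [1., 2., 3.]]
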
Example #9
    def Decode(self, input_batch):
        """Decode an input batch, computing predicted bboxes from residuals."""
        p = self.params

        predictions = self.ComputePredictions(self.theta, input_batch)
        bboxes_and_logits = self._BBoxesAndLogits(input_batch, predictions)
        predicted_bboxes = bboxes_and_logits.predicted_bboxes
        batch_size, num_bboxes, _ = py_utils.GetShape(predicted_bboxes, 3)
        classification_logits = bboxes_and_logits.classification_logits
        classification_logits = py_utils.HasShape(
            classification_logits, [batch_size, num_bboxes, p.num_classes])

        classification_scores = tf.sigmoid(classification_logits)

        _, per_example_dict = self.ComputeLoss(self.theta, predictions,
                                               input_batch)
        if 'score_scaler' in per_example_dict:
            classification_scores *= per_example_dict['score_scaler']

        with tf.device('/cpu:0'):
            # Decode the predicted bboxes, performing NMS.
            per_cls_idxs, per_cls_bboxes, per_cls_bbox_scores, per_cls_valid_mask = (
                detection_decoder.DecodeWithNMS(
                    predicted_bboxes,
                    classification_scores,
                    nms_iou_threshold=p.nms_iou_threshold,
                    score_threshold=p.nms_score_threshold,
                    max_boxes_per_class=p.max_nms_boxes,
                    use_oriented_per_class_nms=p.use_oriented_per_class_nms))

            # per_cls_valid_mask is [batch, num_classes, num_boxes] Tensor that
            # indicates which boxes were selected by NMS. Each example will have a
            # different number of chosen bboxes, so the mask is present to allow us
            # to keep the boxes as a batched dense Tensor.
            #
            # We mask the scores by per_cls_valid_mask so that the scores of
            # boxes not selected by NMS are zeroed out and will not be
            # interpreted as valid.
            per_cls_bbox_scores *= per_cls_valid_mask
            visualization_weights = py_utils.HasShape(
                per_cls_bbox_scores,
                [batch_size, p.num_classes, p.max_nms_boxes])

            # For top down visualization, filter boxes whose scores are not above the
            # visualization threshold.
            visualization_weights = tf.where(
                tf.greater_equal(visualization_weights,
                                 p.visualization_classification_threshold),
                visualization_weights, tf.zeros_like(visualization_weights))

        model_outputs = py_utils.NestedMap()
        model_outputs.per_class_predicted_bboxes = per_cls_bboxes
        model_outputs.per_class_predicted_bbox_scores = per_cls_bbox_scores
        model_outputs.per_class_valid_mask = per_cls_valid_mask

        decoder_outputs = py_utils.NestedMap({
            'per_class_predicted_bboxes':
            per_cls_bboxes,
            'per_class_predicted_bbox_scores':
            per_cls_bbox_scores,
            'per_class_valid_mask':
            per_cls_valid_mask,
            'visualization_weights':
            visualization_weights,
        })

        if p.decode_include_residuals:
            # Including the residuals in the decoder output makes it possible to save
            # the outputs for further analysis. Note that we ensure that the outputs
            # match the per-class NMS output format of [batch, num_classes, ...].
            def _ReshapeGather(tensor):
                """Reshapes tensor and then gathers using the nms indices."""
                tensor = tf.gather(tf.reshape(tensor,
                                              [batch_size, num_bboxes, -1]),
                                   per_cls_idxs,
                                   batch_dims=1)
                if not p.use_oriented_per_class_nms:
                    # Tile so that the data fits the expected per class shape of
                    # [batch_size, num_classes, ...]. When *not* using oriented NMS, the
                    # num_classes dimension will be missing since the indices will not
                    # have it.
                    tensor = tf.tile(tensor[:, tf.newaxis, :, :],
                                     [1, p.num_classes, 1, 1])
                return tensor

            decoder_outputs.update({
                'per_class_gt_residuals':
                _ReshapeGather(input_batch.anchor_localization_residuals),
                'per_class_gt_labels':
                _ReshapeGather(input_batch.assigned_gt_labels),
                'per_class_residuals':
                _ReshapeGather(predictions.residuals),
                'per_class_logits':
                _ReshapeGather(predictions.classification_logits),
                'per_class_anchor_boxes':
                _ReshapeGather(input_batch.anchor_bboxes),
            })

        decoder_outputs.update(
            self.output_decoder.ProcessOutputs(input_batch, model_outputs))

        # Produce global step as an output (which is the step
        # of the checkpoint being decoded).
        decoder_outputs.global_step = py_utils.GetGlobalStep()

        return decoder_outputs
Example #10
    def ComputePredictions(self, theta, input_batch):
        """Computes predictions for `input_batch`.

    Args:
      theta: A `.NestedMap` object containing variable values of this task.
      input_batch: A `.NestedMap` object containing input tensors to this tower.

    Returns:
      A `.NestedMap` contains
        logits - [b, nx, ny, nz, na, 7 + num_classes]
    """
        p = self.params
        input_batch.Transform(lambda x:
                              (x.shape, x.shape.num_elements())).VLog(
                                  0, 'input_batch shapes: ')

        # Make pillars representation from input_batch.
        dense_features = self.input_featurizer.FProp(theta.input_featurizer,
                                                     input_batch)

        # Backbone
        tf.logging.vlog(1, 'dense_features.shape = %s', dense_features.shape)
        act = self.backbone.FProp(theta.backbone, dense_features)
        tf.logging.vlog(1, 'act.shape = %s', act.shape)

        # Convert the output of the backbone into class logits and regression
        # residuals using two different layers.
        class_detection = self.class_detector.FProp(theta.class_detector, act)
        reg_detection = self.regression_detector.FProp(
            theta.regression_detector, act)
        bs, nx, ny, _ = py_utils.GetShape(class_detection, 4)
        predicted_classification_logits = tf.reshape(
            class_detection,
            [bs, nx, ny, p.grid_size_z, p.num_anchors, p.num_classes])
        predicted_residuals = tf.reshape(
            reg_detection, [bs, nx, ny, p.grid_size_z, p.num_anchors, 7])

        if p.squash_rotation_predictions:
            predicted_rotations = predicted_residuals[..., 6:]
            predicted_rotations = np.pi * tf.tanh(predicted_rotations)
            predicted_residuals = tf.concat(
                [predicted_residuals[..., :6], predicted_rotations], axis=-1)

        if p.oracle_location or p.oracle_dimension or p.oracle_rotation:
            gt_residuals = py_utils.HasShape(
                input_batch.anchor_localization_residuals,
                [bs, nx, ny, p.grid_size_z, p.num_anchors, 7])

            # Replace the predicted components with the ground truth if needed.
            if p.oracle_location:
                location = gt_residuals[..., 0:3]
            else:
                location = predicted_residuals[..., 0:3]

            if p.oracle_dimension:
                dimension = gt_residuals[..., 3:6]
            else:
                dimension = predicted_residuals[..., 3:6]

            if p.oracle_rotation:
                rotation = gt_residuals[..., 6:]
            else:
                rotation = predicted_residuals[..., 6:]
            predicted_residuals = tf.concat([location, dimension, rotation],
                                            axis=-1)

        ret = py_utils.NestedMap({
            'residuals':
            predicted_residuals,
            'classification_logits':
            predicted_classification_logits,
        })

        if p.direction_classifier_weight > 0.0:
            predicted_dir = self.direction_classifier.FProp(
                theta.direction_classifier, act)
            predicted_dir = tf.reshape(
                predicted_dir, [bs, nx, ny, p.grid_size_z, p.num_anchors, 2])
            ret.predicted_dir = predicted_dir

        return ret
Example #11
    def ComputeLoss(self, theta, predictions, input_batch):
        """Computes loss and other metrics for the given predictions.

    Args:
      theta: A `.NestedMap` object containing variable values of this task.
      predictions: The output of `ComputePredictions`, contains: logits - [b,
        nx, ny, nz, na, 7 + num_classes]. na is the number of anchor
        boxes per cell. [..., :7] are (dx, dy, dz, dw, dl, dh, dt).
      input_batch: The input batch from which we access the groundtruth.

    Returns:
      Two dicts defined as BaseTask.ComputeLoss.
    """
        p = self.params
        predicted_residuals = py_utils.HasShape(
            predictions.residuals, [-1, -1, -1, -1, p.num_anchors, 7])
        predicted_class_logits = py_utils.HasShape(
            predictions.classification_logits,
            [-1, -1, -1, -1, p.num_anchors, p.num_classes])
        bs, nx, ny, nz, na, _ = py_utils.GetShape(predicted_class_logits, 6)

        # Compute class and regression weights.
        class_weights = input_batch.assigned_cls_mask
        class_weights = py_utils.HasShape(class_weights, [bs, nx, ny, nz, na])
        reg_weights = input_batch.assigned_reg_mask
        reg_weights = py_utils.HasShape(reg_weights, [bs, nx, ny, nz, na])
        reg_weights = tf.expand_dims(reg_weights, -1)

        if p.loss_norm_type == LossNormType.NORM_BY_NUM_POSITIVES:
            # Compute number of positive anchors per example.
            foreground_mask = py_utils.HasShape(input_batch.assigned_reg_mask,
                                                [bs, nx, ny, nz, na])
            # Sum to get the number of foreground anchors for each example.
            loss_normalization = tf.reduce_sum(foreground_mask,
                                               axis=[1, 2, 3, 4])
            loss_normalization = tf.maximum(loss_normalization,
                                            tf.ones_like(loss_normalization))
            # Reshape for broadcasting.
            loss_normalization = tf.reshape(loss_normalization,
                                            [bs, 1, 1, 1, 1, 1])

            class_weights /= loss_normalization
            reg_weights /= loss_normalization

        # Classification loss.
        assigned_gt_labels = py_utils.HasShape(input_batch.assigned_gt_labels,
                                               [bs, nx, ny, nz, na])
        class_loss = py_utils.SigmoidCrossEntropyFocalLoss(
            logits=predicted_class_logits,
            labels=tf.one_hot(assigned_gt_labels, p.num_classes),
            alpha=p.focal_loss_alpha,
            gamma=p.focal_loss_gamma)
        class_loss *= class_weights[..., tf.newaxis]
        class_loss_sum = tf.reduce_sum(class_loss)

        # Regression loss.
        anchor_localization_residuals = py_utils.HasShape(
            input_batch.anchor_localization_residuals, [bs, nx, ny, nz, na, 7])

        # Location and dimensions loss.
        reg_loc_and_dims_loss = self._utils.ScaledHuberLoss(
            predictions=py_utils.HasShape(predicted_residuals[..., :6],
                                          [bs, nx, ny, nz, na, 6]),
            labels=anchor_localization_residuals[..., :6],
            delta=1 / (3.**2))

        # Rotation loss on the heading residual delta.
        rot_delta = (predicted_residuals[..., 6:] -
                     input_batch.anchor_localization_residuals[..., 6:])

        if p.use_atan2_heading_loss:
            atan2_of_delta = tf.atan2(tf.sin(rot_delta), tf.cos(rot_delta))
            reg_rot_loss = self._utils.ScaledHuberLoss(
                predictions=atan2_of_delta,
                labels=tf.zeros_like(atan2_of_delta),
                delta=1 / (3.**2))
        else:
            # Rotation loss with SmoothL1(sin(delta)).
            reg_rot_loss = self._utils.ScaledHuberLoss(
                predictions=tf.sin(rot_delta),
                labels=tf.zeros_like(rot_delta),
                delta=1 / (3.**2))

        # Direction loss
        if p.direction_classifier_weight > 0.0:
            # The target rotations are in the assigned_gt_bbox tensor,
            # which already has assigned a gt bounding box to every anchor.
            rot_target = input_batch.assigned_gt_bbox[..., 6]
            # If rotation is > 0, the class is 1, else it is 0.
            rot_dir = tf.cast(rot_target > 0., tf.int32)

            # Compute one-hot labels as a target.
            rot_dir_onehot = tf.one_hot(rot_dir, 2)

            # Manually handle loss reduction.
            dir_loss = tf.losses.softmax_cross_entropy(
                onehot_labels=rot_dir_onehot,
                logits=predictions.predicted_dir,
                weights=tf.squeeze(reg_weights, axis=-1),
                reduction=tf.losses.Reduction.NONE)
            # Reduce across all dimensions (we'll divide by the batch size below).
            dir_loss_sum = tf.reduce_sum(dir_loss)
        else:
            dir_loss_sum = 0.0

        # Compute loss contribution from location and dimension separately.
        reg_loc_loss = reg_loc_and_dims_loss[..., :3] * reg_weights
        reg_loc_loss_sum = tf.reduce_sum(reg_loc_loss)

        reg_dim_loss = reg_loc_and_dims_loss[..., 3:6] * reg_weights
        reg_dim_loss_sum = tf.reduce_sum(reg_dim_loss)

        # Compute rotation loss contribution.
        reg_rot_loss *= reg_weights
        reg_rot_loss_sum = tf.reduce_sum(reg_rot_loss)

        # Num. predictions.
        # TODO(zhifengc): Consider other normalization factors. E.g., # of bboxes.
        preds = tf.cast(bs, class_loss_sum.dtype)

        # Normalize all of the components by batch size.
        reg_loc_loss = reg_loc_loss_sum / preds
        reg_dim_loss = reg_dim_loss_sum / preds
        reg_rot_loss = reg_rot_loss_sum / preds
        class_loss = class_loss_sum / preds
        dir_loss = dir_loss_sum / preds

        # Compute total localization regression loss.
        reg_loss = (p.location_loss_weight * reg_loc_loss +
                    p.dimension_loss_weight * reg_dim_loss +
                    p.rotation_loss_weight * reg_rot_loss)

        # Apply weights to normalized class losses.
        loss = (class_loss * p.classification_loss_weight +
                reg_loss * p.localization_loss_weight +
                dir_loss * p.direction_classifier_weight)

        metrics_dict = {
            'loss': (loss, preds),
            'loss/class': (class_loss, preds),
            'loss/reg': (reg_loss, preds),
            'loss/reg/rot': (reg_rot_loss, preds),
            'loss/reg/loc': (reg_loc_loss, preds),
            'loss/reg/dim': (reg_dim_loss, preds),
            'loss/dir': (dir_loss, preds),
        }

        # Calculate dimension errors
        min_angle_rad = -np.pi if p.use_atan2_heading_loss else 0
        gt_bboxes = self._utils_3d.ResidualsToBBoxes(
            input_batch.anchor_bboxes,
            anchor_localization_residuals,
            min_angle_rad=min_angle_rad,
            max_angle_rad=np.pi)
        predicted_bboxes = self._utils_3d.ResidualsToBBoxes(
            input_batch.anchor_bboxes,
            predicted_residuals,
            min_angle_rad=min_angle_rad,
            max_angle_rad=np.pi)
        dimension_errors_dict = self._BBoxDimensionErrors(
            gt_bboxes, predicted_bboxes, reg_weights)
        metrics_dict.update(dimension_errors_dict)

        per_example_dict = {
            'residuals': predicted_residuals,
            'classification_logits': predicted_class_logits,
        }

        return metrics_dict, per_example_dict
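
A small numeric illustration (plain TensorFlow; the values are made up) of why tf.atan2(tf.sin(d), tf.cos(d)) in the heading loss above wraps the angle delta into [-pi, pi] before the Huber loss:

import numpy as np
import tensorflow as tf

delta = tf.constant([0.1, np.pi + 0.1, -(np.pi + 0.1), 2 * np.pi], dtype=tf.float32)
wrapped = tf.atan2(tf.sin(delta), tf.cos(delta))
# wrapped ~= [0.1, -(pi - 0.1), pi - 0.1, 0.0]: a full-turn error (2*pi) contributes
# no loss, and deltas just past +/-pi wrap back into [-pi, pi] instead of growing.
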
Example #12
  def _FProp(self, theta, source_encs, source_paddings, targets,
             src_segment_id):
    """Decodes `targets` given encoded source.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      source_encs: source encoding. When `p.is_transparent` is False, it is a
        tensor of shape [time, batch, depth]. When `p.is_transparent` is True,
        it is a tensor of shape [time, batch, depth, num_trans_layers] if
        `p.is_eval` is True, and a list of `num_trans_layers` tensors of shape
        [time, batch, depth] if `p.is_eval` is False.
      source_paddings: source encoding's padding, of shape [time, batch].
      targets: A dict of string to tensors representing the targets one tries to
        predict. Each tensor in targets is of shape [batch, time].
      src_segment_id: source segment id, of shape [time, batch].

    Returns:
      Output of last decoder layer, [target_time, target_batch, source_dim].
    """
    p = self.params
    time, batch = py_utils.GetShape(source_paddings, 2)
    if p.is_transparent:
      if p.is_eval:
        source_encs = py_utils.HasShape(
            source_encs, [time, batch, p.source_dim, p.num_trans_layers])
        source_encs = tf.unstack(source_encs, axis=3)
      else:
        assert isinstance(source_encs, list)
        assert len(source_encs) == p.num_trans_layers
        for i in range(p.num_trans_layers):
          source_encs[i] = py_utils.HasShape(source_encs[i],
                                             [time, batch, p.source_dim])
    else:
      source_encs = py_utils.HasShape(source_encs, [time, batch, p.source_dim])
      source_encs = [source_encs] * p.num_trans_layers
    with tf.name_scope(p.name):
      # [batch, time]
      target_ids = targets.ids
      # [time, batch]
      target_paddings = tf.transpose(targets.paddings)
      target_segment_pos = None
      target_segment_id = None
      if p.packed_input:
        target_segment_id = tf.transpose(targets.segment_ids)
        target_segment_pos = targets.segment_pos
        assert src_segment_id is not None, ('Need to provide src_segment_id '
                                            'for packed input.')

      # Embedding layer
      # [batch, time, model_dim]
      token_embs = self.token_emb.EmbLookup(theta.token_emb, target_ids)
      target_time = py_utils.GetShape(target_ids)[1]
      # [1, time, model_dim]
      if p.packed_input:
        posit_embs = self.position_emb.FPropWithPosition(
            theta.position_emb, target_segment_pos)
      else:
        posit_embs = tf.expand_dims(
            self.position_emb.FProp(theta.position_emb, target_time), 0)

      # [time, batch, model_dim]
      input_embs = token_embs + posit_embs

      if p.model_dim != p.token_emb.embedding_dim:
        input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

      input_embs = tf.transpose(input_embs, [1, 0, 2])
      input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

      atten_probs = []
      layer_in = input_embs
      for i, (layer, layer_theta) in enumerate(zip(self.trans, theta.trans)):
        # [time, batch, model_dim]
        layer_out, probs = layer.FProp(
            layer_theta,
            layer_in,
            target_paddings,
            source_encs[i],
            source_paddings,
            source_segment_id=target_segment_id,
            aux_segment_id=src_segment_id)
        layer_in = layer_out
        atten_probs.append(probs)

      self._AddAttenProbsSummary(source_paddings, targets, atten_probs)

      return layer_out
Example #13
  def ComputePredictions(self, theta, source_encs, source_paddings, targets,
                         src_segment_id):
    """Decodes `targets` given encoded source.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      source_encs: source encoding, of shape [time, batch, depth].
      source_paddings: source encoding's padding, of shape [time, batch].
      targets: A dict of string to tensors representing the targets one tries to
        predict. Each tensor in targets is of shape [batch, time].
      src_segment_id: source segment id, of shape [time, batch].

    Returns:
      A Tensor with shape [time, batch, params.softmax.input_dim].
    """
    p = self.params
    time, batch = py_utils.GetShape(source_paddings, 2)
    source_encs = py_utils.HasShape(source_encs, [time, batch, p.source_dim])
    with tf.name_scope(p.name):
      target_ids = tf.transpose(targets.ids)
      target_paddings = py_utils.HasRank(targets.paddings, 2)
      target_paddings = tf.expand_dims(tf.transpose(target_paddings), 2)
      if p.packed_input:
        target_segment_id = tf.expand_dims(tf.transpose(targets.segment_ids), 2)
      else:
        target_segment_id = tf.zeros_like(target_paddings)

      if py_utils.use_tpu():
        emb_device = self.cluster.WorkerDeviceInModelSplit(0)
      else:
        emb_device = ''
      with tf.device(emb_device):
        inputs = self.emb.EmbLookup(theta.emb, target_ids)
        inputs = self.ApplyClipping(theta, inputs)
        summary_utils.histogram('input_emb', inputs)
        inputs = self.ApplyDropout(inputs)
        self._emb_out = inputs

        # Layer 0 intertwines with attention.
        (atten_ctxs, xs, atten_probs, _) = self.frnn_with_atten.FProp(
            theta.frnn_with_atten,
            source_encs,
            source_paddings,
            inputs,
            target_paddings,
            src_segment_id=src_segment_id,
            segment_id=target_segment_id)
        self._AddAttenProbsSummary(source_paddings, targets, [atten_probs])

        atten_ctxs = self.ApplyClipping(theta, atten_ctxs)
        summary_utils.histogram('atten_ctxs', atten_ctxs)

        for i, (layer, layer_theta) in enumerate(zip(self.frnn, theta.frnn)):
          # Forward through Layer-(i + 1) because Layer-0 was handled above.
          ys, _ = layer.FProp(
              layer_theta,
              tf.concat([xs, atten_ctxs], 2),
              target_paddings,
              segment_id=target_segment_id)
          ys = self.ApplyDropout(ys)
          if 1 + i >= p.residual_start:
            xs += ys  # Residual skip
            xs = self.ApplyClipping(theta, xs)
          else:
            xs = ys
          summary_utils.histogram('layer_out_%s' % i, xs)

        if p.feed_attention_context_vec_to_softmax:
          xs = tf.concat([xs, atten_ctxs], 2)

        return xs
Example #14
    def FProp(self, theta, *args):
        """Runs p.repeat copies of self.body.FProp independently.

    Args:
      theta: Layer model parameters. The shape of each variable in theta is
        always [p.repeat, ...], and the i-th slice theta[i] becomes the theta of
        the i-th copy of self.body.
      *args: Input arguments. The shape of each tensor in args is always
        [p.repeat, ...], and the list [arg[i] for arg in args] becomes the
        inputs to the i-th copy of self.body.FProp.

    Returns:
      The accumulated output_tensors. Each tensor t in the return value has
      shape [p.repeat, ...], and the tuple (t[i] for t in output_tensors) is the
      return tuple of the i-th self.body.FProp.
    """
        p = self.params
        # Check that the leading dim of every input is p.repeat; keep the
        # HasShape results so the shape assertions are not discarded.
        args = [
            py_utils.HasShape(arg, [p.repeat], ndims=1) if arg is not None else arg
            for arg in args
        ]

        theta_stack = _MaybeStackExtraTheta(theta.body, self.body.vars,
                                            p.repeat)
        inputs = py_utils.NestedMap(theta=theta_stack, args=list(args))
        # Infer out_shapes from FPropMeta.
        out_shapes = self._InferOutShapes(args)

        def _CellFn(unused_theta, unused_state0, inputs):
            """Recurrent cell function wrapper of body.FProp."""
            # Sets shapes for both theta and inputs to self.body.FProp.
            for dst, src in zip(inputs.args + inputs.theta.Flatten(),
                                list(args) + theta_stack.Flatten()):
                if src is not None:
                    dst.set_shape(tf.TensorShape(src.shape.as_list()[1:]))

            # Runs the actual body.FProp
            fprop_outputs = self.body.FProp(inputs.theta, *inputs.args)
            fprop_outputs = _ToTuple(fprop_outputs)
            assert len(fprop_outputs) == len(out_shapes)
            # Passes fprop outputs to the next layer through state.
            state1 = py_utils.NestedMap(outputs=list(fprop_outputs))
            return state1, py_utils.NestedMap()

        with tf.name_scope(p.name):
            # Initiate state0 with inferred output shapes.
            state0 = py_utils.NestedMap(outputs=[
                tf.zeros(shape, args[0].dtype) for shape in out_shapes
            ])
            # Runs body.FProp p.repeat times using Recurrent.
            acc_states, _ = recurrent.Recurrent(theta=py_utils.NestedMap(),
                                                state0=state0,
                                                inputs=inputs,
                                                cell_fn=_CellFn)

            # Retrieves fprop outputs from state1 and sets shapes.
            output_tensors = tuple(acc_states.outputs)
            for out_idx in range(len(output_tensors)):
                output_tensors[out_idx].set_shape(
                    tf.TensorShape([p.repeat] + out_shapes[out_idx].as_list()))

            return output_tensors[0] if len(output_tensors) == 1 else tuple(
                output_tensors)
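
A small standalone sketch (plain TensorFlow; `body` is a hypothetical stand-in for self.body.FProp) of the contract described in the docstring above: stacked theta and inputs with a leading [p.repeat] axis are processed slice by slice and the per-copy outputs are restacked.

import tensorflow as tf

repeat, dims = 4, 3
stacked_w = tf.random.normal([repeat, dims])     # theta with a leading [repeat] axis
stacked_x = tf.random.normal([repeat, 2, dims])  # inputs with a leading [repeat] axis

def body(w, x):
  # Stand-in for self.body.FProp: any per-copy computation.
  return x * w

outputs = tf.stack([body(stacked_w[i], stacked_x[i]) for i in range(repeat)])
# outputs has shape [repeat, 2, dims], matching the [p.repeat, ...] contract; the
# layer above computes the same thing with recurrent.Recurrent instead of a loop.
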
Example #15
    def FProp(self, theta, inputs, paddings, state0=None, segment_id=None):
        """Computes LSTM forward pass.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      inputs: A single tensor or a tuple of tensors with cardinality equal to
        rnn_cell.inputs_arity. For every input tensor, the first dimension is
        assumed to be time, second dimension batch, and third dimension depth.
      paddings: A tensor. First dim is time, second dim is batch, and third dim
        is expected to be 1.
      state0: If not None, the initial rnn state in a `.NestedMap`. Defaults to
        the cell's zero-state.
      segment_id: A tensor to support packed inputs. First dim is time, second
        dim is batch, and third dim is expected to be 1.

    Returns:
      A pair (act, final_state): act is a tensor of [time, batch, dims], and
      final_state is the final recurrent state.
    """
        p = self.params
        rcell = self.cell
        assert isinstance(rcell, rnn_cell.RNNCell)

        if not isinstance(inputs, (list, tuple)):
            inputs = [inputs]

        # Slicing wm to wm_{i,h} outside the loop to get 20% speedup over regular
        # LSTM baseline.
        # Keeping slicing within the loop gives only < 3% speedup.
        cell_theta = theta.cell.copy()
        num_input_nodes = p.cell.num_input_nodes
        cell_theta['wm_i'] = cell_theta.wm[:num_input_nodes, :]
        cell_theta['wm_h'] = cell_theta.wm[num_input_nodes:, :]
        tf.logging.vlog(1, 'cell_theta: %r', cell_theta)
        if p.packed_input:
            assert segment_id is not None
            reset_mask = rnn_layers.GeneratePackedInputResetMask(
                segment_id, is_reverse=False)
            reset_mask = py_utils.HasShape(reset_mask, tf.shape(paddings))
        else:
            reset_mask = tf.zeros_like(paddings)

        if not state0:
            batch_size = py_utils.GetShape(paddings)[1]
            state0 = rcell.zero_state(cell_theta, batch_size)

        # [T, B, H]
        proj_inputs = rcell.ProjectInputSequence(
            cell_theta, py_utils.NestedMap(act=inputs))
        proj_inputs = py_utils.NestedMap(proj_inputs=proj_inputs,
                                         padding=paddings,
                                         reset_mask=reset_mask)

        acc_state, final_state = recurrent.Recurrent(
            theta=cell_theta,
            state0=state0,
            inputs=proj_inputs,
            cell_fn=rcell.FPropWithProjectedInput,
            cell_type=rcell.layer_type,
            accumulator_layer=self,
            allow_implicit_capture=p.allow_implicit_capture)

        act = rcell.GetOutput(acc_state)
        return act, final_state
Exemple #16
0
    def _InferenceSubgraph_Default(self):
        """Default inference subgraph.

    Returns:
      (fetches, feeds), with:

      - fetches: A dictionary of fetches, containing:

        - log_pplx_per_token: A matrix of shape [batch, time]. [i, j]
          is i-th input text's j-th token's log prob.
        - paddings: A matrix of shape [batch, time]. The padding mask.
        - lengths: A vector of shape [batch]. The number of non-padded tokens
          in each input text.
        - log_pplx_per_sample: A vector of shape [batch]. [i]
          is i-th input text's log prob.
        - num_oovs_per_sample: A vector of shape [batch] counting the total
          number of out-of-vocabulary tokens in each input.
        - tokens_from_labels: A vector of shape [batch] returning the predicted
          tokens as a sequence after mapping them back to strings from ids using
          the vocabulary.
        - ids: A matrix of shape [batch, time]. [i, j]
          is i-th input text's j-th token's id.

      - feeds: A dictionary of feeds, containing:

        - text: A placeholder for a vector of strings.
    """
        text = tf.placeholder(tf.string, shape=[None])
        # [batch, time]
        ids, labels, paddings = self.input_generator.StringsToIds(text)
        lengths = tf.reduce_sum(tf.to_int32(1 - paddings), axis=1)
        tokens_from_labels = self.input_generator.IdsToStrings(labels, lengths)
        oovs = tf.equal(labels, self.input_generator.tokenizer.unk_id)
        num_oovs_per_sample = tf.to_int32(
            tf.round(tf.reduce_sum(tf.to_float(oovs) * (1 - paddings),
                                   axis=1)))
        # [time, batch]
        ids, paddings, labels, weights = self._TrimIfPossibleThenTranspose(
            ids, paddings, labels, 1.0 - paddings)
        batch_size = tf.shape(ids)[1]
        xent_output, _ = self.lm.FPropDefaultTheta(
            inputs=ids,
            paddings=paddings,
            state0=self.lm.zero_state(self.theta.lm, batch_size),
            labels=py_utils.NestedMap(class_ids=labels, class_weights=weights))

        per_example_xent = py_utils.HasShape(xent_output.per_example_xent,
                                             tf.shape(ids))
        log_pplx_per_sample = tf.reduce_sum(per_example_xent * (1 - paddings),
                                            axis=0)
        fetches = {
            'log_pplx_per_token':  # [batch, time]
            tf.transpose(per_example_xent),
            'paddings':  # [batch, time]
            tf.transpose(paddings),
            'lengths':  # [batch]
            lengths,
            'log_pplx_per_sample':  # [batch]
            log_pplx_per_sample,
            'num_oovs_per_sample':  # [batch], int32
            num_oovs_per_sample,
            'tokens_from_labels':  # [batch], string
            tokens_from_labels,
            'ids':  # [batch, time], int32
            ids
        }
        feeds = {'text': text}
        return fetches, feeds
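For reference, a hedged sketch (not part of the snippet) of how the (fetches, feeds) pair returned above is typically consumed in graph mode; model and sess are assumed to be an instantiated task and a tf.Session.

fetches, feeds = model._InferenceSubgraph_Default()
results = sess.run(
    fetches,
    feed_dict={feeds['text']: ['the quick brown fox', 'hello world']})
# results['log_pplx_per_sample'] is [batch]; results['ids'] is [batch, time].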
Exemple #17
0
    def FProp(self,
              theta,
              inputs,
              paddings,
              state0,
              labels=None,
              direct_features=None):
        """Computes xent loss given the language model input activations.

    Args:
      theta: A `.NestedMap` object containing weights' values of this
        layer and its children layers.
      inputs: input activation. A tensor of shape [time, batch, dims].
      paddings: a 0/1 tensor of shape [time, batch].
      state0: A `.NestedMap` containing the initial recurrent state.
      labels: If not None, a `.NestedMap` containing the following fields.

        - class_weights, a tensor with shape [time, batch] containing the
          weights for each target word.
        - class_ids, a tensor with shape [time, batch] of int32 dtype containing
          the target class labels.
        - class_probabilities, a tensor with shape [time, batch, vocab_size] of
          float values indicating class-membership probabilities.
      direct_features: If not None, a tensor of [time, batch,
        direct_feature_dims] that is concatenated to the output of the last
        RNN layer.

    Returns:
      If `labels` is not None, returns (xent_output, state1), where
      `xent_output` is a `.NestedMap` as defined by `SoftmaxLayer`'s return
      value and `state1` is the next recurrent state. Otherwise,
      `xent_output` contains the softmax logits, probabilities (.probs) and
      log-probabilities (.log_probs).
    """
        inputs = py_utils.HasRank(inputs, 3)
        seqlen, batch, _ = tf.unstack(tf.shape(inputs), num=3)
        paddings = py_utils.HasShape(paddings, [seqlen, batch])
        assert state0 is not None
        activation, state1 = self.rnns.FProp(theta.rnns, inputs,
                                             tf.expand_dims(paddings, 2),
                                             state0)

        if direct_features is not None:
            direct_features = py_utils.HasRank(direct_features, 3)
            activation = tf.concat([activation, direct_features], axis=2)

        if labels is None:
            # We can only compute the logits here.
            logits = self.softmax.Logits(theta=theta.softmax,
                                         inputs=tf.reshape(
                                             activation, [seqlen * batch, -1]))
            xent_output = py_utils.NestedMap(
                logits=tf.reshape(logits, [seqlen, batch, -1]))
            xent_output.probs = tf.nn.softmax(xent_output.logits)
            xent_output.log_probs = tf.nn.log_softmax(xent_output.logits)
        elif 'class_ids' in labels:
            xent_output = self.softmax.FProp(
                theta=theta.softmax,
                inputs=activation,
                class_weights=labels.class_weights,
                class_ids=labels.class_ids)
        else:
            assert 'class_probabilities' in labels
            xent_output = self.softmax.FProp(
                theta=theta.softmax,
                inputs=activation,
                class_weights=labels.class_weights,
                class_probabilities=labels.class_probabilities)
        xent_output.last_hidden = activation
        return xent_output, state1
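A hedged sketch of assembling the labels NestedMap for the class_ids branch above; lm, target_ids, and the surrounding tensors are assumptions for illustration.

labels = py_utils.NestedMap(
    class_weights=1.0 - paddings,  # [time, batch]; zero weight on padded steps
    class_ids=target_ids)          # [time, batch] int32 target ids
xent_output, state1 = lm.FProp(
    lm.theta, inputs, paddings, state0=state0, labels=labels)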
Exemple #18
0
def ExtractBlockContextV2(x,
                          block_size,
                          left_context,
                          right_context,
                          padding_val=0.0,
                          paddings=None):
    """Extracts temporal context for every block (without restrictions).

  This is a generalized implementation of ExtractBlockContext, where block_size,
  left_context, and right_context are 3 free parameters and we don't have
  constraints (other than l>=1, r>=0, block_size>0).

  Args:
    x: a tensor of [batch, time, dim].
    block_size: int. Number of time frames in a block.
    left_context: int. Left context size. Note that the actual left context is
      `left_context - 1` (this is to be compatible with ExtractBlockContext
      implementation).
    right_context: int. Right context size.
    padding_val: float. Value used for the padded frames.
    paddings: optional. If specified, it must be a tensor of [batch, time], and
      we will return a padding tensor indicating padding info for the returned
      tensor.

  Returns:
    (x_patches, x_paddings) where

    - x_patches: A tensor of
      [batch, num_blocks, context_size + block_size, dim] with necessary
      paddings, where context_size = (left_context - 1) + right_context,
      and output[:, i, ...] are
      x[:, start-left_context+1:end+right_context, ...], where
      start = i * block_size, end = (i + 1) * block_size.
    - x_paddings: None if paddings = None; else a
      [batch, num_blocks, context_size + block_size] tensor, indicating the
      padding info for the corresponding position in x_patches.

  Let's define some variables here:

  B: batch size
  T: input tensor length in time axis
  D: input tensor dimension in the last axis
  W: block size
  U: ceil(T/W)
  L: left context size
  R: right context size
  C: L-1+W+R, full block length

  Given a [B, T, D] tensor, the return is a [B, U, C, D] tensor
  where ret[b, u, :] is a 2D tensor of shape (L - 1 + W + R, D), i.e. the u-th
  block of the input tensor together with (L - 1) left context frames and R
  right context frames.

  Implementation note:

  We use the following procedure to get the return tensor:

  - first do padding in the beginning and at the end:
    [B, T, D] -> [B, L-1 + U*W + L-1 + R, D]
  - then build a [U, C] matrix of gather indices, one row per block
  - use gather along the time axis to collect the blocks:
    [B, L-1 + U*W + L-1 + R, D] -> [B, U, C, D]

  TODO(yqw): after verifying correctness and benchmarking, consider replacing
  the v1 implementation.
  """
    # 0. basic shapes
    b, t, d = py_utils.GetShape(x, 3)
    w = block_size
    u = (t + w - 1) // w  # equivalent to math.ceil(t/w)
    l = left_context
    r = right_context
    c = l - 1 + r + w

    # The only constraints are block_size > 0, l >= 1, r >= 0.
    if w <= 0:
        raise ValueError(f'block size ({w}) must be greater than 0')
    if l < 1:
        raise ValueError(f'Left context ({left_context}) must be >= 1.')
    if r < 0:
        raise ValueError(f'Right context ({right_context}) must be >= 0')
    if paddings is not None:
        paddings = py_utils.HasShape(paddings, [b, t])

    # 1. do front and rear padding
    left_pad = l - 1
    # we need to make sure all u * w elements have a long enough context
    right_pad = (u * w - t + l - 1 + r)
    x_padded = _DoPadding(x,
                          b,
                          left_pad,
                          right_pad,
                          d,
                          padding_val=padding_val)
    if paddings is not None:
        paddings = _DoPadding(paddings,
                              b,
                              left_pad,
                              right_pad,
                              d=None,
                              padding_val=1.0)

    # 2. generate gather indices
    # gather_indices is a [u, c] matrix like
    #  [ 0, .........,             c-1]
    #  [ w, .........,       w + (c-1)]
    #  [2w, ..........,     2w + (c-1)]
    #  [(u-1)*w, ...., (u-1)*w + (c-1)]
    gather_indices = (tf.tile(tf.expand_dims(tf.range(0, c), axis=0), (u, 1)) +
                      tf.tile(tf.expand_dims(tf.range(0, u * w, w), axis=1),
                              (1, c)))

    # 3. generate x_patches, shape [b, u, c, d]
    x_patches = tf.gather(x_padded, gather_indices, axis=1)

    if paddings is not None:
        # paddings is now a [b, u, c] tensor
        paddings = tf.gather(paddings, gather_indices, axis=1)

    return x_patches, paddings
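As a concrete shape check (a hedged sketch, not library code): with T = 10, block_size W = 4, left_context L = 3 and right_context R = 2, we get U = ceil(10 / 4) = 3 blocks and C = L - 1 + W + R = 8 frames per block.

x = tf.random.normal([2, 10, 16])  # [B=2, T=10, D=16]
paddings = tf.zeros([2, 10])
x_patches, x_paddings = ExtractBlockContextV2(
    x, block_size=4, left_context=3, right_context=2, paddings=paddings)
# x_patches:  [2, 3, 8, 16] = [B, U, C, D]
# x_paddings: [2, 3, 8]; frames introduced by the internal padding are marked 1.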
Exemple #19
0
    def FProp(self, theta, inputs, paddings, state0=None, labels=None):
        """Computes xent loss given the language model input activations.

    Args:
      theta: A `.NestedMap` object containing weights' values of this
        layer and its children layers.
      inputs: Input activation. A tensor of shape [time, batch, model_dim].
      paddings: A 0/1 tensor of shape [time, batch].
      state0: Not used for Transformer.
      labels: If not None, a `.NestedMap` containing the following fields:

        - class_weights, a tensor with shape [time, batch] containing the
          weights for each target word.
        - class_ids, a tensor with shape [time, batch] of int32 dtype containing
          the target class labels.
        - class_probabilities, a tensor with shape [time, batch, vocab_size] of
          float values indicating class-membership probabilities.

    Returns:
      If `labels` is not None, returns (xent_output, None), where
      `xent_output` is a `.NestedMap` as defined by `SoftmaxLayer`'s return
      value. Otherwise, `xent_output` only contains the softmax logits.
    """
        p = self.params
        inputs = py_utils.HasRank(inputs, 3)
        seqlen, batch, _ = tf.unstack(tf.shape(inputs), num=3)
        inputs = py_utils.HasShape(inputs, [seqlen, batch, p.model_dim])
        paddings = py_utils.HasShape(paddings, [seqlen, batch])

        # [time, 1, model_dim]
        posit_embs = tf.expand_dims(
            self.position_emb.FProp(theta.position_emb, seqlen), 1)
        # [time, batch, model_dim]
        input_embs = inputs + posit_embs
        input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

        layer_in = input_embs
        for layer, layer_theta in zip(self.trans, theta.trans):
            # [time, batch, model_dim]
            layer_out, _ = layer.FProp(layer_theta, layer_in, paddings)
            layer_in = layer_out

        if labels is None:
            # We can only compute the logits here.
            logits = self.softmax.Logits(theta=theta.softmax,
                                         inputs=tf.reshape(
                                             layer_out, [seqlen * batch, -1]))
            xent_output = py_utils.NestedMap(
                logits=tf.reshape(logits, [seqlen, batch, -1]))
        elif 'class_ids' in labels:
            xent_output = self.softmax.FProp(
                theta=theta.softmax,
                inputs=layer_out,
                class_weights=labels.class_weights,
                class_ids=labels.class_ids)
        else:
            assert 'class_probabilities' in labels
            xent_output = self.softmax.FProp(
                theta=theta.softmax,
                inputs=layer_out,
                class_weights=labels.class_weights,
                class_probabilities=labels.class_probabilities)
        xent_output.last_hidden = layer_out
        return xent_output, None
Exemple #20
0
    def RelPositionBias(self, content, abs_pos_emb, skip_term_b=False):
        """Compute relative position bias.

    This is a subroutine used by variants of self-attentions with relative
    positional embedding.

    output[b][n][i][j] = content[b][i][n] x abs_pos_emb[i-j+T-1][n]

    Padding should be masked by the caller of this function.

    B: batch size
    T: sequence length
    N: num of attention heads.
    H: per-head attention dimension.

    Args:
      tensors of the following shapes:
      content:         [N, H] if skip_term_b else [B, T, N, H]
      abs_pos_emb:     [2T - 1, N, H], the absolute positional embedding.
        abs_pos_emb[i] is the emb of relative distance i - (T-1).
      skip_term_b:     Whether to skip term_b in the section 3.3 equation.

    Returns:
      The attention logits tensor. [N, T, T] if skip_term_b else [B, N, T, T].
    """
        if not skip_term_b:
            b, t, n, h = py_utils.GetShape(content)
            l = 2 * t - 1
            abs_pos_emb = py_utils.HasShape(abs_pos_emb, [l, n, h])
        else:
            n, h = py_utils.GetShape(content)
            l = py_utils.GetShape(abs_pos_emb)[0]
            t = (l + 1) // 2

        if not skip_term_b:
            # [B, N, T, L=2T-1]
            content, abs_pos_emb = self.ToAqtActActInputs(content, abs_pos_emb)
            term_bd = tf.einsum('BTNH,LNH->BNTL', content, abs_pos_emb)
            term_bd = self.FromAqtActActMatmul(term_bd)

            term_bd = tf.reshape(term_bd, [b, n, t * l], name='flatten')
            # [B, N, T * (L + 1)].
            term_bd = tf.pad(term_bd, ((0, 0), (0, 0), (0, t)))
            # [B, N, T, L + 1].
            term_bd = tf.reshape(term_bd, [b, n, t, l + 1], name='restore')
            return term_bd[:, :, :, t - 1::-1]
        else:
            # [N, L=2T-1]
            content, abs_pos_emb = self.ToAqtActActInputs(content, abs_pos_emb)
            term_d = tf.einsum('NH,LNH->NL', content, abs_pos_emb)
            term_d = self.FromAqtActActMatmul(term_d)

            # [N, T, L]
            term_d = tf.tile(tf.expand_dims(term_d, axis=1), [1, t, 1],
                             name='tile')
            term_d = tf.reshape(term_d, [n, t * l])
            # [N, T * (L + 1)].
            term_d = tf.pad(term_d, ((0, 0), (0, t)))
            # [N, T, L + 1].
            term_d = tf.reshape(term_d, [n, t, l + 1], name='restore')
            return term_d[:, :, t - 1::-1]
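The pad/reshape/slice sequence above is the standard relative-shift trick. The following standalone numpy sketch (a toy illustration, not library code) checks it for a single head with T = 3: with scores[i, k] = k, the output at position [i, j] ends up being i - j + (T - 1), i.e. exactly the column that holds the embedding of relative distance i - j.

import numpy as np

t = 3
l = 2 * t - 1
scores = np.tile(np.arange(l), (t, 1))        # stand-in for term_bd[b, n]: scores[i, k] = k
flat = np.pad(scores.reshape(t * l), (0, t))  # flatten, then pad T zeros at the end
out = flat.reshape(t, l + 1)[:, t - 1::-1]    # reshape to [T, L + 1], slice and reverse
print(out)
# [[2 1 0]
#  [3 2 1]
#  [4 3 2]]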
Exemple #21
0
def FarthestPointSampler(points,
                         padding,
                         num_sampled_points,
                         precomputed_squared_distance=None,
                         num_seeded_points=0,
                         random_seed=None):
    """Samples num_sampled_points from points using farthest point sampling.

  Algorithm:
  1. Start by selecting a random point and adding to a selected set.
  2. For all remaining points, find the furthest point from those selected.
  3. Add furthest point to selected.
  4. Repeat 2-3 until num_sampled_points are selected.

  More details at https://en.wikipedia.org/wiki/Farthest-first_traversal

  The output of this function can be used with tf.batch_gather to extract the
  desired points, for example: tf.batch_gather(points, sampled_idx).

  Args:
    points: floating point tf.Tensor of shape [N, P1, dims]
    padding: A floating point tf.Tensor of shape [N, P1] with 0 if the point is
      real, and 1 otherwise.
    num_sampled_points: integer number of points to sample.
    precomputed_squared_distance: optional tf.Tensor of shape [N, P1, P1] of
      distances between each point. If None, distances will be computed on the
      fly.
    num_seeded_points: If num_seeded_points > 0, then the first
      num_seeded_points in points are considered to be seeded in the FPS
      sampling. Note that we assume that these points are *not* padded, and do
      not check padding when seeding them.
    random_seed: optional integer random seed to use with all the random ops.

  Returns:
    A tuple of tf.Tensors (sampled_idx, closest_idx) of types
    (tf.int32, tf.int32).

    sampled_idx is of shape [N, num_sampled_points] representing the indices
    selected using the sampler. This will have a range of [0, P1].

    closest_idx is of shape [N, P1] representing the indices of the closest
    sampled points for each input point. closest_idx is used in PCNN as part of
    the pooling operation: each point is assigned to the closest sampled point
    and a max is taken over them. This will have a range of [0, P2] with the
    index of the closest sampled point that remains.
  """
    points = py_utils.HasRank(points, 3)
    batch_size, num_points, dims = py_utils.GetShape(points, 3)

    points = py_utils.with_dependencies(
        [py_utils.assert_greater_equal(num_points, num_sampled_points)],
        points)

    # Add a tiny bit of noise to the distance matrix or points so all
    # points are unique. This will also ensure true repeated points
    # like padded points are only selected after all valid points are selected.
    if precomputed_squared_distance is not None:
        precomputed_squared_distance = py_utils.HasShape(
            precomputed_squared_distance, [batch_size, num_points, num_points])
        precomputed_squared_distance += tf.random.uniform(
            (batch_size, num_points, 1),
            minval=1e-6,
            maxval=1e-5,
            dtype=tf.float32,
            seed=random_seed)
    else:
        points += tf.random.uniform((batch_size, num_points, dims),
                                    minval=1e-6,
                                    maxval=1e-5,
                                    dtype=tf.float32,
                                    seed=random_seed)

    # TensorArray to store the sampled indices in the loop.
    sampled_idx = tf.TensorArray(tf.int32, num_sampled_points)

    # Initialize distance_to_selected to inf for all points.
    distance_to_selected = float('inf') * tf.ones((batch_size, num_points))

    # For tracking the index to the closest selected point.
    closest_idx = tf.zeros((batch_size, num_points), dtype=tf.int32)

    # Current loop index counter.
    curr_idx = tf.constant(0, dtype=tf.int32)

    # Get number of valid points (1 is padded, so num_points - num_padded).
    num_valid_points = tf.cast(tf.cast(num_points, dtype=tf.float32) -
                               tf.reduce_sum(padding, axis=1),
                               dtype=tf.int32)

    def _BodyFn(curr_idx, distance_to_selected, sampled_idx, closest_idx):
        """Loop body for farthest point sampler."""
        def _GetRandomRealPoint():
            """Select the first point.

      For the first point, we want any random real (non-padded) point, so we
      create a random value per point, and then set all padded ones to
      some large value (more than the maxval). We then take the min per batch
      element to get the first point.

      Returns:
        Tensor containing the index of a random point selected for each example
        in the batch.
      """
            random_values = tf.random.uniform((batch_size, num_points),
                                              minval=0,
                                              maxval=1,
                                              dtype=tf.float32,
                                              seed=random_seed)
            random_values = tf.where(tf.equal(padding, 0.0), random_values,
                                     padding * 10)
            return tf.argmin(random_values, axis=1, output_type=tf.int32)

        def _GetFurthestPoint():
            """Get point that is furthest from those already selected.

      We also bias the sampling towards real points by setting the distance
      to padded points negative until we are out of real points.

      Returns:
        Tensor containing the index of the next farthest point selected for each
        example in the batch.
      """
            # Set padded points distance to negative so they aren't selected.
            padding_masked_distance_to_selected = tf.where(
                tf.equal(padding, 0.0), distance_to_selected, -1.0 * tf.ones(
                    (batch_size, num_points), dtype=tf.float32))
            # But only do this when we still have valid points left.
            padding_masked_distance_to_selected = tf.where(
                tf.less(curr_idx, num_valid_points),
                padding_masked_distance_to_selected, distance_to_selected)
            return tf.argmax(padding_masked_distance_to_selected,
                             axis=-1,
                             output_type=tf.int32)

        def _GetSeededPoint():
            """Select a seeded point.

      Seeded points are assumed to be at the beginning of the original points.

      Returns:
        Tensor containing the index of the next seeded point to select for each
        example in the batch.
      """
            return tf.ones((batch_size, ), dtype=tf.int32) * curr_idx

        # Select indices for this loop iteration.
        def _Seeded():
            return tf.cond(tf.less(curr_idx, num_seeded_points),
                           _GetSeededPoint, _GetFurthestPoint)

        def _Real():
            return tf.cond(tf.equal(curr_idx, 0), _GetRandomRealPoint,
                           _GetFurthestPoint)

        new_selected = tf.cond(tf.greater(num_seeded_points, 0), _Seeded,
                               _Real)
        sampled_idx = sampled_idx.write(curr_idx, new_selected)

        # Extract the distance to the latest point selected to update
        # distance_to_selected.
        new_selected_gather_idx = tf.stack(
            [tf.range(batch_size), new_selected], axis=1)
        if precomputed_squared_distance is not None:
            new_distance = tf.gather_nd(precomputed_squared_distance,
                                        new_selected_gather_idx)
        else:
            new_points = tf.reshape(
                tf.gather_nd(points, new_selected_gather_idx),
                [batch_size, 1, dims])
            new_distance = tf.reshape(
                SquaredDistanceMatrix(points, new_points),
                [batch_size, num_points])

        is_newly_closest = tf.less(new_distance, distance_to_selected)
        distance_to_selected = tf.minimum(distance_to_selected, new_distance)

        # Track the index to the closest selected point.
        new_selected_tiled = tf.tile([[curr_idx]], [batch_size, num_points])
        closest_idx = tf.cond(
            tf.equal(curr_idx, 0),
            # At the first loop iteration, the init points are the closest.
            lambda: new_selected_tiled,
            # Otherwise, update with the new points based on the distances.
            lambda: tf.where(is_newly_closest, new_selected_tiled, closest_idx)
        )
        return curr_idx + 1, distance_to_selected, sampled_idx, closest_idx

    _, _, sampled_idx, closest_idx = tf.while_loop(
        lambda curr_idx, *args: tf.less(curr_idx, num_sampled_points),
        _BodyFn,
        loop_vars=(curr_idx, distance_to_selected, sampled_idx, closest_idx),
        back_prop=False,
        maximum_iterations=num_sampled_points)

    sampled_idx = sampled_idx.stack()  # num_sampled_points x n
    sampled_idx = tf.transpose(sampled_idx, [1, 0])

    if isinstance(batch_size, int) and isinstance(num_sampled_points, int):
        sampled_idx.set_shape((batch_size, num_sampled_points))

    return sampled_idx, closest_idx
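A hedged usage sketch tying the returned indices back to the point cloud, as the docstring suggests; the toy tensors below are assumptions.

points = tf.random.uniform([4, 1024, 3])  # [N, P1, dims]
padding = tf.zeros([4, 1024])             # all points are real
sampled_idx, closest_idx = FarthestPointSampler(points, padding,
                                                num_sampled_points=128)
sampled_points = tf.batch_gather(points, sampled_idx)  # [N, 128, 3]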
Exemple #22
0
    def FProp(self, theta, batch, state0=None):
        """Encodes source as represented by 'inputs' and 'paddings'.

    Args:
      theta: A NestedMap object containing weights' values of this
        layer and its children layers.
      batch: A NestedMap with fields:

        - src_inputs - The inputs tensor. It is expected to be of shape [batch,
          time, feature_dim, channels].
        - paddings - The paddings tensor. It is expected to be of shape [batch,
          time].
      state0: Recurrent input state. Not supported/ignored by this encoder.

    Returns:
      A NestedMap containing:

      - 'encoded': a feature tensor of shape [time, batch, depth]
      - 'padding': a 0/1 tensor of shape [time, batch]
      - 'state': the updated recurrent state
      - '${layer_type}_${layer_index}': The per-layer encoder output. Each one
        is a NestedMap containing 'encoded' and 'padding' similar to regular
        final outputs, except that 'encoded' from conv or conv_lstm layers are
        of shape [time, batch, depth, channels].
    """
        p = self.params
        inputs, paddings = batch.src_inputs, batch.paddings
        outputs = py_utils.NestedMap()
        with tf.name_scope(p.name):
            # Apply SpecAugment.
            if p.use_specaugment and not p.is_eval:
                inputs, paddings = self.specaugment.FProp(
                    theta.specaugment, inputs, paddings)
            # Add a few extra padded timesteps at the end. This is for ensuring the
            # correctness of the conv-layers at the edges.
            if p.pad_steps > 0:
                # inplace_update() is not supported by TPU for now. Since we have done
                # padding on the input_generator, we may avoid this additional padding.
                assert not py_utils.use_tpu()
                inputs_pad = tf.zeros(
                    inplace_ops.inplace_update(tf.shape(inputs), 1,
                                               p.pad_steps), inputs.dtype)
                paddings_pad = tf.ones(
                    inplace_ops.inplace_update(tf.shape(paddings), 1,
                                               p.pad_steps), paddings.dtype)
                inputs = tf.concat([inputs, inputs_pad], 1, name='inputs')
                paddings = tf.concat([paddings, paddings_pad], 1)

            def ReshapeForPlot(tensor, padding, name):
                """Transposes and flattens channels to [batch, dim, seq_len] shape."""
                # Flatten any dimensions beyond the third into the third.
                batch_size = tf.shape(tensor)[0]
                max_len = tf.shape(tensor)[1]
                plot_tensor = tf.reshape(tensor, [batch_size, max_len, -1])
                plot_tensor = tf.transpose(plot_tensor, [0, 2, 1], name=name)
                return (plot_tensor, summary_utils.SequenceLength(padding))

            plots = [
                ReshapeForPlot(tf.transpose(inputs, [0, 1, 3, 2]), paddings,
                               'inputs')
            ]

            conv_out = inputs
            out_padding = paddings
            for i, conv_layer in enumerate(self.conv):
                conv_out, out_padding = conv_layer.FProp(
                    theta.conv[i], conv_out, out_padding)
                if p.extra_per_layer_outputs:
                    conv_out *= (1.0 -
                                 out_padding[:, :, tf.newaxis, tf.newaxis])
                    outputs['conv_%d' % i] = py_utils.NestedMap(
                        encoded=tf.transpose(conv_out,
                                             [1, 0, 2, 3]),  # to [t, b, d, c]
                        padding=tf.transpose(out_padding))
                plots.append(
                    ReshapeForPlot(tf.transpose(conv_out, [0, 1, 3, 2]),
                                   out_padding, 'conv_%d_out' % i))

            def TransposeFirstTwoDims(t):
                first_dim = tf.shape(t)[0]
                second_dim = tf.shape(t)[1]
                t_new = tf.transpose(
                    tf.reshape(t, [first_dim, second_dim, -1]), [1, 0, 2])
                t_shape_new = tf.concat([[second_dim], [first_dim],
                                         tf.shape(t)[2:]], 0)
                return tf.reshape(t_new, t_shape_new)

            # Now the conv-lstm part.
            conv_lstm_out = conv_out
            conv_lstm_out_padding = out_padding
            for i, (rnn, cnn) in enumerate(
                    zip(self.conv_lstm_rnn, self.conv_lstm_cnn)):
                conv_lstm_in = conv_lstm_out
                # Move time dimension to be the first.
                conv_lstm_in = TransposeFirstTwoDims(conv_lstm_in)
                conv_lstm_in = tf.expand_dims(conv_lstm_in, 2)
                conv_lstm_in_padding = tf.expand_dims(
                    tf.transpose(conv_lstm_out_padding), 2)
                lstm_out = rnn.FProp(theta.conv_lstm_rnn[i], conv_lstm_in,
                                     conv_lstm_in_padding)
                # Move time dimension to be the second.
                cnn_in = TransposeFirstTwoDims(lstm_out)
                cnn_in = tf.squeeze(cnn_in, 2)
                cnn_in_padding = conv_lstm_out_padding
                cnn_out, cnn_out_padding = cnn.FProp(theta.conv_lstm_cnn[i],
                                                     cnn_in, cnn_in_padding)
                conv_lstm_out, conv_lstm_out_padding = cnn_out, cnn_out_padding
                if p.extra_per_layer_outputs:
                    conv_lstm_out *= (
                        1.0 -
                        conv_lstm_out_padding[:, :, tf.newaxis, tf.newaxis])
                    outputs['conv_lstm_%d' % i] = py_utils.NestedMap(
                        encoded=tf.transpose(conv_lstm_out,
                                             [1, 0, 2, 3]),  # to [t, b, d, c]
                        padding=tf.transpose(conv_lstm_out_padding))
                plots.append(
                    ReshapeForPlot(conv_lstm_out, conv_lstm_out_padding,
                                   'conv_lstm_%d_out' % i))

            # Need to do a reshape before starting the rnn layers.
            conv_lstm_out = py_utils.HasRank(conv_lstm_out, 4)
            conv_lstm_out_shape = tf.shape(conv_lstm_out)
            new_shape = tf.concat([conv_lstm_out_shape[:2], [-1]], 0)
            conv_lstm_out = tf.reshape(conv_lstm_out, new_shape)
            if self._first_lstm_input_dim_pad:
                conv_lstm_out = tf.pad(
                    conv_lstm_out,
                    [[0, 0], [0, 0], [0, self._first_lstm_input_dim_pad]])

            conv_lstm_out = py_utils.HasShape(
                conv_lstm_out, [-1, -1, self._first_lstm_input_dim])

            # Transpose to move the time dimension to be the first.
            rnn_in = tf.transpose(conv_lstm_out, [1, 0, 2])
            rnn_padding = tf.expand_dims(tf.transpose(conv_lstm_out_padding),
                                         2)
            # rnn_in is of shape [time, batch, depth]
            # rnn_padding is of shape [time, batch, 1]

            # Now the rnn layers.
            num_skips = 0
            for i in range(p.num_lstm_layers):
                rnn_out = self.rnn[i].FProp(theta.rnn[i], rnn_in, rnn_padding)
                residual_index = i - p.residual_start + 1
                if p.residual_start > 0 and residual_index >= 0:
                    if residual_index % p.residual_stride == 0:
                        residual_in = rnn_in
                    if residual_index % p.residual_stride == p.residual_stride - 1:
                        # Highway skip connection.
                        if p.highway_skip:
                            rnn_out = self.highway_skip[num_skips].FProp(
                                theta.highway_skip[num_skips], residual_in,
                                rnn_out)
                            num_skips += 1
                        else:
                            # Residual skip connection.
                            rnn_out += py_utils.HasShape(
                                residual_in, tf.shape(rnn_out))
                if p.project_lstm_output and (i < p.num_lstm_layers - 1):
                    # Projection layers.
                    rnn_out = self.proj[i].FProp(theta.proj[i], rnn_out,
                                                 rnn_padding)
                if i == p.num_lstm_layers - 1:
                    rnn_out *= (1.0 - rnn_padding)
                if p.extra_per_layer_outputs:
                    rnn_out *= (1.0 - rnn_padding)
                    outputs['rnn_%d' % i] = py_utils.NestedMap(
                        encoded=rnn_out, padding=tf.squeeze(rnn_padding, [2]))
                plots.append(
                    ReshapeForPlot(tf.transpose(rnn_out, [1, 0, 2]),
                                   tf.transpose(rnn_padding, [1, 0, 2]),
                                   'rnn_%d_out' % i))
                rnn_in = rnn_out
            final_out = rnn_in

            if self.cluster.add_summary:
                fig = plot.MatplotlibFigureSummary('encoder_example',
                                                   figsize=(8,
                                                            len(plots) * 3.5))

                # Order layers from bottom to top.
                plots.reverse()
                for tensor, seq_len in plots:
                    fig.AddSubplot([tensor, seq_len],
                                   summary_utils.TrimPaddingAndPlotSequence,
                                   title=tensor.name,
                                   xlabel='Time')
                fig.Finalize()

            outputs['encoded'] = final_out
            outputs['padding'] = tf.squeeze(rnn_padding, [2])
            outputs['state'] = py_utils.NestedMap()
            return outputs
Exemple #23
0
def SegmentPool3D(points,
                  point_features,
                  pooling_idx,
                  closest_idx,
                  pooling_method='max'):
    """Performs {min/max/average} pooling over a pointcloud given indices.

  This should be functionally identical when using max to the above
  MaxPool3D function, except it turns out to be much more memory efficient
  on a TPU, and supports min/max/mean.

  Args:
    points: A float tf.Tensor of shape [N, P1, 3] with point locations.
    point_features: A float tf.Tensor of shape [N, P1, C] with point features.
    pooling_idx: A tf.int32 tf.Tensor of shape [N, P2] with the index of which
      points we want to keep. Each value should be in the range [0, P1].
    closest_idx: A tf.int32 tf.Tensor of shape [N, P1] representing which
      sampled point is closest to each original point. Each value should be in
      the range of [0, P2].
    pooling_method: A string for which pooling function to use. Should be one of
      {'min', 'max', 'mean'}.

  Returns:
    pooled_points: A float tf.Tensor of shape [N, P2, 3] with the pooled
      point locations.
    pooled_features: A float tf.Tensor of shape [N, P2, C] with the pooled
      features.

  Raises:
    ValueError: If pooling_method is not one of {min/max/mean}.
  """
    segment_pooling_functions = {
        'min': tf.unsorted_segment_min,
        'max': tf.unsorted_segment_max,
        'mean': tf.unsorted_segment_mean
    }

    if pooling_method not in segment_pooling_functions:
        raise ValueError('`pooling_method` must be one of {}.'.format(
            list(segment_pooling_functions.keys())))
    segment_fn = segment_pooling_functions[pooling_method]

    points = py_utils.HasShape(points, [-1, -1, 3])
    n, p1 = py_utils.GetShape(points, 2)
    point_features = py_utils.HasShape(point_features, [n, p1, -1])
    _, _, c = py_utils.GetShape(point_features)
    pooling_idx = py_utils.HasShape(pooling_idx, [n, -1])
    _, p2 = py_utils.GetShape(pooling_idx)
    closest_idx = py_utils.HasShape(closest_idx, [n, p1])

    # Subselect our output points
    pooled_points = tf.batch_gather(points, pooling_idx)

    # Loop over batch dimension of our features/indices, as unsorted_segment_X
    # does not currently support a batch dimension.
    def _LoopFn(args):
        example_features, example_closest_idx = args
        return segment_fn(example_features,
                          example_closest_idx,
                          num_segments=p2)

    pooled_features = tf.map_fn(fn=_LoopFn,
                                elems=(point_features, closest_idx),
                                dtype=tf.float32)

    return (py_utils.HasShape(pooled_points, [n, p2, 3]),
            py_utils.HasShape(pooled_features, [n, p2, c]))
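A hedged sketch (toy shapes, not library code) of how SegmentPool3D composes with FarthestPointSampler above: sample P2 centers, then pool every original point's features onto its closest sampled center.

points = tf.random.uniform([4, 1024, 3])           # [N, P1, 3]
point_features = tf.random.uniform([4, 1024, 64])  # [N, P1, C]
padding = tf.zeros([4, 1024])
pooling_idx, closest_idx = FarthestPointSampler(points, padding,
                                                num_sampled_points=256)
pooled_points, pooled_features = SegmentPool3D(
    points, point_features, pooling_idx, closest_idx, pooling_method='max')
# pooled_points: [4, 256, 3]; pooled_features: [4, 256, 64]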
Exemple #24
0
    def _Extract(self, features):
        p = self.params
        ri_outputs = {}
        outputs = {}
        frame_pose = tf.reshape(_Dense(features['pose']), [4, 4])
        for laser in p.cbr_laser_names + p.gbr_laser_names:
            # Extract range images.
            for returns in p.returns:
                ri_shape = tf.reshape(
                    _Dense(features['%s_%s_shape' % (laser, returns)]), [-1])
                range_image = tf.reshape(
                    _Dense(features['%s_%s' % (laser, returns)]), ri_shape)

                shape_to_check = (p.cbr_ri_shape if laser in p.cbr_laser_names
                                  else p.gbr_ri_shape)
                range_image = py_utils.HasShape(range_image, shape_to_check)

                ri_outputs['%s_%s' % (laser, returns)] = range_image

            # Extract beam inclinations and extrinsics
            outputs['%s_extrinsics' % laser] = tf.reshape(
                _Dense(features['%s_extrinsics' % laser]), [4, 4])

        # CBRs have uniform inclination
        for laser in p.cbr_laser_names:
            beam_inclination_min = tf.reshape(
                _Dense(features['%s_beam_inclination_min' % laser]), [])
            beam_inclination_max = tf.reshape(
                _Dense(features['%s_beam_inclination_max' % laser]), [])
            outputs['%s_beam_inclinations' % laser] = tf.stack(
                [beam_inclination_min, beam_inclination_max], axis=0)

        # GBRs have non-uniform inclinations defined by 64 floats.
        for laser in p.gbr_laser_names:
            outputs['%s_beam_inclinations' % laser] = tf.reshape(
                _Dense(features['%s_beam_inclinations' % laser]), [64])

        # Embed xyz onto each range image pixel.
        for laser in p.cbr_laser_names + p.gbr_laser_names:
            extrinsics = outputs['%s_extrinsics' % laser]
            inclinations = outputs['%s_beam_inclinations' % laser]
            if laser in p.cbr_laser_names:
                ri_shape = p.cbr_ri_shape

                # Convert from 2-tuple range inclination to the full range
                # via linear interpolation.
                #
                # CBR lasers currently are always uniform inclinations specified by a
                # length 2 vector.
                height = ri_shape[0]
                min_inclination = inclinations[0]
                max_inclination = inclinations[1]
                diff = max_inclination - min_inclination
                ratio = (.5 + tf.cast(tf.range(
                    0, height), tf.float32)) / tf.cast(height, tf.float32)
                # interpolate from min to max inclination.
                inclinations = (ratio * diff) + min_inclination
            else:
                ri_shape = p.gbr_ri_shape

            pixel_pose = None
            if laser in p.gbr_laser_names:
                pixel_pose = tf.reshape(_Dense(features['%s_pose' % laser]),
                                        shape=p.gbr_ri_shape[0:2] + [4, 4])
                outputs['%s_pose' % laser] = pixel_pose

            for returns in p.returns:
                range_image = ri_outputs['%s_%s' % (laser, returns)]
                range_image = tf.reshape(range_image, ri_shape)
                range_image_mask = range_image[..., 0] >= 0
                ri_xyz = tf.cast(
                    self._XYZFromRangeImage(range_image, range_image_mask,
                                            extrinsics, inclinations,
                                            pixel_pose, frame_pose),
                    tf.float32)

                # Produce the NestedMap of xyz, features, mask.
                ri_result = py_utils.NestedMap({
                    'xyz':
                    ri_xyz,
                    'features':
                    range_image,
                    'mask':
                    tf.cast(range_image_mask, tf.float32),
                })

                outputs['%s_%s' % (laser, returns)] = ri_result

        return py_utils.NestedMap(outputs)
Exemple #25
0
    def ComputeLoss(self, theta, predictions, input_batch):
        """Compute loss for the sparse detector model v1.

    Args:
      theta: A `.NestedMap` object containing variable values of this task.
      predictions: A `.NestedMap` object containing residuals and
        classification_logits.
      input_batch: A `.NestedMap` expected to contain cell_center_xyz,
        cell_points_xyz, cell_feature, anchor_bboxes,
        anchor_localization_residuals, assigned_gt_labels, and
        assigned_cls_mask. See class doc string for details.

    Returns:
      Two dicts:

      - A dict containing str keys and (metric, weight) pairs as values, where
        one of the keys is expected to be 'loss'.
      - A dict containing arbitrary tensors describing something about each
        training example, where the first dimension of each tensor is the batch
        index.
    """
        p = self.params

        batch_size, num_centers = py_utils.GetShape(
            input_batch.cell_center_xyz, 2)

        # Assert shapes of inputs.
        anchor_bboxes = py_utils.HasShape(
            input_batch.anchor_bboxes,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center, 7])
        anchor_localization_residuals = py_utils.HasShape(
            input_batch.anchor_localization_residuals,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center, 7])
        predicted_residuals = py_utils.HasShape(
            predictions.residuals,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center, 7])

        assigned_gt_labels = py_utils.HasShape(
            input_batch.assigned_gt_labels,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center])
        predicted_classification_logits = py_utils.HasShape(
            predictions.classification_logits, [
                batch_size, num_centers, p.num_anchor_bboxes_per_center,
                p.num_classes
            ])

        # assigned_cls_mask is for weighting the classification loss.
        # Ignored targets will have their mask = 0; this happens when their IOU is
        # not high enough to be a foreground object and not low enough to be
        # background.
        class_weights = py_utils.HasShape(
            input_batch.assigned_cls_mask,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center])
        class_weights = tf.reshape(
            class_weights,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center, 1])

        # Broadcast per class loss weights. For each anchor, there are num_classes
        # prediction heads, we weight the outputs of these heads by the per class
        # loss weights.
        per_class_loss_weight = tf.constant([[[p.per_class_loss_weight]]],
                                            dtype=tf.float32)
        per_class_loss_weight = py_utils.HasShape(per_class_loss_weight,
                                                  [1, 1, 1, p.num_classes])
        class_weights *= per_class_loss_weight
        class_weights = py_utils.HasShape(class_weights, [
            batch_size, num_centers, p.num_anchor_bboxes_per_center,
            p.num_classes
        ])

        # We use assigned_reg_mask for masking the regression loss.
        # Only foreground objects will have assigned_reg_mask = 1.
        reg_weights = py_utils.HasShape(
            input_batch.assigned_reg_mask,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center])
        reg_weights = tf.reshape(
            reg_weights,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center, 1])

        if p.loss_norm_type == LossNormType.NORM_BY_NUM_POS_PER_CENTER:
            # Compute number of positive anchors per example.
            foreground_mask = py_utils.HasShape(
                input_batch.assigned_reg_mask,
                [batch_size, num_centers, p.num_anchor_bboxes_per_center])

            # Sum to get the number of foreground anchors for each example.
            loss_normalization = tf.reduce_sum(foreground_mask, axis=2)
            loss_normalization = tf.maximum(loss_normalization,
                                            tf.ones_like(loss_normalization))

            # Reshape for broadcasting.
            loss_normalization = tf.reshape(loss_normalization,
                                            [batch_size, num_centers, 1, 1])

            # Normalize so that the loss is independent of # centers.
            loss_normalization *= num_centers
            class_weights /= loss_normalization
            reg_weights /= loss_normalization

        classification_loss = py_utils.SigmoidCrossEntropyFocalLoss(
            logits=predicted_classification_logits,
            labels=tf.one_hot(assigned_gt_labels, p.num_classes),
            alpha=p.focal_loss_alpha,
            gamma=p.focal_loss_gamma)

        # Apply mask.
        classification_loss *= class_weights

        # TODO(jngiam): Consider normalizing by num_foreground_anchors for each
        # example instead. This would match the 1/N_positive normalization in
        # point pillars.

        # Reduce sum over centers, boxes and classes.
        classification_loss = tf.reduce_sum(classification_loss,
                                            axis=[1, 2, 3])

        # Reduce mean over batch.
        classification_loss = tf.reduce_mean(classification_loss)

        # Localization regression loss with Huber loss (SmoothL1).
        regression_loc_and_dims_loss = self._utils_3d.ScaledHuberLoss(
            labels=anchor_localization_residuals[..., :6],
            predictions=predicted_residuals[..., :6],
            delta=p.huber_loss_delta)

        # Rotation loss is computed on a transform on rotation_delta. For a
        # direction aware loss, we simply wrap the angles to -pi to pi; for a loss
        # that is symmetric to direction (i.e., rotating by pi), we use a sin
        # transform.
        rotation_delta_transform = tf.sin
        if p.direction_aware_rot_loss:
            rotation_delta_transform = functools.partial(geometry.WrapAngleRad,
                                                         min_val=-np.pi,
                                                         max_val=np.pi)
        rotation_delta = (predicted_residuals[..., 6:] -
                          anchor_localization_residuals[..., 6:])
        regression_rotation_loss = self._utils_3d.ScaledHuberLoss(
            labels=tf.zeros_like(rotation_delta),
            predictions=rotation_delta_transform(rotation_delta),
            delta=p.huber_loss_delta)

        reg_loc_loss = regression_loc_and_dims_loss[..., :3]
        reg_dim_loss = regression_loc_and_dims_loss[..., 3:6]

        gt_bboxes = self._utils_3d.ResidualsToBBoxes(
            anchor_bboxes,
            anchor_localization_residuals,
            min_angle_rad=-np.pi,
            max_angle_rad=np.pi)
        predicted_bboxes = self._utils_3d.ResidualsToBBoxes(
            anchor_bboxes,
            predicted_residuals,
            min_angle_rad=-np.pi,
            max_angle_rad=np.pi)

        # Apply mask to individual losses.
        #
        # And then reduce sum over centers, boxes, residuals, and batch
        # and divide by the batch_size.
        regression_rotation_loss *= reg_weights
        reg_rot_loss = tf.reduce_sum(regression_rotation_loss) / batch_size

        reg_loc_loss *= reg_weights
        reg_loc_loss = tf.reduce_sum(reg_loc_loss) / batch_size

        reg_dim_loss *= reg_weights
        reg_dim_loss = tf.reduce_sum(reg_dim_loss) / batch_size

        # Do not create corner loss graph if weight is 0.0
        # TODO(bcyang): Remove condition after fixing corner loss NaN issue
        if p.corner_loss_weight != 0.0:
            reg_corner_loss = self._utils_3d.CornerLoss(
                gt_bboxes=gt_bboxes, predicted_bboxes=predicted_bboxes)
            reg_corner_loss = tf.expand_dims(reg_corner_loss, axis=-1)

            reg_corner_loss *= reg_weights
            reg_corner_loss = tf.reduce_sum(reg_corner_loss) / batch_size
        else:
            reg_corner_loss = 0.0

        # Sum components of regression loss.
        regression_loss = (p.location_loss_weight * reg_loc_loss +
                           p.dimension_loss_weight * reg_dim_loss +
                           p.rotation_loss_weight * reg_rot_loss +
                           p.corner_loss_weight * reg_corner_loss)

        # Compute total loss.
        total_loss = (p.loss_weight_localization * regression_loss +
                      p.loss_weight_classification * classification_loss)

        metrics_dict = py_utils.NestedMap({
            'loss': (total_loss, batch_size),
            'loss/regression': (regression_loss, batch_size),
            'loss/regression/loc': (reg_loc_loss, batch_size),
            'loss/regression/dim': (reg_dim_loss, batch_size),
            'loss/regression/rot': (reg_rot_loss, batch_size),
            'loss/regression/corner': (reg_corner_loss, batch_size),
            'loss/classification': (classification_loss, batch_size),
        })

        # Calculate dimension errors
        dimension_errors_dict = self._BBoxDimensionErrors(
            gt_bboxes, predicted_bboxes, reg_weights)
        metrics_dict.update(dimension_errors_dict)

        per_example_dict = py_utils.NestedMap({
            'residuals': predicted_residuals,
            'classification_logits': predicted_classification_logits,
            'predicted_bboxes': predicted_bboxes,
            'gt_bboxes': gt_bboxes,
            'reg_weights': reg_weights,
        })

        return metrics_dict, per_example_dict
Exemple #26
0
    def _XYZFromRangeImage(self,
                           lidar_image,
                           lidar_image_mask,
                           extrinsics,
                           inclinations,
                           pixel_pose=None,
                           frame_pose=None):
        """Extract the cartesian coordinates from the range image.

    Args:
       lidar_image: [H, W, C] range image Tensor.
       lidar_image_mask: [H, W] boolean indicating which 2d coordinates in the
         lidar image are present.
       extrinsics: [4, 4] float matrix representing transformation matrix to
         world coordinates.
       inclinations: [V] beam inclinations vector.
       pixel_pose: [64, 2650, 4, 4] tensor representing per pixel pose of GBR.
       frame_pose: [4, 4] matrix representing vehicle to world transformation.

    Returns:
      [H, W, 3] range image cartesian coordinates.
    """
        height, width, channels = py_utils.GetShape(lidar_image, 3)

        conversion_dtype = tf.float32
        lidar_image = tf.cast(lidar_image, conversion_dtype)
        extrinsics = tf.cast(extrinsics, conversion_dtype)
        inclinations = tf.cast(inclinations, conversion_dtype)
        inclinations = tf.reverse(inclinations, axis=[-1])

        az_correction = py_utils.HasShape(
            tf.atan2(extrinsics[1, 0], extrinsics[0, 0]), [])
        ratios = (tf.cast(tf.range(width, 0, -1), dtype=conversion_dtype) -
                  .5) / tf.cast(width, conversion_dtype)
        ratios = py_utils.HasShape(ratios, [width])

        azimuth = (ratios * 2. - 1.) * np.pi - az_correction[..., tf.newaxis]
        azimuth = py_utils.HasShape(azimuth, [width])

        lidar_image_mask = lidar_image_mask[..., tf.newaxis]
        lidar_image_mask = tf.tile(lidar_image_mask, [1, 1, channels])
        lidar_image = tf.where(lidar_image_mask, lidar_image,
                               tf.zeros_like(lidar_image))
        lidar_image_range = lidar_image[..., 0]

        azimuth = py_utils.HasShape(azimuth[tf.newaxis, ...], [1, width])
        inclinations = py_utils.HasShape(inclinations[..., tf.newaxis],
                                         [height, 1])

        cos_azimuth = tf.cos(azimuth)
        sin_azimuth = tf.sin(azimuth)
        cos_incl = tf.cos(inclinations)
        sin_incl = tf.sin(inclinations)

        x = cos_azimuth * cos_incl * lidar_image_range
        y = sin_azimuth * cos_incl * lidar_image_range
        z = sin_incl * lidar_image_range

        lidar_image_points = tf.stack([x, y, z], -1)
        lidar_image_points = py_utils.HasShape(lidar_image_points,
                                               [height, width, 3])
        rotation = extrinsics[0:3, 0:3]
        translation = extrinsics[0:3, 3][tf.newaxis, ...]

        # Transform the image points in cartesian coordinates to
        # the world coordinate system using the extrinsics matrix.
        #
        # We first flatten the points, apply rotation, then
        # reshape to restore the original input and then apply
        # translation.
        lidar_image_points = tf.matmul(tf.reshape(lidar_image_points, [-1, 3]),
                                       rotation,
                                       transpose_b=True)
        lidar_image_points = tf.reshape(lidar_image_points, [height, width, 3])
        lidar_image_points += translation

        lidar_image_points = py_utils.HasShape(lidar_image_points,
                                               [height, width, 3])
        # GBR uses per pixel pose.
        if pixel_pose is not None:
            pixel_pose_rotation = pixel_pose[..., 0:3, 0:3]
            pixel_pose_translation = pixel_pose[..., 0:3, 3]
            lidar_image_points = tf.einsum(
                'hwij,hwj->hwi', pixel_pose_rotation,
                lidar_image_points) + pixel_pose_translation
            if frame_pose is None:
                raise ValueError(
                    'frame_pose must be set when pixel_pose is set.')
            # To vehicle frame corresponding to the given frame_pose
            # [4, 4]
            world_to_vehicle = tf.matrix_inverse(frame_pose)
            world_to_vehicle_rotation = world_to_vehicle[0:3, 0:3]
            world_to_vehicle_translation = world_to_vehicle[0:3, 3]
            # [H, W, 3]
            lidar_image_points = tf.einsum(
                'ij,hwj->hwi', world_to_vehicle_rotation, lidar_image_points
            ) + world_to_vehicle_translation[tf.newaxis, tf.newaxis, :]

        return lidar_image_points
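# Illustrative sketch (not part of the original example): a plain NumPy sanity
# check of the spherical-to-Cartesian conversion and extrinsics transform used
# above, on one hypothetical return (range 10 m, azimuth 0, inclination 0) and
# a made-up extrinsics with identity rotation and a 1 m x-translation.
import numpy as np

r, azimuth, inclination = 10.0, 0.0, 0.0
x = np.cos(azimuth) * np.cos(inclination) * r  # 10.0
y = np.sin(azimuth) * np.cos(inclination) * r  # 0.0
z = np.sin(inclination) * r                    # 0.0
rotation, translation = np.eye(3), np.array([1.0, 0.0, 0.0])
world_point = np.array([x, y, z]) @ rotation.T + translation  # [11., 0., 0.]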
Example #27
0
    def ComputePredictions(self, theta, input_batch):
        """Computes predictions for `input_batch`.

    Args:
      theta: A `.NestedMap` object containing variable values of this task.
      input_batch: A `.NestedMap` expected to contain lasers.points_xyz,
        lasers.points_feature, lasers.points_padding, cell_center_xyz,
        cell_points_xyz, cell_feature, anchor_bboxes,
        anchor_localization_residuals, assigned_gt_labels, and
        assigned_cls_mask. See class doc string for details.

    Returns:
      A `.NestedMap` object containing residuals and classification_logits.
    """
        p = self.params
        input_batch.Transform(
            lambda x: (x.shape, x.shape.num_elements())).VLog(
                1, 'input_batch shapes: ')
        cell_feature = py_utils.HasRank(input_batch.cell_feature, 4)
        batch_size, num_centers = py_utils.GetShape(cell_feature, 2)

        featurized_cell = self._CellFeaturizer(theta, input_batch)

        # Project each featurized_cell features to each bbox per center.
        featurized_anchors = self.cell_feature_projector.FProp(
            theta.cell_feature_projector, featurized_cell)

        # Reshape output so that we have features per offset.
        featurized_anchors = tf.reshape(
            featurized_anchors,
            [batch_size, num_centers, p.num_anchor_bboxes_offsets, -1])

        # Predict localization residuals.
        predicted_residuals = self.localization_regressor.FProp(
            theta.localization_regressor, featurized_anchors)
        predicted_residuals = tf.reshape(
            predicted_residuals,
            [batch_size, num_centers, p.num_anchor_bboxes_per_center, 7])

        if any([p.oracle_location, p.oracle_dimension, p.oracle_rotation]):
            gt_residuals = py_utils.HasShape(
                input_batch.anchor_localization_residuals,
                [batch_size, num_centers, p.num_anchor_bboxes_per_center, 7])
            residuals = []
            if p.oracle_location:
                residuals.append(gt_residuals[..., 0:3])
            else:
                residuals.append(predicted_residuals[..., 0:3])

            if p.oracle_dimension:
                residuals.append(gt_residuals[..., 3:6])
            else:
                residuals.append(predicted_residuals[..., 3:6])

            if p.oracle_rotation:
                residuals.append(gt_residuals[..., 6:])
            else:
                residuals.append(predicted_residuals[..., 6:])
            predicted_residuals = tf.concat(residuals, axis=-1)

        if p.squash_rotation_predictions:
            predicted_rotations = predicted_residuals[..., 6:]
            predicted_rotations = np.pi * tf.tanh(predicted_rotations)
            predicted_residuals = tf.concat(
                [predicted_residuals[..., :6], predicted_rotations], axis=-1)

        # Predict object classification at each bbox.
        predicted_classification_logits = self.classifier.FProp(
            theta.classifier, featurized_anchors)
        predicted_classification_logits = tf.reshape(
            predicted_classification_logits, [
                batch_size, num_centers, p.num_anchor_bboxes_per_center,
                p.num_classes
            ])

        if p.oracle_classification:
            assigned_gt_labels = py_utils.HasShape(
                input_batch.assigned_gt_labels,
                [batch_size, num_centers, p.num_anchor_bboxes_per_center])
            predicted_classification_logits = tf.one_hot(
                assigned_gt_labels, p.num_classes)

        return py_utils.NestedMap({
            'residuals':
            predicted_residuals,
            'classification_logits':
            predicted_classification_logits,
        })
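# Illustrative sketch (not part of the original example): the oracle branch
# above swaps selected residual components for ground truth and re-joins them
# along the last axis. A minimal stand-alone version with made-up tensors of
# shape [..., 7], enabling only oracle_location:
import tensorflow as tf

predicted = tf.zeros([2, 128, 4, 7])  # hypothetical predicted residuals
gt = tf.ones([2, 128, 4, 7])          # hypothetical ground-truth residuals
mixed = tf.concat(
    [gt[..., 0:3], predicted[..., 3:6], predicted[..., 6:]], axis=-1)
# mixed takes (x, y, z) from ground truth and the rest from the predictions.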
Example #28
0
    def FProp(self, theta, inputs, paddings):
        """Apply convolution to inputs.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      inputs: The inputs tensor. It is expected to be of shape [batch, time,
        frequency, channel]. The time dimension corresponds to the height
        dimension as in images and the frequency dimension corresponds to the
        width dimension as in images.
      paddings: The paddings tensor, expected to be of shape [batch, time].

    Returns:
      outputs, out_paddings pair.
    """
        p = self.params
        with tf.name_scope(p.name):
            inputs = py_utils.with_dependencies([
                py_utils.assert_shape_match(tf.shape(paddings), [-1, -1]),
                py_utils.assert_shape_match(
                    tf.shape(inputs),
                    tf.concat([
                        tf.shape(paddings),
                        [-1, symbolic.ToStatic(self.input_channels)]
                    ], 0))
            ], inputs)

            def _ApplyPadding(tensor_in, padding_in):
                padding_expanded = tf.expand_dims(
                    tf.expand_dims(padding_in, -1), -1)
                return tensor_in * (1.0 - padding_expanded)

            # Zeroing out padded inputs.
            inputs = _ApplyPadding(inputs, paddings)

            # Apply conv on 'inputs'.
            if p.v2_padding:
                padded_inputs, slice_len = _PadForLengthCompatibleStridesV2(
                    inputs, p.filter_stride[0], 'SAME', 0.)
                out = self._ApplyConv(theta, padded_inputs)
                if p.filter_stride[0] > 1:
                    slice_end = py_utils.GetShape(out)[1] - slice_len
                    out = out[:, :slice_end, :, :]
            else:
                out = self._ApplyConv(theta, inputs)

            if p.partial_conv:
                out = self._RescaleBoundary(out, paddings)
            # NOTE: This may be slightly inaccurate when p.dilation_rate[0] > 1,
            # but it is unlikely to cause real problems. Trying to account for
            # the dilation rate raises an error ("pooling with SAME padding is
            # not implemented for dilation_rate > 1"). Consider updating this
            # to use the actual output shape.
            if p.v2_padding:
                conv_padding = _ComputeConvOutputPaddingV2(
                    paddings,
                    window=p.filter_shape[0],
                    stride=p.filter_stride[0])
            else:
                conv_padding = ComputeConvOutputPadding(
                    paddings,
                    window=p.filter_stride[0],
                    stride=p.filter_stride[0])

            # Assuming padded nodes will be properly zero-ed out if necessary by
            # sub-sequent layers.
            # out = _ApplyPadding(out, conv_padding)
            out = py_utils.HasShape(
                out, symbolic.ToStatic(self.OutShape(tf.shape(inputs))))
            return out, conv_padding
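# Illustrative sketch (not part of the original example): how the padding mask
# above zeroes out padded time steps before the convolution is applied; shapes
# and values are made up ([batch=1, time=3, frequency=2, channel=1]).
import tensorflow as tf

inputs = tf.ones([1, 3, 2, 1])
paddings = tf.constant([[0., 0., 1.]])  # the last time step is padding
masked = inputs * (1.0 - paddings[:, :, tf.newaxis, tf.newaxis])
# masked[0, 2] is all zeros; the first two time steps are unchanged.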
Example #29
0
 def PadOne(inp):
     inp = py_utils.HasShape(inp, [-1, -1, 3])
     return tf.pad(inp, [[0, 0], [0, 0], [0, 1]], constant_values=1.0)
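# Illustrative usage sketch (not part of the original example), assuming
# PadOne is in scope: appending a homogeneous 1.0 to each 3D point lets a
# [4, 4] pose matrix be applied with a single matmul.
import tensorflow as tf

points = tf.zeros([2, 5, 3])   # [batch, num_points, 3]
homogeneous = PadOne(points)   # [batch, num_points, 4]; last channel is 1.0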
Example #30
0
    def ResidualsToBBoxes(self,
                          anchor_bboxes,
                          residuals,
                          min_angle_rad=-np.pi,
                          max_angle_rad=np.pi):
        r"""Converts anchor_boxes and residuals to predicted bboxes.

    This converts predicted residuals into bboxes using the following formulae::

      x_predicted = x_a + x_residual * diagonal_xy
      y_predicted = y_a + y_residual * diagonal_xy
      z_predicted = z_a + z_residual * dz_a

      dx_predicted = dx_a * exp(dx_residual)
      dy_predicted = dy_a * exp(dy_residual)
      dz_predicted = dz_a * exp(dz_residual)

      # Adding the residual, and bounding it between
      # [min_angle_rad, max_angle_rad]
      phi_predicted = NormalizeAngleRad(phi_a + phi_residual,
                                        min_angle_rad, max_angle_rad)

    These equations follow from those in LocalizationResiduals, where we solve
    for the \*_gt variables.

    Args:
      anchor_bboxes: A tf.float32 tensor whose last dimension [..., :7]
        contains (x, y, z, dx, dy, dz, phi), the parameters of each anchor
        bbox.
      residuals: tf.float32 of the same shape as anchor_bboxes containing
        predicted residuals at each anchor location.
      min_angle_rad: Scalar with the minimum angle allowed (before wrapping)
        in radians.
      max_angle_rad: Scalar with the maximum angle allowed (before wrapping)
        in radians. This value usually should be pi.

    Returns:
      A tf.float32 tensor of the same shape as anchor_bboxes with predicted
      bboxes.
    """
        anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
        anchor_bboxes = py_utils.with_dependencies(
            [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
        residuals = py_utils.HasShape(residuals, anchor_bboxes_shape)

        x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(anchor_bboxes,
                                                            num=7,
                                                            axis=-1)
        (x_residual, y_residual, z_residual, dx_residual, dy_residual,
         dz_residual, phi_residual) = tf.unstack(residuals, num=7, axis=-1)

        diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

        x_predicted = x_a + x_residual * diagonal_xy
        y_predicted = y_a + y_residual * diagonal_xy
        z_predicted = z_a + z_residual * dz_a

        dx_predicted = dx_a * tf.exp(dx_residual)
        dy_predicted = dy_a * tf.exp(dy_residual)
        dz_predicted = dz_a * tf.exp(dz_residual)

        # We bound the angle between [min_angle_rad, max_angle_rad], which
        # should be chosen based on how the calling model handles heading.
        # If the model uses a sine(delta_phi) transformation in the loss, it
        # cannot distinguish direction, so [min_angle_rad, max_angle_rad]
        # should be [0, np.pi].
        # If the heading encoding is directional, you most likely want
        # [min_angle_rad, max_angle_rad] to be [-np.pi, np.pi].
        phi_predicted = phi_a + phi_residual
        phi_predicted = geometry.WrapAngleRad(phi_predicted, min_angle_rad,
                                              max_angle_rad)

        return tf.stack([
            x_predicted,
            y_predicted,
            z_predicted,
            dx_predicted,
            dy_predicted,
            dz_predicted,
            phi_predicted,
        ],
                        axis=-1)  # pyformat: disable
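# Illustrative sketch (not part of the original example): the docstring's
# formulae applied to one hypothetical anchor/residual pair with plain NumPy.
import numpy as np

x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = 1.0, 2.0, 0.0, 4.0, 3.0, 2.0, 0.0
x_r, y_r, z_r, dx_r, dy_r, dz_r, phi_r = 0.1, 0.0, 0.5, 0.0, 0.2, 0.0, 0.25
diagonal_xy = np.sqrt(dx_a**2 + dy_a**2)  # 5.0
x_pred = x_a + x_r * diagonal_xy          # 1.5
z_pred = z_a + z_r * dz_a                 # 1.0
dy_pred = dy_a * np.exp(dy_r)             # ~3.66
phi_pred = phi_a + phi_r                  # 0.25 rad, already inside [-pi, pi]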