Example #1
def infer_depth(rgb_image, params):
    """Runs depth inference given an RGB frame.

  Args:
    rgb_image: A tf.Tensor or shape [B, H, W, 3] containing RGB images.
    params: A dictionary of parameters contraining overrides for
      DEFAULT_PARAMS.

  Returns:
    A tf.Tensor of shape [B, H, W, 1] containing the inferred depths.
  """
    if rgb_image.shape.rank != 4:
        raise ValueError('rgb_image should have rank 4, not %d.' %
                         rgb_image.shape.rank)
    params = parameter_container.ParameterContainer.from_defaults_and_overrides(
        DEFAULT_PARAMS, params, is_strict=True, strictness_depth=2)

    depth_predictor = depth_prediction_nets.ResNet18DepthPredictor(
        tf.estimator.ModeKeys.PREDICT, params.depth_predictor_params.as_dict())
    return depth_predictor.predict_depth(rgb_image)
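
A minimal usage sketch for infer_depth above, assuming the module defining it (and its parameter_container / depth_prediction_nets dependencies) is importable and runs under TF1-style graph mode. The batch size, the 128 x 416 resolution, and the empty override dictionary are illustrative assumptions, not taken from the example:

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()  # The example above uses TF1 graph-mode APIs.

# Hypothetical batch of four 128 x 416 RGB frames with values in [0, 1].
rgb_batch = tf.constant(np.random.rand(4, 128, 416, 3), dtype=tf.float32)

# An empty override dictionary runs inference with DEFAULT_PARAMS unchanged.
depth = infer_depth(rgb_batch, {})

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(depth).shape)  # Per the docstring: (4, 128, 416, 1).
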
Example #2
def loss_fn(features, mode, params):
    """Computes the training loss for depth and egomotion training.

  This function is written with TPU-friendlines in mind.

  Args:
    features: A dictionary mapping strings to tuples of (tf.Tensor, tf.Tensor),
      representing pairs of frames. The loss will be calculated from these
      tensors. The expected endpoints are 'rgb', 'depth', 'intrinsics_mat'
      and 'intrinsics_mat_inv'.
    mode: One of tf.estimator.ModeKeys: TRAIN, PREDICT or EVAL.
    params: A dictionary with hyperparameters that optionally override
      DEFAULT_PARAMS above.

  Returns:
    A dictionary mapping each loss name (see DEFAULT_PARAMS['loss_weights']'s
    keys) to a scalar tf.Tensor representing the respective loss. The total
    training loss.

  Raises:
    ValueError: `features` endpoints that don't conform with their expected
       structure.
  """
    params = parameter_container.ParameterContainer.from_defaults_and_overrides(
        DEFAULT_PARAMS, params, is_strict=True, strictness_depth=2)

    if len(features['rgb']) != 2 or ('depth' in features and
                                     len(features['depth']) != 2):
        raise ValueError(
            'RGB and depth endpoints are expected to be a tuple of two'
            ' tensors. Rather, they are %s.' % str(features))

    # On TPU we strive to stack tensors together and perform ops once on the
    # entire stack, to save time and HBM memory. We thus stack the batch of
    # first frames and the batch of second frames, for both depth and RGB. The
    # batch dimensions of rgb_stack and gt_depth_stack are thus twice the
    # original batch size.
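    # For example, with batch size B and H x W images, features['rgb'] is a
    # pair of [B, H, W, 3] tensors and rgb_stack below has shape [2B, H, W, 3].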
    rgb_stack = tf.concat(features['rgb'], axis=0)

    depth_predictor = depth_prediction_nets.ResNet18DepthPredictor(
        mode, params.depth_predictor_params.as_dict())
    predicted_depth = depth_predictor.predict_depth(rgb_stack)
    maybe_summary.histogram('PredictedDepth', predicted_depth)

    endpoints = {}
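    # Split the stacked predictions back into (first-frame, second-frame) pairs.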
    endpoints['predicted_depth'] = tf.split(predicted_depth, 2, axis=0)
    endpoints['rgb'] = features['rgb']

    # We make the heuristic that depths that are less than 0.2 meters are not
    # accurate. This is a rough placeholder for a confidence map that we're going
    # to have in future.
    if 'depth' in features:
        endpoints['groundtruth_depth'] = features['depth']

    if params.cascade:
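        # In cascade mode, the predicted depth is appended to the RGB frames as
        # a fourth input channel for the motion prediction network.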
        motion_features = [
            tf.concat([features['rgb'][0], endpoints['predicted_depth'][0]],
                      axis=-1),
            tf.concat([features['rgb'][1], endpoints['predicted_depth'][1]],
                      axis=-1)
        ]
    else:
        motion_features = features['rgb']

    motion_features_stack = tf.concat(motion_features, axis=0)
    flipped_motion_features_stack = tf.concat(motion_features[::-1], axis=0)
    # Unlike `rgb_stack`, here we stacked the frames in reverse order along the
    # batch dimension. By concatenating the two stacks below along the channel
    # axis, we create the following tensor:
    #
    #         Channel dimension (3)
    #   _                                 _
    #  |  Frame1's batch | Frame2's batch  |____Batch
    #  |_ Frame2's batch | Frame1's batch _|    dimension (0)
    #
    # When we send this tensor to the motion prediction network, the first and
    # second halves of the result represent the camera motion from Frame1 to
    # Frame2 and from Frame2 to Frame1 respectively. Further below we impose a
    # loss that drives these two to be the inverses of one another
    # (cycle-consistency).
    pairs = tf.concat([motion_features_stack, flipped_motion_features_stack],
                      axis=-1)

    rot, trans, residual_translation, intrinsics_mat = (
        object_motion_nets.motion_field_net(
            images=pairs,
            weight_reg=params.motion_prediction_params.weight_reg,
            align_corners=params.motion_prediction_params.align_corners,
            auto_mask=params.motion_prediction_params.auto_mask))

    if params.motion_field_burnin_steps > 0.0:
        step = tf.to_float(tf.train.get_or_create_global_step())
        burnin_steps = tf.to_float(params.motion_field_burnin_steps)
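        # The multiplier below is 0 until the global step reaches
        # burnin_steps / 2 and then ramps linearly to 1.0 at burnin_steps, so
        # the residual (object) motion field is switched on gradually.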
        residual_translation *= tf.clip_by_value(2 * step / burnin_steps - 1,
                                                 0.0, 1.0)

    # If using ground-truth egomotion, replace the predicted egomotion with it.
    if not params.learn_egomotion:
        egomotion_mat = tf.concat(features['egomotion_mat'], axis=0)
        rot = transform_utils.angles_from_matrix(egomotion_mat[:, :3, :3])
        trans = egomotion_mat[:, :3, 3]
        trans = tf.expand_dims(trans, 1)
        trans = tf.expand_dims(trans, 1)

    if params.use_mask:
        mask = tf.to_float(tf.concat(features['mask'], axis=0) > 0)
        if params.foreground_dilation > 0:
            pool_size = params.foreground_dilation * 2 + 1
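            # Max-pooling the binary mask with a (2d + 1) x (2d + 1) window and
            # stride 1 dilates the foreground by d = params.foreground_dilation
            # pixels on each side.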
            mask = tf.nn.max_pool(mask, [1, pool_size, pool_size, 1], [1] * 4,
                                  'SAME')
        residual_translation *= mask

    maybe_summary.histogram('ResidualTranslation', residual_translation)
    maybe_summary.histogram('BackgroundTranslation', trans)
    maybe_summary.histogram('Rotation', rot)
    endpoints['residual_translation'] = tf.split(residual_translation,
                                                 2,
                                                 axis=0)
    endpoints['background_translation'] = tf.split(trans, 2, axis=0)
    endpoints['rotation'] = tf.split(rot, 2, axis=0)

    if not params.learn_intrinsics.enabled:
        endpoints['intrinsics_mat'] = features['intrinsics_mat']
        endpoints['intrinsics_mat_inv'] = features['intrinsics_mat_inv']
    elif params.learn_intrinsics.per_video:
        int_mat = intrinsics_utils.create_and_fetch_intrinsics_per_video_index(
            features['video_index'][0],
            params.image_preprocessing.image_height,
            params.image_preprocessing.image_width,
            max_video_index=params.learn_intrinsics.max_number_of_videos)
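        # Both frames of a pair come from the same video, so the same intrinsic
        # matrix applies to both halves of the stacked batch.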
        endpoints['intrinsics_mat'] = tf.concat([int_mat] * 2, axis=0)
        endpoints[
            'intrinsics_mat_inv'] = intrinsics_utils.invert_intrinsics_matrix(
                int_mat)
    else:
        # The intrinsic matrix should be the same, no matter the order of
        # images (mat = inv_mat). It's probably a good idea to enforce this
        # by a loss, but for now we just take their average as a prediction for the
        # intrinsic matrix.
        intrinsics_mat = 0.5 * sum(tf.split(intrinsics_mat, 2, axis=0))
        endpoints['intrinsics_mat'] = [intrinsics_mat] * 2
        endpoints['intrinsics_mat_inv'] = [
            intrinsics_utils.invert_intrinsics_matrix(intrinsics_mat)
        ] * 2

    aggregator = loss_aggregator.DepthMotionFieldLossAggregator(
        endpoints, params.loss_weights.as_dict(), params.loss_params.as_dict())

    # Add some more summaries.
    maybe_summary.image('rgb0', features['rgb'][0])
    maybe_summary.image('rgb1', features['rgb'][1])
    disp0, disp1 = tf.split(aggregator.output_endpoints['disparity'],
                            2,
                            axis=0)
    maybe_summary.image('disparity0/grayscale', disp0)
    maybe_summary.image_with_colormap('disparity0/plasma',
                                      tf.squeeze(disp0, axis=3), 'plasma', 0.0)
    maybe_summary.image('disparity1/grayscale', disp1)
    maybe_summary.image_with_colormap('disparity1/plasma',
                                      tf.squeeze(disp1, axis=3), 'plasma', 0.0)
    if maybe_summary.summaries_enabled():
        if 'depth' in features:
            gt_disp0 = 1.0 / tf.maximum(features['depth'][0], 0.5)
            gt_disp1 = 1.0 / tf.maximum(features['depth'][1], 0.5)
            maybe_summary.image('disparity_gt0', gt_disp0)
            maybe_summary.image('disparity_gt1', gt_disp1)

        depth_proximity_weight0, depth_proximity_weight1 = tf.split(
            aggregator.output_endpoints['depth_proximity_weight'], 2, axis=0)
        maybe_summary.image('consistency_weight0',
                            tf.expand_dims(depth_proximity_weight0, -1))
        maybe_summary.image('consistency_weight1',
                            tf.expand_dims(depth_proximity_weight1, -1))
        maybe_summary.image('trans', aggregator.output_endpoints['trans'])
        maybe_summary.image('trans_inv',
                            aggregator.output_endpoints['inv_trans'])
        maybe_summary.image('trans_res', endpoints['residual_translation'][0])
        maybe_summary.image('trans_res_inv',
                            endpoints['residual_translation'][1])

    return aggregator.losses
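
A minimal, hypothetical sketch of how the loss dictionary returned by loss_fn might be consumed inside a TF1 Estimator model_fn. Summing the losses with tf.add_n, the Adam optimizer, and the fixed learning rate are illustrative assumptions, not part of the example above; only the TRAIN path is shown:

import tensorflow.compat.v1 as tf


def model_fn(features, labels, mode, params):
    """Hypothetical Estimator model_fn that wraps loss_fn above (TRAIN only)."""
    del labels  # Supervision comes from the endpoints inside `features`.
    losses = loss_fn(features, mode, params)
    # Sum the per-loss scalars of the returned dictionary into one total loss.
    total_loss = tf.add_n(list(losses.values()))
    train_op = tf.train.AdamOptimizer(1e-4).minimize(
        total_loss, global_step=tf.train.get_or_create_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss,
                                      train_op=train_op)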