def cou_bb_proj(R_est, t_est, R_gt, t_gt, K, renderer, obj_id):
  """Complement over Union of projected 2D bounding boxes.

  :param R_est: 3x3 ndarray with the estimated rotation matrix.
  :param t_est: 3x1 ndarray with the estimated translation vector.
  :param R_gt: 3x3 ndarray with the ground-truth rotation matrix.
  :param t_gt: 3x1 ndarray with the ground-truth translation vector.
  :param K: 3x3 ndarray with an intrinsic camera matrix.
  :param renderer: Instance of the Renderer class (see renderer.py).
  :param obj_id: Object identifier.
  :return: The calculated error.
  """
  # Render depth images of the model at the estimated and the ground-truth pose.
  fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
  depth_est = renderer.render_object(
    obj_id, R_est, t_est, fx, fy, cx, cy)['depth']
  depth_gt = renderer.render_object(
    obj_id, R_gt, t_gt, fx, fy, cx, cy)['depth']

  # Masks of the rendered model and their intersection and union
  mask_est = depth_est > 0
  mask_gt = depth_gt > 0

  ys_est, xs_est = mask_est.nonzero()
  bb_est = misc.calc_2d_bbox(xs_est, ys_est, im_size=None, clip=False)

  ys_gt, xs_gt = mask_gt.nonzero()
  bb_gt = misc.calc_2d_bbox(xs_gt, ys_gt, im_size=None, clip=False)

  e = 1.0 - misc.iou(bb_est, bb_gt)
  return e
            px_count_visib = visib_gt.sum()

            # Visible surface fraction.
            if px_count_all > 0:
                visib_fract = px_count_visib / float(px_count_all)
            else:
                visib_fract = 0.0

            # Bounding box of the whole object silhouette
            # (including the truncated part).
            bbox = [-1, -1, -1, -1]
            if px_count_visib > 0:
                ys, xs = obj_mask_gt_large.nonzero()
                ys -= ren_cy_offset
                xs -= ren_cx_offset
                bbox = misc.calc_2d_bbox(xs, ys, im_size)

            # Bounding box of the visible surface part.
            bbox_visib = [-1, -1, -1, -1]
            if px_count_visib > 0:
                ys, xs = visib_gt.nonzero()
                bbox_visib = misc.calc_2d_bbox(xs, ys, im_size)

            # Store the calculated info.
            scene_gt_info[im_id].append({
                'px_count_all':
                int(px_count_all),
                'px_count_valid':
                int(px_count_valid),
                'px_count_visib':
                int(px_count_visib),
Exemple #3
0
def vis_object_poses(
      poses, K, renderer, rgb=None, depth=None, vis_rgb_path=None,
      vis_depth_diff_path=None, vis_rgb_resolve_visib=False):
  """Visualizes 3D object models in specified poses in a single image.

  Two visualizations are created:
  1. An RGB visualization (if vis_rgb_path is not None).
  2. A Depth-difference visualization (if vis_depth_diff_path is not None).

  :param poses: List of dictionaries, each with info about one pose:
    - 'obj_id': Object ID.
    - 'R': 3x3 ndarray with a rotation matrix.
    - 't': 3x1 ndarray with a translation vector.
    - 'text_info': Info to write at the object (see write_text_on_image).
  :param K: 3x3 ndarray with an intrinsic camera matrix.
  :param renderer: Instance of the Renderer class (see renderer.py).
  :param rgb: ndarray with the RGB input image.
  :param depth: ndarray with the depth input image.
  :param vis_rgb_path: Path to the output RGB visualization.
  :param vis_depth_diff_path: Path to the output depth-difference visualization.
  :param vis_rgb_resolve_visib: Whether to resolve visibility of the objects
    (i.e. only the closest object is visualized at each pixel).
  """
  fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]

  # Indicators of visualization types.
  vis_rgb = vis_rgb_path is not None
  vis_depth_diff = vis_depth_diff_path is not None

  if vis_rgb and rgb is None:
    raise ValueError('RGB visualization triggered but RGB image not provided.')

  if (vis_depth_diff or (vis_rgb and vis_rgb_resolve_visib)) and depth is None:
    raise ValueError('Depth visualization triggered but D image not provided.')

  # Prepare images for rendering.
  im_size = None
  ren_rgb = None
  ren_rgb_info = None
  ren_depth = None

  if vis_rgb:
    im_size = (rgb.shape[1], rgb.shape[0])
    ren_rgb = np.zeros(rgb.shape, np.uint8)
    ren_rgb_info = np.zeros(rgb.shape, np.uint8)

  if vis_depth_diff:
    if im_size and im_size != (depth.shape[1], depth.shape[0]):
        raise ValueError('The RGB and D images must have the same size.')
    else:
      im_size = (depth.shape[1], depth.shape[0])

  if vis_depth_diff or (vis_rgb and vis_rgb_resolve_visib):
    ren_depth = np.zeros((im_size[1], im_size[0]), np.float32)

  # Render the pose estimates one by one.
  for pose in poses:

    # Rendering.
    ren_out = renderer.render_object(
      pose['obj_id'], pose['R'], pose['t'], fx, fy, cx, cy)

    m_rgb = None
    if vis_rgb:
      m_rgb = ren_out['rgb']

    m_mask = None
    if vis_depth_diff or (vis_rgb and vis_rgb_resolve_visib):
      m_depth = ren_out['depth']

      # Get mask of the surface parts that are closer than the
      # surfaces rendered before.
      visible_mask = np.logical_or(ren_depth == 0, m_depth < ren_depth)
      m_mask = np.logical_and(m_depth != 0, visible_mask)

      ren_depth[m_mask] = m_depth[m_mask].astype(ren_depth.dtype)

    # Combine the RGB renderings.
    if vis_rgb:
      if vis_rgb_resolve_visib:
        ren_rgb[m_mask] = m_rgb[m_mask].astype(ren_rgb.dtype)
      else:
        ren_rgb_f = ren_rgb.astype(np.float32) + m_rgb.astype(np.float32)
        ren_rgb_f[ren_rgb_f > 255] = 255
        ren_rgb = ren_rgb_f.astype(np.uint8)

      # Draw 2D bounding box and write text info.
      obj_mask = np.sum(m_rgb > 0, axis=2)
      ys, xs = obj_mask.nonzero()
      if len(ys):
        # bbox_color = model_color
        # text_color = model_color
        bbox_color = (0.3, 0.3, 0.3)
        text_color = (1.0, 1.0, 1.0)
        text_size = 11

        bbox = misc.calc_2d_bbox(xs, ys, im_size)
        im_size = (obj_mask.shape[1], obj_mask.shape[0])
        ren_rgb_info = draw_rect(ren_rgb_info, bbox, bbox_color)

        if 'text_info' in pose:
          text_loc = (bbox[0] + 2, bbox[1])
          ren_rgb_info = write_text_on_image(
            ren_rgb_info, pose['text_info'], text_loc, color=text_color,
            size=text_size)

  # Blend and save the RGB visualization.
  if vis_rgb:
    vis_im_rgb = 0.5 * rgb.astype(np.float32) + \
                 0.5 * ren_rgb.astype(np.float32) + \
                 1.0 * ren_rgb_info.astype(np.float32)
    vis_im_rgb[vis_im_rgb > 255] = 255
    misc.ensure_dir(os.path.dirname(vis_rgb_path))
    inout.save_im(vis_rgb_path, vis_im_rgb.astype(np.uint8), jpg_quality=95)

  # Save the image of depth differences.
  if vis_depth_diff:
    # Calculate the depth difference at pixels where both depth maps
    # are valid.
    valid_mask = (depth > 0) * (ren_depth > 0)
    depth_diff = valid_mask * (depth - ren_depth.astype(np.float32))

    f, ax = plt.subplots(1, 1)
    cax = ax.matshow(depth_diff)
    ax.axis('off')
    ax.set_title('captured - GT depth [mm]')
    f.colorbar(cax, fraction=0.03, pad=0.01)
    f.tight_layout(pad=0)

    if not vis_rgb:
      misc.ensure_dir(os.path.dirname(vis_depth_diff_path))
    plt.savefig(vis_depth_diff_path, pad=0, bbox_inches='tight', quality=95)
    plt.close()
def vis_object_poses(
      poses, K, renderer, rgb=None, depth=None, vis_rgb_path=None,
      vis_depth_diff_path=None, vis_rgb_resolve_visib=False):
  """Visualizes 3D object models in specified poses in a single image.

  Two visualizations are created:
  1. An RGB visualization (if vis_rgb_path is not None).
  2. A Depth-difference visualization (if vis_depth_diff_path is not None).

  :param poses: List of dictionaries, each with info about one pose:
    - 'obj_id': Object ID.
    - 'R': 3x3 ndarray with a rotation matrix.
    - 't': 3x1 ndarray with a translation vector.
    - 'text_info': Info to write at the object (see write_text_on_image).
  :param K: 3x3 ndarray with an intrinsic camera matrix.
  :param renderer: Instance of the Renderer class (see renderer.py).
  :param rgb: ndarray with the RGB input image.
  :param depth: ndarray with the depth input image.
  :param vis_rgb_path: Path to the output RGB visualization.
  :param vis_depth_diff_path: Path to the output depth-difference visualization.
  :param vis_rgb_resolve_visib: Whether to resolve visibility of the objects
    (i.e. only the closest object is visualized at each pixel).
  """
  fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]

  # Indicators of visualization types.
  vis_rgb = vis_rgb_path is not None
  vis_depth_diff = vis_depth_diff_path is not None

  if vis_rgb and rgb is None:
    raise ValueError('RGB visualization triggered but RGB image not provided.')

  if (vis_depth_diff or (vis_rgb and vis_rgb_resolve_visib)) and depth is None:
    raise ValueError('Depth visualization triggered but D image not provided.')

  # Prepare images for rendering.
  im_size = None
  ren_rgb = None
  ren_rgb_info = None
  ren_depth = None

  if vis_rgb:
    im_size = (rgb.shape[1], rgb.shape[0])
    ren_rgb = np.zeros(rgb.shape, np.uint8)
    ren_rgb_info = np.zeros(rgb.shape, np.uint8)

  if vis_depth_diff:
    if im_size and im_size != (depth.shape[1], depth.shape[0]):
        raise ValueError('The RGB and D images must have the same size.')
    else:
      im_size = (depth.shape[1], depth.shape[0])

  if vis_depth_diff or (vis_rgb and vis_rgb_resolve_visib):
    ren_depth = np.zeros((im_size[1], im_size[0]), np.float32)

  # Render the pose estimates one by one.
  for pose in poses:

    # Rendering.
    ren_out = renderer.render_object(
      pose['obj_id'], pose['R'], pose['t'], fx, fy, cx, cy)

    m_rgb = None
    if vis_rgb:
      m_rgb = ren_out['rgb']

    m_mask = None
    if vis_depth_diff or (vis_rgb and vis_rgb_resolve_visib):
      m_depth = ren_out['depth']

      # Get mask of the surface parts that are closer than the
      # surfaces rendered before.
      visible_mask = np.logical_or(ren_depth == 0, m_depth < ren_depth)
      m_mask = np.logical_and(m_depth != 0, visible_mask)

      ren_depth[m_mask] = m_depth[m_mask].astype(ren_depth.dtype)

    # Combine the RGB renderings.
    if vis_rgb:
      if vis_rgb_resolve_visib:
        ren_rgb[m_mask] = m_rgb[m_mask].astype(ren_rgb.dtype)
      else:
        ren_rgb_f = ren_rgb.astype(np.float32) + m_rgb.astype(np.float32)
        ren_rgb_f[ren_rgb_f > 255] = 255
        ren_rgb = ren_rgb_f.astype(np.uint8)

      # Draw 2D bounding box and write text info.
      obj_mask = np.sum(m_rgb > 0, axis=2)
      ys, xs = obj_mask.nonzero()
      if len(ys):
        # bbox_color = model_color
        # text_color = model_color
        bbox_color = (0.3, 0.3, 0.3)
        text_color = (1.0, 1.0, 1.0)
        text_size = 11

        bbox = misc.calc_2d_bbox(xs, ys, im_size)
        im_size = (obj_mask.shape[1], obj_mask.shape[0])
        ren_rgb_info = draw_rect(ren_rgb_info, bbox, bbox_color)

        if 'text_info' in pose:
          text_loc = (bbox[0] + 2, bbox[1])
          ren_rgb_info = write_text_on_image(
            ren_rgb_info, pose['text_info'], text_loc, color=text_color,
            size=text_size)

  # Blend and save the RGB visualization.
  if vis_rgb:
    misc.ensure_dir(os.path.dirname(vis_rgb_path))

    vis_im_rgb = 0.5 * rgb.astype(np.float32) + \
                 0.5 * ren_rgb.astype(np.float32) + \
                 1.0 * ren_rgb_info.astype(np.float32)
    vis_im_rgb[vis_im_rgb > 255] = 255
    inout.save_im(vis_rgb_path, vis_im_rgb.astype(np.uint8), jpg_quality=95)

  # Save the image of depth differences.
  if vis_depth_diff:
    misc.ensure_dir(os.path.dirname(vis_depth_diff_path))

    # Calculate the depth difference at pixels where both depth maps are valid.
    valid_mask = (depth > 0) * (ren_depth > 0)
    depth_diff = valid_mask * (ren_depth.astype(np.float32) - depth)

    # Get mask of pixels where the rendered depth is at most by the tolerance
    # delta behind the captured depth (this tolerance is used in VSD).
    delta = 15
    below_delta = valid_mask * (depth_diff < delta)
    below_delta_vis = (255 * below_delta).astype(np.uint8)

    depth_diff_vis = 255 * depth_for_vis(depth_diff - depth_diff.min())

    # Pixels where the rendered depth is more than the tolerance delta behing
    # the captured depth will be cyan.
    depth_diff_vis = np.dstack(
      [below_delta_vis, depth_diff_vis, depth_diff_vis]).astype(np.uint8)

    depth_diff_vis[np.logical_not(valid_mask)] = 0
    depth_diff_valid = depth_diff[valid_mask]
    depth_info = [
      {'name': 'min diff', 'fmt': ':.3f', 'val': np.min(depth_diff_valid)},
      {'name': 'max diff', 'fmt': ':.3f', 'val': np.max(depth_diff_valid)},
      {'name': 'mean diff', 'fmt': ':.3f', 'val': np.mean(depth_diff_valid)},
    ]
    depth_diff_vis = write_text_on_image(depth_diff_vis, depth_info)
    inout.save_im(vis_depth_diff_path, depth_diff_vis)