# Convert the depth image to a distance image, i.e. to the same representation
# that the rendered depth images are converted to below.
# NOTE(review): the enclosing loops (presumably over scenes and images) are
# outside of this view - confirm the nesting level of this fragment.
dist_im = misc.depth_im_to_dist_im(depth_im, K)

# Calculate and save two masks for every GT annotation of the image `im_id`.
for gt_id, gt in enumerate(scene_gt[im_id]):

    # Render the depth image.
    depth_gt = ren.render_object(
        gt['obj_id'], gt['cam_R_m2c'], gt['cam_t_m2c'], fx, fy, cx, cy)['depth']

    # Convert depth image to distance image.
    dist_gt = misc.depth_im_to_dist_im(depth_gt, K)

    # Mask of the full object silhouette.
    mask = dist_gt > 0

    # Mask of the visible part of the object silhouette.
    mask_visib = visibility.estimate_visib_mask_gt(
        dist_im, dist_gt, p['delta'], visib_mode='bop19')

    # Save the calculated masks (boolean masks are stored as 0/255 images).
    mask_path = dp_split['mask_tpath'].format(
        scene_id=scene_id, im_id=im_id, gt_id=gt_id)
    inout.save_im(mask_path, 255 * mask.astype(np.uint8))

    mask_visib_path = dp_split['mask_visib_tpath'].format(
        scene_id=scene_id, im_id=im_id, gt_id=gt_id)
    inout.save_im(mask_visib_path, 255 * mask_visib.astype(np.uint8))
}) # Visualization of the visibility mask. if p['vis_visibility_masks']: depth_im_vis = visualization.depth_for_vis(depth, 0.2, 1.0) depth_im_vis = np.dstack([depth_im_vis] * 3) visib_gt_vis = visib_gt.astype(np.float) zero_ch = np.zeros(visib_gt_vis.shape) visib_gt_vis = np.dstack([zero_ch, visib_gt_vis, zero_ch]) vis = 0.5 * depth_im_vis + 0.5 * visib_gt_vis vis[vis > 1] = 1 vis_path = p['vis_mask_visib_tpath'].format( delta=p['delta'], dataset=p['dataset'], split=p['dataset_split'], scene_id=scene_id, im_id=im_id, gt_id=gt_id) misc.ensure_dir(os.path.dirname(vis_path)) inout.save_im(vis_path, vis) # Save the info for the current scene. scene_gt_info_path = dp_split['scene_gt_info_tpath'].format( scene_id=scene_id) misc.ensure_dir(os.path.dirname(scene_gt_info_path)) inout.save_json(scene_gt_info_path, scene_gt_info)
def visualize_pred_frag(frag_confs,
                        frag_coords,
                        output_size,
                        model_store,
                        vis_prefix,
                        vis_dir,
                        vis_ext='png'):
    """Saves grid visualizations of the predicted fragment fields.

    At every field location, the fragment with the highest confidence is selected
    for each object. Three grids with one tile per object are then saved to
    `vis_dir`: the centers of the selected fragments, the predicted 3D
    coordinates scaled by the sizes of the selected fragments, and the sum of
    the two (the reconstruction).

    Args:
      frag_confs: Predicted fragment confidences of shape
        [output_h, output_w, num_objs, num_frags].
      frag_coords: Predicted 3D fragment coordinates of shape
        [field_h, field_w, num_fg_cls, num_bins, 3].
      output_size: Size of the fragment fields.
      model_store: Store of 3D object models (its `frag_centers` and
        `frag_sizes` are indexed by the 1-based object ID).
      vis_prefix: Name prefix of the visualizations.
      vis_dir: Where to save the visualizations.
      vis_ext: Extension of the visualizations ('jpg', 'png', etc.).
    """
    centers_tiles, coords_tiles, reconst_tiles = [], [], []

    def _labelled_tile(xyz_field, label):
        # Colorized field with the object label written on it.
        return visualization.write_text_on_image(
            colorize_xyz(xyz_field), label, size=10, color=(1.0, 1.0, 1.0))

    # Object IDs are 1-based, the object axis of the fields is 0-based.
    for obj_id in range(1, frag_confs.shape[2] + 1):
        obj_ind = obj_id - 1

        # Confidences of one object, shape [field_h, field_w, num_frags].
        obj_confs = frag_confs[:, :, obj_ind, :]
        xyz_shape = (obj_confs.shape[0], obj_confs.shape[1], 3)

        # Most confident fragment per location (2D and flattened).
        best_frags = np.argmax(obj_confs, axis=2)
        best_frags_flat = best_frags.flatten()

        # Centers of the selected fragments, shape [field_h, field_w, 3].
        best_centers = np.reshape(
            model_store.frag_centers[obj_id][best_frags_flat], xyz_shape)

        # Coordinates of all fragments, shape [field_h * field_w, num_frags, 3].
        num_frags = frag_coords.shape[3]
        all_coords = frag_coords[:, :, obj_ind, :, :].reshape(
            (-1, num_frags, 3))

        # Relative coordinates of the selected fragments, shape
        # [field_h * field_w, 3], scaled by the fragment sizes.
        location_inds = np.arange(best_frags.size)
        rel_coords = all_coords[location_inds, best_frags_flat]
        frag_scales = model_store.frag_sizes[obj_id][best_frags_flat]
        best_coords = rel_coords * frag_scales.reshape((-1, 1))
        best_coords = best_coords.reshape(xyz_shape)

        # Reconstruction of shape [field_h, field_w, 3].
        reconst = best_centers + best_coords

        label = [{'name': 'cls', 'val': obj_id, 'fmt': ':d'}]
        centers_tiles.append(_labelled_tile(best_centers, label))
        coords_tiles.append(_labelled_tile(best_coords, label))
        reconst_tiles.append(_labelled_tile(reconst, label))

    # Assemble and save the visualization grids.
    outputs = (
        ('{}_pred_frag_centers.{}', centers_tiles),
        ('{}_pred_frag_coords.{}', coords_tiles),
        ('{}_pred_frag_reconst.{}', reconst_tiles),
    )
    for name_template, tiles in outputs:
        fname = name_template.format(vis_prefix, vis_ext)
        grid = build_grid(tiles, output_size)
        inout.save_im(os.path.join(vis_dir, fname), grid)
def vis_object_poses(
        poses, K, renderer, rgb=None, depth=None, vis_rgb_path=None,
        vis_depth_diff_path=None, vis_rgb_resolve_visib=False):
    """Visualizes 3D object models in specified poses in a single image.

    Two visualizations are created:
    1. An RGB visualization (if vis_rgb_path is not None).
    2. A Depth-difference visualization (if vis_depth_diff_path is not None),
       plotted with matplotlib as "captured - rendered" depth.

    :param poses: List of dictionaries, each with info about one pose:
      - 'obj_id': Object ID.
      - 'R': 3x3 ndarray with a rotation matrix.
      - 't': 3x1 ndarray with a translation vector.
      - 'text_info': Info to write at the object (see write_text_on_image).
    :param K: 3x3 ndarray with an intrinsic camera matrix.
    :param renderer: Instance of the Renderer class (see renderer.py).
    :param rgb: ndarray with the RGB input image.
    :param depth: ndarray with the depth input image.
    :param vis_rgb_path: Path to the output RGB visualization.
    :param vis_depth_diff_path: Path to the output depth-difference
      visualization.
    :param vis_rgb_resolve_visib: Whether to resolve visibility of the objects
      (i.e. only the closest object is visualized at each pixel).
    :raises ValueError: If an image required by a requested visualization is
      missing, or if the RGB and depth images differ in size.
    """
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]

    # Indicators of visualization types.
    vis_rgb = vis_rgb_path is not None
    vis_depth_diff = vis_depth_diff_path is not None

    # The rendered depth is needed for the depth differences and for deciding
    # which object is the closest one at each pixel.
    use_depth = vis_depth_diff or (vis_rgb and vis_rgb_resolve_visib)

    if vis_rgb and rgb is None:
        raise ValueError(
            'RGB visualization triggered but RGB image not provided.')

    if use_depth and depth is None:
        raise ValueError(
            'Depth visualization triggered but D image not provided.')

    # Prepare images for rendering.
    im_size = None
    ren_rgb = None
    ren_rgb_info = None
    ren_depth = None

    if vis_rgb:
        im_size = (rgb.shape[1], rgb.shape[0])
        ren_rgb = np.zeros(rgb.shape, np.uint8)
        ren_rgb_info = np.zeros(rgb.shape, np.uint8)

    if vis_depth_diff:
        depth_size = (depth.shape[1], depth.shape[0])
        if im_size and im_size != depth_size:
            raise ValueError('The RGB and D images must have the same size.')
        im_size = depth_size

    if use_depth:
        ren_depth = np.zeros((im_size[1], im_size[0]), np.float32)

    # Render the pose estimates one by one.
    for pose in poses:

        # Rendering.
        ren_out = renderer.render_object(
            pose['obj_id'], pose['R'], pose['t'], fx, fy, cx, cy)

        m_rgb = ren_out['rgb'] if vis_rgb else None

        m_mask = None
        if use_depth:
            m_depth = ren_out['depth']

            # Get mask of the surface parts that are closer than the
            # surfaces rendered before.
            visible_mask = np.logical_or(ren_depth == 0, m_depth < ren_depth)
            m_mask = np.logical_and(m_depth != 0, visible_mask)

            ren_depth[m_mask] = m_depth[m_mask].astype(ren_depth.dtype)

        if not vis_rgb:
            continue

        # Combine the RGB renderings.
        if vis_rgb_resolve_visib:
            ren_rgb[m_mask] = m_rgb[m_mask].astype(ren_rgb.dtype)
        else:
            # Saturating addition of the renderings.
            ren_rgb_f = ren_rgb.astype(np.float32) + m_rgb.astype(np.float32)
            ren_rgb_f[ren_rgb_f > 255] = 255
            ren_rgb = ren_rgb_f.astype(np.uint8)

        # Draw 2D bounding box and write text info.
        obj_mask = np.sum(m_rgb > 0, axis=2)
        ys, xs = obj_mask.nonzero()
        if len(ys):
            bbox_color = (0.3, 0.3, 0.3)
            text_color = (1.0, 1.0, 1.0)
            text_size = 11

            bbox = misc.calc_2d_bbox(xs, ys, im_size)
            ren_rgb_info = draw_rect(ren_rgb_info, bbox, bbox_color)

            if 'text_info' in pose:
                text_loc = (bbox[0] + 2, bbox[1])
                ren_rgb_info = write_text_on_image(
                    ren_rgb_info, pose['text_info'], text_loc,
                    color=text_color, size=text_size)

    # Blend and save the RGB visualization.
    if vis_rgb:
        vis_im_rgb = (0.5 * rgb.astype(np.float32) +
                      0.5 * ren_rgb.astype(np.float32) +
                      1.0 * ren_rgb_info.astype(np.float32))
        vis_im_rgb[vis_im_rgb > 255] = 255
        misc.ensure_dir(os.path.dirname(vis_rgb_path))
        inout.save_im(vis_rgb_path, vis_im_rgb.astype(np.uint8), jpg_quality=95)

    # Save the image of depth differences.
    if vis_depth_diff:
        # Calculate the depth difference at pixels where both depth maps
        # are valid.
        valid_mask = (depth > 0) * (ren_depth > 0)
        depth_diff = valid_mask * (depth - ren_depth.astype(np.float32))

        f, ax = plt.subplots(1, 1)
        cax = ax.matshow(depth_diff)
        ax.axis('off')
        ax.set_title('captured - GT depth [mm]')
        f.colorbar(cax, fraction=0.03, pad=0.01)
        f.tight_layout(pad=0)

        # The output directory is always ensured: it may differ from the
        # directory of the RGB visualization.
        misc.ensure_dir(os.path.dirname(vis_depth_diff_path))

        # `pad_inches` is the savefig parameter for the padding; the JPEG
        # quality is passed via `pil_kwargs` and only for JPEG outputs, as
        # other backends do not accept it.
        save_kwargs = {'bbox_inches': 'tight', 'pad_inches': 0}
        out_ext = os.path.splitext(vis_depth_diff_path)[1].lower()
        if out_ext in ('.jpg', '.jpeg'):
            save_kwargs['pil_kwargs'] = {'quality': 95}
        f.savefig(vis_depth_diff_path, **save_kwargs)
        plt.close(f)
def vis_object_poses(
        poses, K, renderer, rgb=None, depth=None, vis_rgb_path=None,
        vis_depth_diff_path=None, vis_rgb_resolve_visib=False):
    """Visualizes 3D object models in specified poses in a single image.

    Two visualizations are created:
    1. An RGB visualization (if vis_rgb_path is not None).
    2. A Depth-difference visualization (if vis_depth_diff_path is not None),
       saved as an image of "rendered - captured" depth with the min/max/mean
       difference written on it.

    :param poses: List of dictionaries, each with info about one pose:
      - 'obj_id': Object ID.
      - 'R': 3x3 ndarray with a rotation matrix.
      - 't': 3x1 ndarray with a translation vector.
      - 'text_info': Info to write at the object (see write_text_on_image).
    :param K: 3x3 ndarray with an intrinsic camera matrix.
    :param renderer: Instance of the Renderer class (see renderer.py).
    :param rgb: ndarray with the RGB input image.
    :param depth: ndarray with the depth input image.
    :param vis_rgb_path: Path to the output RGB visualization.
    :param vis_depth_diff_path: Path to the output depth-difference
      visualization.
    :param vis_rgb_resolve_visib: Whether to resolve visibility of the objects
      (i.e. only the closest object is visualized at each pixel).
    :raises ValueError: If an image required by a requested visualization is
      missing, or if the RGB and depth images differ in size.
    """
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]

    # Indicators of visualization types.
    vis_rgb = vis_rgb_path is not None
    vis_depth_diff = vis_depth_diff_path is not None

    # The rendered depth is needed for the depth differences and for deciding
    # which object is the closest one at each pixel.
    use_depth = vis_depth_diff or (vis_rgb and vis_rgb_resolve_visib)

    if vis_rgb and rgb is None:
        raise ValueError(
            'RGB visualization triggered but RGB image not provided.')

    if use_depth and depth is None:
        raise ValueError(
            'Depth visualization triggered but D image not provided.')

    # Prepare images for rendering.
    im_size = None
    ren_rgb = None
    ren_rgb_info = None
    ren_depth = None

    if vis_rgb:
        im_size = (rgb.shape[1], rgb.shape[0])
        ren_rgb = np.zeros(rgb.shape, np.uint8)
        ren_rgb_info = np.zeros(rgb.shape, np.uint8)

    if vis_depth_diff:
        depth_size = (depth.shape[1], depth.shape[0])
        if im_size and im_size != depth_size:
            raise ValueError('The RGB and D images must have the same size.')
        im_size = depth_size

    if use_depth:
        ren_depth = np.zeros((im_size[1], im_size[0]), np.float32)

    # Render the pose estimates one by one.
    for pose in poses:

        # Rendering.
        ren_out = renderer.render_object(
            pose['obj_id'], pose['R'], pose['t'], fx, fy, cx, cy)

        m_rgb = ren_out['rgb'] if vis_rgb else None

        m_mask = None
        if use_depth:
            m_depth = ren_out['depth']

            # Get mask of the surface parts that are closer than the
            # surfaces rendered before.
            visible_mask = np.logical_or(ren_depth == 0, m_depth < ren_depth)
            m_mask = np.logical_and(m_depth != 0, visible_mask)

            ren_depth[m_mask] = m_depth[m_mask].astype(ren_depth.dtype)

        if not vis_rgb:
            continue

        # Combine the RGB renderings.
        if vis_rgb_resolve_visib:
            ren_rgb[m_mask] = m_rgb[m_mask].astype(ren_rgb.dtype)
        else:
            # Saturating addition of the renderings.
            ren_rgb_f = ren_rgb.astype(np.float32) + m_rgb.astype(np.float32)
            ren_rgb_f[ren_rgb_f > 255] = 255
            ren_rgb = ren_rgb_f.astype(np.uint8)

        # Draw 2D bounding box and write text info.
        obj_mask = np.sum(m_rgb > 0, axis=2)
        ys, xs = obj_mask.nonzero()
        if len(ys):
            bbox_color = (0.3, 0.3, 0.3)
            text_color = (1.0, 1.0, 1.0)
            text_size = 11

            bbox = misc.calc_2d_bbox(xs, ys, im_size)
            ren_rgb_info = draw_rect(ren_rgb_info, bbox, bbox_color)

            if 'text_info' in pose:
                text_loc = (bbox[0] + 2, bbox[1])
                ren_rgb_info = write_text_on_image(
                    ren_rgb_info, pose['text_info'], text_loc,
                    color=text_color, size=text_size)

    # Blend and save the RGB visualization.
    if vis_rgb:
        misc.ensure_dir(os.path.dirname(vis_rgb_path))

        vis_im_rgb = (0.5 * rgb.astype(np.float32) +
                      0.5 * ren_rgb.astype(np.float32) +
                      1.0 * ren_rgb_info.astype(np.float32))
        vis_im_rgb[vis_im_rgb > 255] = 255
        inout.save_im(vis_rgb_path, vis_im_rgb.astype(np.uint8), jpg_quality=95)

    # Save the image of depth differences.
    if vis_depth_diff:
        misc.ensure_dir(os.path.dirname(vis_depth_diff_path))

        # Calculate the depth difference at pixels where both depth maps are
        # valid.
        valid_mask = (depth > 0) * (ren_depth > 0)
        depth_diff = valid_mask * (ren_depth.astype(np.float32) - depth)

        # Get mask of pixels where the rendered depth is at most by the
        # tolerance delta behind the captured depth (this tolerance is used in
        # VSD).
        delta = 15
        below_delta = valid_mask * (depth_diff < delta)
        below_delta_vis = (255 * below_delta).astype(np.uint8)

        depth_diff_vis = 255 * depth_for_vis(depth_diff - depth_diff.min())

        # Pixels where the rendered depth is more than the tolerance delta
        # behind the captured depth will be cyan (their red channel is zero).
        depth_diff_vis = np.dstack(
            [below_delta_vis, depth_diff_vis, depth_diff_vis]).astype(np.uint8)

        depth_diff_vis[np.logical_not(valid_mask)] = 0

        # np.min/np.max raise on an empty array, which happens when no pixel
        # is valid in both depth maps (e.g. nothing was rendered). NaN is
        # reported in that case instead of crashing.
        depth_diff_valid = depth_diff[valid_mask]
        if depth_diff_valid.size:
            diff_min = np.min(depth_diff_valid)
            diff_max = np.max(depth_diff_valid)
            diff_mean = np.mean(depth_diff_valid)
        else:
            diff_min = diff_max = diff_mean = float('nan')

        depth_info = [
            {'name': 'min diff', 'fmt': ':.3f', 'val': diff_min},
            {'name': 'max diff', 'fmt': ':.3f', 'val': diff_max},
            {'name': 'mean diff', 'fmt': ':.3f', 'val': diff_mean},
        ]
        depth_diff_vis = write_text_on_image(depth_diff_vis, depth_info)
        inout.save_im(vis_depth_diff_path, depth_diff_vis)
def _rescale_to_unit(values, shift_min=True):
    """Returns a float64 copy of `values` rescaled for visualization.

    The minimum is subtracted first (if `shift_min`) and the result is divided
    by its maximum. The division is skipped when the maximum is not positive,
    so a constant input yields finite values instead of NaNs.
    """
    scaled = np.array(values, dtype=np.float64)
    if scaled.size == 0:
        return scaled
    if shift_min:
        scaled -= scaled.min()
    peak = scaled.max()
    if peak > 0:
        scaled /= peak
    return scaled


def visualize_gt_frag(gt_obj_ids, gt_obj_masks, gt_frag_labels,
                      gt_frag_weights, gt_frag_coords, output_size,
                      model_store, vis_prefix, vis_dir):
    """Visualizes GT fragment fields.

    Only the first fragment of every field location is considered. Four PNG
    images are saved to `vis_dir`: the fragment centers, the fragment
    coordinates scaled by the fragment sizes, their sum (the reconstructed
    coordinates) and the fragment weights. The input arrays are not modified.

    Args:
      gt_obj_ids: GT object ID's.
      gt_obj_masks: GT object instance masks (used as boolean index arrays).
      gt_frag_labels: GT fragment labels.
      gt_frag_weights: GT fragment weights.
      gt_frag_coords: GT fragment coordinates.
      output_size: Size of the output fields.
      model_store: Store of 3D object models.
      vis_dir: Where to save the visualizations.
      vis_prefix: Name prefix of the visualizations.
    """
    # Consider the first (i.e. the closest) fragment.
    frag_ind = 0

    field_shape = (output_size[1], output_size[0], 3)
    centers_vis = np.zeros(field_shape)
    coords_vis = np.zeros(field_shape)

    for gt_id, obj_id in enumerate(gt_obj_ids):
        obj_mask = gt_obj_masks[gt_id]
        obj_frag_labels = gt_frag_labels[obj_mask][:, frag_ind]

        # Centers of the fragments assigned to the pixels of the object.
        centers_vis[obj_mask] = model_store.frag_centers[obj_id][
            obj_frag_labels]

        # Fragment coordinates scaled by fragment sizes.
        frag_scales = model_store.frag_sizes[obj_id][obj_frag_labels]
        obj_frag_coords = gt_frag_coords[obj_mask][:, frag_ind, :]
        coords_vis[obj_mask] = obj_frag_coords * np.expand_dims(frag_scales, 1)

    # Reconstruct the XYZ object coordinates.
    xyz_vis = centers_vis + coords_vis

    # Normalize the visualizations. The weights are sliced from the input
    # array, so they are normalized on a copy to leave the input untouched.
    weights_vis = _rescale_to_unit(
        gt_frag_weights[:, :, frag_ind], shift_min=False)
    centers_vis = _rescale_to_unit(centers_vis)
    coords_vis = _rescale_to_unit(coords_vis)
    xyz_vis = _rescale_to_unit(xyz_vis)

    # Save the visualizations.
    inout.save_im(
        os.path.join(vis_dir, '{}_gt_frag_labels.png'.format(vis_prefix)),
        (255.0 * centers_vis).astype(np.uint8))

    inout.save_im(
        os.path.join(vis_dir, '{}_gt_frag_coords.png'.format(vis_prefix)),
        (255.0 * coords_vis).astype(np.uint8))

    inout.save_im(
        os.path.join(vis_dir, '{}_gt_frag_reconst.png'.format(vis_prefix)),
        (255.0 * xyz_vis).astype(np.uint8))

    inout.save_im(
        os.path.join(vis_dir, '{}_gt_frag_weights.png'.format(vis_prefix)),
        (255.0 * weights_vis).astype(np.uint8))
# mask_color = tuple(colors[(obj_id - 1) % len(colors)]) # find bbox top left and bottom right and cut images rs, cs = obj_mask[:,:,0].nonzero() # row and column coordinates if len(rs): bb_min = [rs.min(), cs.min()] bb_max = [rs.max(), cs.max()] rgb = rgb[bb_min[0]:bb_max[0]+1, bb_min[1]:bb_max[1]+1, :] uv = uv[bb_min[0]:bb_max[0]+1, bb_min[1]:bb_max[1]+1, :] obj_mask = obj_mask[bb_min[0]:bb_max[0]+1, bb_min[1]:bb_max[1]+1, :] # depth tbd... # Save the rendered images. out_rgb_path = out_rgb_tpath.format( out_path=out_path, obj_id=obj_id, im_id=im_id) inout.save_im(out_rgb_path, rgb) # out_depth_path = out_depth_tpath.format( # out_path=out_path, obj_id=obj_id, im_id=im_id) # inout.save_depth(out_depth_path, depth) out_uv_path = out_uv_tpath.format( out_path=out_path, obj_id=obj_id, im_id=im_id) inout.save_im(out_uv_path, uv) out_mask_path = out_mask_tpath.format( out_path=out_path, obj_id=obj_id, im_id=im_id) inout.save_im(out_mask_path, obj_mask) # Get 2D bounding box of the object model at the ground truth pose. # ys, xs = np.nonzero(depth > 0)
# For every object model, render the model under each of its symmetry
# transformations from each of the configured views and save the renderings.
for obj_id in dp_model['obj_ids']:

    # Load object model.
    misc.log('Loading 3D model of object {}...'.format(obj_id))
    model_path = dp_model['model_tpath'].format(obj_id=obj_id)
    ren.add_object(obj_id, model_path)

    # Symmetry transformations of the object model.
    poses = misc.get_symmetry_transformations(
        models_info[obj_id], p['max_sym_disc_step'])

    for pose_id, pose in enumerate(poses):

        for view_id, view in enumerate(p['views']):

            # Compose the view with the symmetry transformation, i.e. apply
            # the symmetry transformation first and the view second.
            R = view['R'].dot(pose['R'])
            t = view['R'].dot(pose['t']) + view['t']

            vis_rgb = ren.render_object(obj_id, R, t, fx, fy, cx, cy)['rgb']

            # Path to the output RGB visualization.
            vis_rgb_path = p['vis_rgb_tpath'].format(
                vis_path=p['vis_path'], dataset=p['dataset'], obj_id=obj_id,
                view_id=view_id, pose_id=pose_id)
            misc.ensure_dir(os.path.dirname(vis_rgb_path))
            inout.save_im(vis_rgb_path, vis_rgb)

misc.log('Done.')
# Load the image and its mask (true where the loaded mask image is positive).
# NOTE(review): the enclosing loop(s) are outside of this view and the nesting
# of the statements below is reconstructed - confirm.
img = inout.load_im(rgb_fn)
mask = inout.load_im(mask_files[img_id]) > 0

# Bounding box of the mask as [row_min, col_min, row_max, col_max].
# NOTE(review): np.min/np.max raise on an empty mask - confirm that the masks
# are never empty here.
vu_valid = np.where(mask)
bbox = np.array([
    np.min(vu_valid[0]),
    np.min(vu_valid[1]),
    np.max(vu_valid[0]),
    np.max(vu_valid[1])
])

# Crop the image to the bounding box and keep only the masked pixels (the
# other pixels stay black).
# NOTE(review): the slices use the maximal row/column as an exclusive upper
# bound, so the last masked row and column are not a part of the crop - confirm
# whether this is intended.
crop_img = np.zeros((bbox[2] - bbox[0], bbox[3] - bbox[1], 3), np.uint8)
img = img[bbox[0]:bbox[2], bbox[1]:bbox[3]]
crop_img[mask[bbox[0]:bbox[2],
              bbox[1]:bbox[3]]] = img[mask[bbox[0]:bbox[2], bbox[1]:bbox[3]]]

# Save the cropped image and the cropped mask (stored as a 0/255 image).
inout.save_im(crop_fn, crop_img)
inout.save_im(
    cropmask_fn,
    mask[bbox[0]:bbox[2], bbox[1]:bbox[3]].astype(np.uint8) * 255)
crop_fns.append(crop_fn)
crop_masks.append(cropmask_fn)

# Register the crop as the next instance of its object: the running overall
# index is stored at [object index, instance index].
obj_idx = model_map.index(obj_id)
instance_id = model_maxinst[obj_idx]
model_idx[obj_idx, instance_id] = overall_idx
model_maxinst[obj_idx] += 1
overall_idx += 1

# NOTE(review): presumably executed once after the loop(s) - confirm.
z_tra_mean = np.mean(z_tras)
mean_scale = z_tra_mean / mean_depth  # 0.5 / 1
mean_sigma = 0.5 * mean_scale
# Convert depth so it is in the same units as other images in the dataset. depth /= float(dp_camera['depth_scale']) # The OpenCV function was used for rendering of the training images # provided for the SIXD Challenge 2017. rgb = cv2.resize(rgb, dp_camera['im_size'], interpolation=cv2.INTER_AREA) # rgb = scipy.misc.imresize(rgb, par['cam']['im_size'][::-1], 'bicubic') # Save the rendered images. out_rgb_path = out_rgb_tpath.format(out_path=out_path, obj_id=obj_id, im_id=im_id) inout.save_im(out_rgb_path, rgb) out_depth_path = out_depth_tpath.format(out_path=out_path, obj_id=obj_id, im_id=im_id) inout.save_depth(out_depth_path, depth) # Get 2D bounding box of the object model at the ground truth pose. # ys, xs = np.nonzero(depth > 0) # obj_bb = misc.calc_2d_bbox(xs, ys, dp_camera['im_size']) scene_camera[im_id] = { 'cam_K': dp_camera['K'].flatten().tolist(), 'depth_scale': dp_camera['depth_scale'], 'view_level': int(views_level[view_id]) }
# Render the object model at the first GT pose of every reference image and
# save the rendering, its mask and the updated GT/camera info.
# NOTE(review): `i`, `obj_model` and the output containers are defined outside
# of this view - confirm the nesting level of this fragment.
for im_id in range(len(ref_gt)):
    rgb_fn = os.path.join(target_dir + "/rgb", "{:06d}.png".format(im_id))
    depth_fn = os.path.join(target_dir + "/depth", "{:06d}.png".format(im_id))
    mask_fn = os.path.join(target_dir + "/mask", "{:06d}.png".format(im_id))

    # 4x4 transformation assembled from the GT rotation and translation.
    # NOTE(review): the division by 1000 presumably converts mm to m - confirm.
    rot = ref_gt[im_id][0]['cam_R_m2c']
    tra = ref_gt[im_id][0]['cam_t_m2c'] / 1000
    tf = np.eye(4)
    tf[:3, :3] = rot
    tf[:3, 3] = tra[:, 0]

    ren.clear()
    ren.draw_model(obj_model, tf)
    img_r, depth = ren.finish()

    # Reverse the order of the color channels.
    img_r = img_r[:, :, ::-1]

    # The mask covers the pixels with a positive rendered depth.
    mask = depth > 0

    inout.save_im(rgb_fn, (img_r * 255).astype(np.uint8))
    inout.save_im(mask_fn, mask.astype(np.uint8) * 255)

    # The bounding box is stored as a placeholder of zeros.
    new_gt[im_id][0]['obj_bb'] = [0, 0, 0, 0]
    new_gt[im_id][0]['obj_id'] = int(model_ids[i])
    new_camera[im_id]['cam_K'] = np.array(camK)
    new_camera[im_id]['depth_scale'] = float(1)
    # inout.save_depth(depth_fn, depth * 65535)
    # We don't need depth for training (use only for ICP/inference).

inout.save_scene_gt(scene_gt, new_gt)
inout.save_scene_camera(scene_camera, new_camera)
def vis_object_poses_uv(poses,
                        K,
                        renderer,
                        rgb=None,
                        depth=None,
                        vis_rgb_path=None,
                        vis_depth_diff_path=None,
                        vis_rgb_resolve_visib=False,
                        vis_uv_path=None,
                        vis_mask_path=None):
    """Renders 3D object models in specified poses and saves the renderings.

    The following outputs are created:
    1. The input RGB image, saved unchanged (if vis_rgb_path is not None).
    2. The combined rendering of all objects (if vis_uv_path is not None).
       NOTE(review): the renderer output 'rgb' is presumably in UV colors -
       confirm against the renderer setup.
    3. A mask image in which pixels covered by an object hold its object ID
       (if vis_uv_path and vis_mask_path are not None). The IDs are stored as
       uint8.
    4. A Depth-difference visualization (if vis_depth_diff_path is not None).

    Bounding boxes and 'text_info' of the poses are not drawn by this function.

    :param poses: List of dictionaries, each with info about one pose:
      - 'obj_id': Object ID.
      - 'R': 3x3 ndarray with a rotation matrix.
      - 't': 3x1 ndarray with a translation vector.
    :param K: 3x3 ndarray with an intrinsic camera matrix.
    :param renderer: Instance of the Renderer class (see renderer.py).
    :param rgb: ndarray with the RGB input image.
    :param depth: ndarray with the depth input image.
    :param vis_rgb_path: Path to the output RGB image.
    :param vis_depth_diff_path: Path to the output depth-difference
      visualization.
    :param vis_rgb_resolve_visib: Whether to resolve visibility of the objects
      (i.e. only the closest object is visualized at each pixel).
    :param vis_uv_path: Path to the output combined rendering.
    :param vis_mask_path: Path to the output object-ID mask.
    :raises ValueError: If an image required by a requested output is missing,
      or if the RGB and depth images differ in size.
    """
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]

    # Indicators of visualization types.
    vis_rgb = vis_rgb_path is not None
    vis_depth_diff = vis_depth_diff_path is not None
    vis_uv = vis_uv_path is not None

    # The combined rendering is needed for the UV output as well, so it is
    # accumulated whenever either of the two outputs is requested.
    render_rgb = vis_rgb or vis_uv

    # The rendered depth is needed for the depth differences and for deciding
    # which object is the closest one at each pixel.
    use_depth = vis_depth_diff or (render_rgb and vis_rgb_resolve_visib)

    # Assert background images.
    if vis_rgb and rgb is None:
        raise ValueError(
            'RGB visualization triggered but RGB image not provided.')

    if vis_uv and rgb is None:
        raise ValueError(
            'UV visualization triggered but RGB image not provided.')

    if use_depth and depth is None:
        raise ValueError(
            'Depth visualization triggered but D image not provided.')

    # Prepare images for rendering.
    im_size = None
    ren_rgb = None
    ren_mask = None
    ren_depth = None

    if render_rgb:
        im_size = (rgb.shape[1], rgb.shape[0])
        ren_rgb = np.zeros(rgb.shape, np.uint8)

    # For the masks.
    if vis_uv:
        ren_mask = np.zeros(rgb.shape, np.uint8)

    if vis_depth_diff:
        depth_size = (depth.shape[1], depth.shape[0])
        if im_size and im_size != depth_size:
            raise ValueError('The RGB and D images must have the same size.')
        im_size = depth_size

    if use_depth:
        ren_depth = np.zeros((im_size[1], im_size[0]), np.float32)

    # Structuring element used to erode the object masks.
    erode_kernel = np.ones((5, 5), np.uint8)

    # Render the pose estimates one by one.
    for pose in poses:

        # Rendering.
        ren_out = renderer.render_object(pose['obj_id'], pose['R'], pose['t'],
                                         fx, fy, cx, cy)

        m_rgb = ren_out['rgb'] if render_rgb else None

        m_mask_ids = None
        if vis_uv:
            # Three-channel mask of the pixels with a non-zero rendering.
            fg_mask = np.sum(m_rgb > 0, axis=2) >= 1
            fg_mask = np.stack([fg_mask] * 3, axis=2)

            # Erode mask to remove 'black' border. The border type must be
            # passed by keyword: the third positional parameter of cv2.erode
            # is the destination image.
            fg_mask = cv2.erode(fg_mask.astype(np.uint8),
                                erode_kernel,
                                borderType=cv2.BORDER_CONSTANT,
                                borderValue=0).astype(np.bool_)

            # Apply eroded mask to renderings.
            m_rgb = m_rgb * fg_mask

            # Create mask with obj id.
            m_mask_ids = (fg_mask * pose['obj_id']).astype('uint8')

        m_mask = None
        if use_depth:
            m_depth = ren_out['depth']

            # Get mask of the surface parts that are closer than the
            # surfaces rendered before.
            visible_mask = np.logical_or(ren_depth == 0, m_depth < ren_depth)
            m_mask = np.logical_and(m_depth != 0, visible_mask)

            ren_depth[m_mask] = m_depth[m_mask].astype(ren_depth.dtype)

        # Combine the RGB renderings.
        if render_rgb:
            if vis_rgb_resolve_visib:
                ren_rgb[m_mask] = m_rgb[m_mask].astype(ren_rgb.dtype)
            else:
                # Saturating addition of the renderings.
                ren_rgb_f = ren_rgb.astype(np.float32) + m_rgb.astype(
                    np.float32)
                ren_rgb_f[ren_rgb_f > 255] = 255
                ren_rgb = ren_rgb_f.astype(np.uint8)

        # Combine the object-ID masks.
        if vis_uv:
            if vis_rgb_resolve_visib:
                # Same pixels as updated in the combined rendering.
                ren_mask[m_mask] = m_mask_ids[m_mask]
            else:
                # Write the ID only to pixels that are covered by this object
                # and not yet claimed by an object rendered before.
                free_px = np.logical_and(ren_mask == 0, m_mask_ids > 0)
                ren_mask[free_px] = m_mask_ids[free_px]

    # Save the RGB image (only the background, the renderings are not blended
    # into it).
    if vis_rgb:
        misc.ensure_dir(os.path.dirname(vis_rgb_path))
        inout.save_im(vis_rgb_path, rgb)

    # Save uv models and masks.
    if vis_uv:
        misc.ensure_dir(os.path.dirname(vis_uv_path))
        inout.save_im(vis_uv_path, ren_rgb.astype(np.uint8))

        if vis_mask_path is not None:
            misc.ensure_dir(os.path.dirname(vis_mask_path))
            inout.save_im(vis_mask_path, ren_mask.astype(np.uint8))

    # Save the image of depth differences.
    if vis_depth_diff:
        misc.ensure_dir(os.path.dirname(vis_depth_diff_path))

        # Calculate the depth difference at pixels where both depth maps are
        # valid.
        valid_mask = (depth > 0) * (ren_depth > 0)
        depth_diff = valid_mask * (ren_depth.astype(np.float32) - depth)

        # Mask of pixels where the rendered depth is at most by the tolerance
        # delta behind the captured depth.
        delta = 15
        below_delta = valid_mask * (depth_diff < delta)
        below_delta_vis = (255 * below_delta).astype(np.uint8)

        depth_diff_vis = 255 * depth_for_vis(depth_diff - depth_diff.min())

        # Pixels above the tolerance have a zero red channel.
        depth_diff_vis = np.dstack(
            [below_delta_vis, depth_diff_vis, depth_diff_vis]).astype(np.uint8)

        depth_diff_vis[np.logical_not(valid_mask)] = 0

        # np.min/np.max raise on an empty array, which happens when no pixel
        # is valid in both depth maps (e.g. nothing was rendered). NaN is
        # reported in that case instead of crashing.
        depth_diff_valid = depth_diff[valid_mask]
        if depth_diff_valid.size:
            diff_min = np.min(depth_diff_valid)
            diff_max = np.max(depth_diff_valid)
            diff_mean = np.mean(depth_diff_valid)
        else:
            diff_min = diff_max = diff_mean = float('nan')

        depth_info = [
            {'name': 'min diff', 'fmt': ':.3f', 'val': diff_min},
            {'name': 'max diff', 'fmt': ':.3f', 'val': diff_max},
            {'name': 'mean diff', 'fmt': ':.3f', 'val': diff_mean},
        ]
        depth_diff_vis = write_text_on_image(depth_diff_vis, depth_info)
        inout.save_im(vis_depth_diff_path, depth_diff_vis)
def visualize(
        samples, predictions, pred_poses, im_ind, crop_size, output_scale,
        model_store, renderer, vis_dir):
    """Saves visualizations of the estimates from one image.

    A grid of tiles is assembled from the input image and, depending on the
    `FLAGS.vis_*` switches, from renderings of the GT and predicted poses, the
    GT and predicted object labels and the predicted object confidences. The
    grid is saved to `vis_dir`. The GT and predicted fragment fields are saved
    to `vis_dir` as separate images (if enabled).

    Args:
      samples: Dictionary with input data.
      predictions: Dictionary with predictions.
      pred_poses: Predicted poses.
      im_ind: Image index.
      crop_size: Image crop size (width, height).
      output_scale: Scale of the model output w.r.t. the input (output / input).
      model_store: Store for 3D object models of class ObjectModelStore.
      renderer: Renderer of class bop_renderer.Renderer().
      vis_dir: Directory where the visualizations will be saved.
    """
    image_path = samples[common.IMAGE_PATH][0].decode('utf8')
    tf.logging.info('Visualization for: {}'.format(image_path))

    # Size of a visualization grid tile.
    tile_size = (300, 225)

    # Extension of the saved visualizations ('jpg', 'png', etc.).
    vis_ext = 'jpg'

    # Font settings.
    font_size = 10
    font_color = (0.8, 0.8, 0.8)

    def _with_caption(image, caption):
        # Writes a plain-text caption on a tile.
        return visualization.write_text_on_image(
            image, [{'name': '', 'val': caption, 'fmt': ':s'}],
            size=font_size, color=font_color)

    def _label_tile(label_map, caption):
        # Crops a label map to the crop size, colorizes it and makes a tile.
        cropped = np.squeeze(label_map)[:crop_size[1], :crop_size[0]]
        colored = vis.colorize_label_map(cropped)
        resized = misc.resize_image_py(colored.astype(np.uint8), tile_size)
        return _with_caption(resized, caption)

    def _poses_tile(poses, caption):
        # Renders the poses over the input image and makes a tile.
        rendering = vis.visualize_object_poses(rgb, K, poses, renderer)
        return _with_caption(
            misc.resize_image_py(rendering, tile_size), caption)

    # Intrinsics (the output intrinsics are currently not used below).
    K = samples[common.K][0]
    output_K = K * output_scale
    output_K[2, 2] = 1.0

    # Size of the output fields.
    output_size = (
        int(output_scale * crop_size[0]), int(output_scale * crop_size[1]))

    # Prefix of the visualization names.
    vis_prefix = '{:06d}'.format(im_ind)

    # Tiles for the grid visualization, starting with the input RGB image.
    rgb = np.squeeze(samples[common.IMAGE][0])
    tiles = [_with_caption(
        misc.resize_image_py(rgb, tile_size).astype(np.uint8), 'input')]

    # Visualize the ground-truth poses.
    if FLAGS.vis_gt_poses:
        gt_poses = []
        for gt_id, obj_id in enumerate(samples[common.GT_OBJ_IDS][0]):
            quat = samples[common.GT_OBJ_QUATS][0][gt_id]
            trans = samples[common.GT_OBJ_TRANS][0][gt_id]
            gt_poses.append({
                'obj_id': obj_id,
                'R': transform.quaternion_matrix(quat)[:3, :3],
                't': trans.reshape((3, 1)),
            })
        tiles.append(_poses_tile(gt_poses, 'gt poses'))

    # Visualize the estimated poses.
    if FLAGS.vis_pred_poses:
        tiles.append(_poses_tile(pred_poses, 'pred poses'))

    # Ground-truth object labels.
    if FLAGS.vis_gt_obj_labels and common.GT_OBJ_LABEL in samples:
        tiles.append(
            _label_tile(samples[common.GT_OBJ_LABEL][0], 'gt obj labels'))

    # Predicted object labels.
    if FLAGS.vis_pred_obj_labels:
        tiles.append(_label_tile(
            predictions[common.PRED_OBJ_LABEL][0], 'predicted obj labels'))

    # Predicted object confidences (one gray-scale tile per object label).
    if FLAGS.vis_pred_obj_confs:
        pred_confs = predictions[common.PRED_OBJ_CONF]
        for obj_label in range(pred_confs.shape[-1]):
            conf_map = misc.resize_image_py(
                np.array(predictions[common.PRED_OBJ_CONF][0, :, :, obj_label]),
                tile_size)
            conf_map = (255.0 * conf_map).astype(np.uint8)
            conf_rgb = np.dstack([conf_map, conf_map, conf_map])  # To RGB.
            tiles.append(visualization.write_text_on_image(
                conf_rgb, [{'name': 'cls', 'val': obj_label, 'fmt': ':d'}],
                size=font_size, color=font_color))

    # Visualization of ground-truth fragment fields.
    if FLAGS.vis_gt_frag_fields and common.GT_OBJ_IDS in samples:
        vis.visualize_gt_frag(
            gt_obj_ids=samples[common.GT_OBJ_IDS][0],
            gt_obj_masks=samples[common.GT_OBJ_MASKS][0],
            gt_frag_labels=samples[common.GT_FRAG_LABEL][0],
            gt_frag_weights=samples[common.GT_FRAG_WEIGHT][0],
            gt_frag_coords=samples[common.GT_FRAG_LOC][0],
            output_size=output_size,
            model_store=model_store,
            vis_prefix=vis_prefix,
            vis_dir=vis_dir)

    # Visualization of predicted fragment fields.
    if FLAGS.vis_pred_frag_fields:
        vis.visualize_pred_frag(
            frag_confs=predictions[common.PRED_FRAG_CONF][0],
            frag_coords=predictions[common.PRED_FRAG_LOC][0],
            output_size=output_size,
            model_store=model_store,
            vis_prefix=vis_prefix,
            vis_dir=vis_dir,
            vis_ext=vis_ext)

    # Build and save a visualization grid.
    grid_name = '{}_grid.{}'.format(vis_prefix, vis_ext)
    inout.save_im(
        os.path.join(vis_dir, grid_name), vis.build_grid(tiles, tile_size))