def test_raster_coordinates(scene, batch_size):
    """Test that the projected raster coordinates are correct.

    Args:
        scene: Path to the scene file.
        batch_size: Number of copies of the scene to test in a single batch.

    Returns:
        None
    """
    res = render_scene(scene)
    scene = make_torch_var(load_scene(scene))
    pos_cc = res['pos'].reshape(1, -1, res['pos'].shape[-1])
    pos_cc = pos_cc.repeat(batch_size, 1, 1)
    camera = scene['camera']
    camera['eye'] = camera['eye'].repeat(batch_size, 1)
    camera['at'] = camera['at'].repeat(batch_size, 1)
    camera['up'] = camera['up'].repeat(batch_size, 1)
    viewport = make_list2np(camera['viewport'])
    W, H = float(viewport[2] - viewport[0]), float(viewport[3] - viewport[1])

    _, px_coord = project_image_coordinates(pos_cc, camera)

    # Expected raster coordinates: one entry per pixel center, repeated per batch
    xp, yp = np.meshgrid(np.linspace(0, W - 1, int(W)), np.linspace(0, H - 1, int(H)))
    xp = xp.ravel()[None, ...].repeat(batch_size, axis=0)
    yp = yp.ravel()[None, ...].repeat(batch_size, axis=0)

    px_coord = torch.round(px_coord - 0.5).long()
    np.testing.assert_array_almost_equal(xp, get_data(px_coord[..., 0]))
    np.testing.assert_array_almost_equal(yp, get_data(px_coord[..., 1]))
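# Illustrative invocation sketch (the scene path below is an assumption, not a
# file known to exist in this repository): the test is parameterized over a
# scene file and a batch size, e.g.
#
#   test_raster_coordinates('scenes/example_scene.json', batch_size=2)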
def project_image_coordinates(surfels, camera):
    """Project surfels given in world coordinates onto the camera's projection plane.

    Args:
        surfels: [batch_size, num_surfels, pos]
        camera: {'eye': [batch_size, ...], 'at': [batch_size, ...],
                 'up': [batch_size, ...], 'viewport': [0, 0, W, H],
                 'fovy': <radians>, 'focal_length': <scalar>}

    Returns:
        px_idx: Flat destination indices of dimensions [batch_size, H*W]. The
            range of possible indices is 0 to W*H (inclusive): the last, extra
            index serves as a "dump" for any surfel that falls outside the
            camera's field of view.
        px_coord: Raster-space coordinates of the surfels, with the depth in
            the last channel.
    """
    surfels_plane = project_surfels(surfels, camera)

    # Rasterize: map projection-plane coordinates to pixel coordinates.
    # Use integer W and H so the flat indices below stay long-typed.
    viewport = make_list2np(camera['viewport'])
    W, H = int(viewport[2] - viewport[0]), int(viewport[3] - viewport[1])
    aspect_ratio = float(W) / float(H)

    fovy = make_list2np(camera['fovy'])
    focal_length = make_list2np(camera['focal_length'])
    h = np.tan(fovy / 2) * 2 * focal_length
    w = h * aspect_ratio

    px_coord = torch.zeros_like(surfels_plane)
    px_coord[..., 2] = surfels_plane[..., 2]  # Make sure to also transmit the new depth
    px_coord[..., :2] = surfels_plane[..., :2] * tch_var_f(
        [-(W - 1) / w, (H - 1) / h]).unsqueeze(-2) + tch_var_f(
        [W / 2., H / 2.]).unsqueeze(-2)
    px_coord_idx = torch.round(px_coord - 0.5).long()
    px_idx = px_coord_idx[..., 1] * W + px_coord_idx[..., 0]

    # Index used when a surfel projects outside of the camera's field of view
    max_idx = W * H
    max_idx_tensor = tch_var_l([max_idx])

    # Map out-of-bounds pixels to the last (extra) index
    mask = (px_coord_idx[..., 1] < 0) | (px_coord_idx[..., 0] < 0) | (
        px_coord_idx[..., 1] >= H) | (px_coord_idx[..., 0] >= W)
    px_idx = torch.where(mask, max_idx_tensor, px_idx)

    return px_idx, px_coord
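# Illustrative usage sketch (all camera values below are assumptions for a tiny
# 2x2 viewport, not defaults from this module): project one batch of two
# surfels and recover their flat pixel indices and raster coordinates.
#
#   camera = {'eye': tch_var_f([[0., 0., 10.]]),
#             'at': tch_var_f([[0., 0., 0.]]),
#             'up': tch_var_f([[0., 1., 0.]]),
#             'viewport': [0, 0, 2, 2],
#             'fovy': np.deg2rad(45.),
#             'focal_length': 1.}
#   surfels = tch_var_f([[[0.1, 0.2, 0.5], [-0.3, 0.1, 1.0]]])
#   px_idx, px_coord = project_image_coordinates(surfels, camera)
#   # px_idx: [1, 2] flat indices in [0, 4] (4 is the out-of-view "dump" index)
#   # px_coord: [1, 2, 3] raster x, y and depth per surfel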
def projection_renderer_differentiable(surfels, rgb, camera, rotated_image=None, blur_size=0.15):
    """Project surfels given in world coordinates onto the camera's projection
    plane in a way that is differentiable w.r.t. depth. This is achieved by
    interpolating the surfel values with a Gaussian filter.

    Args:
        surfels: [batch_size, num_surfels, pos]
        rgb: [batch_size, num_surfels, D-channel data] or [batch_size, H, W, D-channel data]
        camera: {'eye': [batch_size, ...], 'at': [batch_size, ...], 'up': [batch_size, ...],
                 'viewport': [0, 0, W, H], 'fovy': <radians>}
        rotated_image: [batch_size, num_surfels, D-channel data] or
            [batch_size, H, W, D-channel data]. Image to mix in with the result
            of the rotation.
        blur_size: Controls the std of the Gaussian used for filtering
            (sigma = blur_size * W / 6). As a rule of thumb, surfels within a
            radius of 3 * sigma around a pixel contribute to that pixel in the
            final image.

    Returns:
        RGB image of dimensions [batch_size, H, W, 3] from the projected
        surfels, and the [batch_size, H, W, 1] sum of Gaussian weights per pixel.
    """
    px_idx, px_coord = project_image_coordinates(surfels, camera)
    viewport = make_list2np(camera['viewport'])
    W = int(viewport[2] - viewport[0])
    H = int(viewport[3] - viewport[1])
    rgb_reshaped = rgb.view(rgb.size(0), -1, rgb.size(-1))

    # Perform a weighted average of points surrounding a pixel using a Gaussian
    # filter. Very similar to the idea in this paper: https://arxiv.org/pdf/1810.09381.pdf
    x, y = np.meshgrid(np.linspace(0, W - 1, W) + 0.5, np.linspace(0, H - 1, H) + 0.5)
    x, y = (tch_var_f(x.ravel()).repeat(surfels.size(0), 1),
            tch_var_f(y.ravel()).repeat(surfels.size(0), 1))
    x, y = x.unsqueeze(-1), y.unsqueeze(-1)
    xp, yp = px_coord[..., 0].unsqueeze(-2), px_coord[..., 1].unsqueeze(-2)
    sigma = blur_size * rgb.size(-2) / 6
    scale = torch.exp((-(xp - x) ** 2 - (yp - y) ** 2) / (2 * sigma ** 2))
    mask = scale.sum(-1)

    if rotated_image is not None:
        rotated_image = rotated_image.view(*rgb_reshaped.size())
        # Alternative: normalize by the total weight, including the rotated image:
        # out = (rotated_image_weight * rotated_image
        #        + torch.sum(scale.unsqueeze(-1) * rgb_reshaped.unsqueeze(-3), -2)) \
        #       / (scale.sum(-1) + rotated_image_weight + 1e-10).unsqueeze(-1)
        # Broadcast the per-pixel weight sum over the data channels
        out = torch.sum(scale.unsqueeze(-1) * rgb_reshaped.unsqueeze(-3), -2) \
            + rotated_image * (1 - mask.unsqueeze(-1))
    else:
        out = torch.sum(scale.unsqueeze(-1) * rgb_reshaped.unsqueeze(-3), -2) \
            / (mask + 1e-10).unsqueeze(-1)

    return out.view(*rgb.size()), mask.view(*rgb.size()[:-1], 1)
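# Illustrative differentiability sketch (`target_image` is an assumed tensor):
# gradients flow to the surfel positions through the Gaussian weights, so the
# renderer can be used inside a reconstruction loss, e.g.
#
#   image, weights = projection_renderer_differentiable(surfels, rgb, camera)
#   loss = ((image - target_image) ** 2).mean()
#   loss.backward()  # d(loss)/d(surfels) is well-defined thanks to the soft splatting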
def projection_renderer(surfels, rgb, camera):
    """Project surfels given in world coordinates onto the camera's projection plane.

    Args:
        surfels: [batch_size, num_surfels, pos]
        rgb: [batch_size, num_surfels, D-channel data] or [batch_size, H, W, D-channel data]
        camera: {'eye': [batch_size, ...], 'at': [batch_size, ...], 'up': [batch_size, ...],
                 'viewport': [0, 0, W, H], 'fovy': <radians>}

    Returns:
        RGB image of dimensions [batch_size, H, W, 3] from the projected
        surfels, and a mask marking the covered pixels.
    """
    px_idx, _ = project_image_coordinates(surfels, camera)
    rgb_reshaped = rgb.view(rgb.size(0), -1, rgb.size(-1))
    rgb_out, mask = scatter_mean_dim0(rgb_reshaped, px_idx.long())
    return rgb_out.reshape(rgb.shape), mask.reshape(rgb.shape)
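# `scatter_mean_dim0` is defined elsewhere in this codebase. Below is a minimal
# sketch of the behavior assumed by `projection_renderer` (the name, the
# `num_pixels` argument, and the exact return shapes are assumptions, not the
# real helper): average every surfel landing on the same flat pixel index,
# keeping an extra "dump" bin for out-of-view surfels that is dropped.

def _scatter_mean_dim0_sketch(data, idx, num_pixels):
    """data: [B, N, D]; idx: [B, N] flat indices in [0, num_pixels], where the
    last bin collects out-of-view surfels and is dropped from the output."""
    B, N, D = data.shape
    summed = torch.zeros(B, num_pixels + 1, D, device=data.device)
    counts = torch.zeros(B, num_pixels + 1, 1, device=data.device)
    summed.scatter_add_(1, idx.unsqueeze(-1).expand(-1, -1, D), data)
    counts.scatter_add_(1, idx.unsqueeze(-1), torch.ones(B, N, 1, device=data.device))
    mask = (counts[:, :num_pixels] > 0).float().expand(-1, -1, D)
    mean = summed[:, :num_pixels] / counts[:, :num_pixels].clamp(min=1)
    return mean, mask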
def projection_reverse_renderer(rgb, in_pos_wc, out_pos_wc, camera1, camera2,
                                rotated_image=None, compute_new_depth=False,
                                depth_epsilon=1e-1, mask_dropout=0):
    """Compute the rotated image in the opposite direction: take the surfel
    positions of the output image (out_pos_wc), use them to find the
    corresponding (u, v) positions on the input image (rgb), and sample those
    positions with bilinear interpolation.

    Args:
        rgb: [batch_size, H, W, D-channel data] input image.
        in_pos_wc: World coordinates of the input image's surfels.
        out_pos_wc: World coordinates of the output image's surfels.
        camera1: Camera of the input image.
        camera2: Camera of the output image.
        rotated_image: Optional image to mix in where the projection has no data.
        compute_new_depth: Whether to also output the depth as seen by camera2.
        depth_epsilon: Tolerance used when comparing depths for occlusion masking.
        mask_dropout: If > 0, apply dropout with this probability to the mask.

    Returns:
        The sampled image, and a dict with the mask, the unmixed image, and
        optionally the new depth.
    """
    viewport = make_list2np(camera1['viewport'])
    W = int(viewport[2] - viewport[0])
    H = int(viewport[3] - viewport[1])

    _, px_coord = project_image_coordinates(out_pos_wc, camera1)
    px_coord = px_coord.view(*rgb.size()[:-1], 3)
    normalized_px_coord = px_coord[..., :2] / torch.tensor(
        [W, H], dtype=torch.float, device=px_coord.device) * 2 - 1
    out = torch.nn.functional.grid_sample(
        rgb.permute(0, 3, 1, 2), normalized_px_coord).permute(0, 2, 3, 1)

    # TODO: this is a hard mask. Should we make it soft by using the bilinear
    # weights (at the edges only, or everywhere)? An alternative would be to
    # try dropout on the mask.
    # NOTE: use 0.5 to account for bilinear interpolation
    mask = (px_coord[..., 1] < 0.5) | (px_coord[..., 0] < 0.5) | (
        px_coord[..., 1] >= H - 0.5) | (px_coord[..., 0] >= W - 0.5)
    mask = 1 - mask.view(*rgb.size()[:-1], 1).float()

    # Use depth to mask out pixels that end up at the same location. This could
    # be done by averaging the depth of neighboring pixels, but instead we use
    # grid_sample to interpolate (twice), which turns out to be more efficient.
    depth = px_coord[..., 2].unsqueeze(-1)  # (1) depth from cam1, ordered as cam2 pixels
    # Project the points from the cam1 (top) image to the bottom image space
    _, px_coord_out = project_image_coordinates(in_pos_wc, camera2)
    px_coord_out = px_coord_out.view(*rgb.size()[:-1], 3)
    normalized_px_coord_out = px_coord_out[..., :2] / torch.tensor(
        [W, H], dtype=torch.float, device=px_coord_out.device) * 2 - 1
    # Sample the depth from cam1 (ordered as cam2 pixels) using the points from
    # the cam1 (top) image; we get depth from cam1 (ordered as cam1 pixels)
    depth_sampled_in = torch.nn.functional.grid_sample(
        depth.permute(0, 3, 1, 2), normalized_px_coord_out)
    # We then need to flip the whole thing to get a mask in cam2 space (ordered
    # as cam2 pixels): sample the depth from cam1 (ordered as cam1 pixels)
    # using the points from the cam2 (bottom) image; we get depth from cam1
    # (ordered as cam2 pixels)
    depth_sampled_out = torch.nn.functional.grid_sample(
        depth_sampled_in, normalized_px_coord).permute(0, 2, 3, 1)  # (2)
    # Comparing the two depth images from cam1 (ordered as cam2 pixels),
    # (1) and (2), we can build an occlusion mask
    mask = mask * (depth <= depth_sampled_out + depth_epsilon).float()

    if mask_dropout > 0:
        mask = torch.nn.functional.dropout(mask, mask_dropout, training=True)

    proj_out = {'mask': mask, 'image1': out}

    if rotated_image is not None:
        # mask = mask.detach() if detach_mask else mask
        # NOTE: right now it doesn't even matter, since the mask is non-differentiable
        out = mask * out + (1 - mask) * rotated_image

    if compute_new_depth:
        depth2 = px_coord_out[..., 2].unsqueeze(-1)
        proj_out['depth'] = torch.nn.functional.grid_sample(
            depth2.permute(0, 3, 1, 2), normalized_px_coord).permute(0, 2, 3, 1)

    return out, proj_out
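# Illustrative usage sketch (tensor names and shapes assumed, not from this
# module): warp camera1's view onto camera2's pixel grid, reusing a previous
# estimate where the warp has no data.
#
#   out, proj = projection_reverse_renderer(rgb1, pos1_wc, pos2_wc,
#                                           camera1, camera2,
#                                           rotated_image=prev_estimate,
#                                           compute_new_depth=True)
#   # proj['mask'] marks pixels that survive the bounds and occlusion checks
#   # (hard, non-differentiable); proj['depth'] is the depth seen by camera2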
def projection_renderer_differentiable_fast(surfels, rgb, camera,
                                            rotated_image=None, blur_size=0.15,
                                            use_depth=True, use_center_dist=True,
                                            compute_new_depth=False,
                                            blur_rotated_image=True,
                                            detach_mask=False, detach_mask2=False,
                                            detach_depth_merge=False):
    """Project surfels given in world coordinates onto the camera's projection
    plane in a way that is differentiable w.r.t. depth. This is achieved by
    splatting the surfel values with bilinear interpolation, then blurring the
    output image with a Gaussian filter.

    Args:
        surfels: [batch_size, num_surfels, pos] - world coordinates
        rgb: [batch_size, num_surfels, D-channel data] or [batch_size, H, W, D-channel data]
        camera: {'eye': [batch_size, ...], 'at': [batch_size, ...], 'up': [batch_size, ...],
                 'viewport': [0, 0, W, H], 'fovy': <radians>}
        rotated_image: [batch_size, num_surfels, D-channel data] or
            [batch_size, H, W, D-channel data]. Image to mix in with the result
            of the rotation.
        blur_size: (between 0 and 1) Size of the Gaussian kernel as a fraction
            of the width of the input image. The standard deviation of the
            Gaussian kernel is derived from this value.
        use_depth: Whether to weight surfels landing on the same output pixel
            by their depth relative to the camera.
        use_center_dist: Whether to weight surfels landing on the same output
            pixel by their distance to the nearest pixel center.
        compute_new_depth: Whether to compute and output the depth as seen by
            the new camera.
        blur_rotated_image: Whether to blur the `rotated_image` before merging
            it with the output image. Set to False if it is already blurred.
        detach_mask: Whether to detach the mask m in `I_top + (1 - m) * I_bottom`.
        detach_mask2: Alternative to `detach_mask`; whether to detach the mask
            m in `m * (I_top / m') + (1 - m) * I_bottom`.
        detach_depth_merge: Whether to detach the depth used for weighting.

    Returns:
        RGB image of dimensions [batch_size, H, W, 3] from the projected
        surfels, and a dict with the soft mask, the unmixed image, and
        optionally the new depth.
    """
    _, px_coord = project_image_coordinates(surfels, camera)
    viewport = make_list2np(camera['viewport'])
    W = int(viewport[2] - viewport[0])
    H = int(viewport[3] - viewport[1])
    rgb_in = rgb.view(rgb.size(0), -1, rgb.size(-1))

    # First create a uniform grid through bilinear interpolation, then blur the
    # output image by convolving it with a Gaussian kernel.
    # Idea from this paper: https://arxiv.org/pdf/1810.09381.pdf
    # Tensorflow implementation: https://github.com/eldar/differentiable-point-clouds/blob/master/dpc/util/point_cloud.py#L60
    px_idx = torch.floor(px_coord[..., :2] - 0.5).long()
    # Difference to the nearest pixel center on the top left
    x = (px_coord[..., 0] - 0.5) - px_idx[..., 0].float()
    y = (px_coord[..., 1] - 0.5) - px_idx[..., 1].float()
    x, y = x.unsqueeze(-1), y.unsqueeze(-1)

    def flat_px(px):
        """Flatten the pixel locations and map out-of-bounds pixels to the extra index."""
        out = px[..., 1] * W + px[..., 0]
        max_idx = tch_var_l([W * H])
        mask = (px[..., 1] < 0) | (px[..., 0] < 0) | (px[..., 1] >= H) | (px[..., 0] >= W)
        out = torch.where(mask, max_idx, out)
        return out

    depth = px_coord[..., 2].detach() if detach_depth_merge else px_coord[..., 2]
    # Squared distance to the nearest pixel center
    center_dist_2 = (x ** 2 + y ** 2).squeeze(-1)

    # Splat each surfel onto its four neighboring pixels with bilinear weights
    rgb_out = scatter_weighted_blended_oit(rgb_in * (1 - x) * (1 - y), depth, center_dist_2,
                                           flat_px(px_idx + tch_var_l([0, 0])),
                                           use_depth=use_depth, use_center_dist=use_center_dist)
    rgb_out += scatter_weighted_blended_oit(rgb_in * (1 - x) * y, depth, center_dist_2,
                                            flat_px(px_idx + tch_var_l([0, 1])),
                                            use_depth=use_depth, use_center_dist=use_center_dist)
    rgb_out += scatter_weighted_blended_oit(rgb_in * x * (1 - y), depth, center_dist_2,
                                            flat_px(px_idx + tch_var_l([1, 0])),
                                            use_depth=use_depth, use_center_dist=use_center_dist)
    rgb_out += scatter_weighted_blended_oit(rgb_in * x * y, depth, center_dist_2,
                                            flat_px(px_idx + tch_var_l([1, 1])),
                                            use_depth=use_depth, use_center_dist=use_center_dist)

    # Accumulate the same bilinear weights to form a soft coverage mask
    soft_mask = scatter_weighted_blended_oit((1 - x) * (1 - y), depth, center_dist_2,
                                             flat_px(px_idx + tch_var_l([0, 0])),
                                             use_depth=use_depth, use_center_dist=use_center_dist)
    soft_mask += scatter_weighted_blended_oit((1 - x) * y, depth, center_dist_2,
                                              flat_px(px_idx + tch_var_l([0, 1])),
                                              use_depth=use_depth, use_center_dist=use_center_dist)
    soft_mask += scatter_weighted_blended_oit(x * (1 - y), depth, center_dist_2,
                                              flat_px(px_idx + tch_var_l([1, 0])),
                                              use_depth=use_depth, use_center_dist=use_center_dist)
    soft_mask += scatter_weighted_blended_oit(x * y, depth, center_dist_2,
                                              flat_px(px_idx + tch_var_l([1, 1])),
                                              use_depth=use_depth, use_center_dist=use_center_dist)

    if compute_new_depth:
        depth_in = depth.unsqueeze(-1)
        depth_out = scatter_weighted_blended_oit(depth_in * (1 - x) * (1 - y), depth, center_dist_2,
                                                 flat_px(px_idx + tch_var_l([0, 0])),
                                                 use_depth=use_depth, use_center_dist=use_center_dist)
        depth_out += scatter_weighted_blended_oit(depth_in * (1 - x) * y, depth, center_dist_2,
                                                  flat_px(px_idx + tch_var_l([0, 1])),
                                                  use_depth=use_depth, use_center_dist=use_center_dist)
        depth_out += scatter_weighted_blended_oit(depth_in * x * (1 - y), depth, center_dist_2,
                                                  flat_px(px_idx + tch_var_l([1, 0])),
                                                  use_depth=use_depth, use_center_dist=use_center_dist)
        depth_out += scatter_weighted_blended_oit(depth_in * x * y, depth, center_dist_2,
                                                  flat_px(px_idx + tch_var_l([1, 1])),
                                                  use_depth=use_depth, use_center_dist=use_center_dist)
        depth_out = depth_out.view(*rgb.size()[:-1], 1)

    rgb_out = rgb_out.view(*rgb.size())
    soft_mask = soft_mask.view(*rgb.size()[:-1], 1)

    # Blur the rgb and mask images
    rgb_out = blur(rgb_out.permute(0, 3, 1, 2), blur_size).permute(0, 2, 3, 1)
    soft_mask = blur(soft_mask.permute(0, 3, 1, 2), blur_size).permute(0, 2, 3, 1)

    # There seems to be a bug in PyTorch where a single division by 0 in a
    # tensor can make the whole gradient NaN; might be related to this issue:
    # https://github.com/pytorch/pytorch/issues/4132
    # Because of this behavior, one can't simply do `out / out_mask` inside `torch.where`.
    soft_mask_nonzero = torch.where(soft_mask > 0, soft_mask,
                                    torch.ones_like(soft_mask)) + 1e-20

    rgb_out_normalized = torch.where(soft_mask > 0, rgb_out / soft_mask_nonzero, rgb_out)

    # If an additional image is passed in, merge it using the soft mask
    if rotated_image is not None:
        if blur_rotated_image:
            rotated_image = blur(rotated_image.permute(0, 3, 1, 2), blur_size).permute(0, 2, 3, 1)
        if detach_mask:
            out = torch.where(soft_mask > 1,
                              rgb_out / soft_mask_nonzero.detach(),
                              rgb_out + rotated_image * (1 - soft_mask.detach()))
        elif detach_mask2:
            soft_mask_detached = soft_mask.detach()
            out = soft_mask_detached * rgb_out_normalized + (1 - soft_mask_detached) * rotated_image
        else:
            out = torch.where(soft_mask > 1,
                              rgb_out / soft_mask_nonzero,
                              rgb_out + rotated_image * (1 - soft_mask))
    else:
        out = rgb_out_normalized

    # Other things to output
    proj_out = {'mask': soft_mask, 'image1': rgb_out_normalized}
    if compute_new_depth:
        depth_out = torch.where(soft_mask > 0, depth_out / soft_mask_nonzero, depth_out)
        proj_out['depth'] = depth_out

    return out, proj_out
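# `scatter_weighted_blended_oit` and `blur` are defined elsewhere in this
# codebase. Below is a minimal sketch of the scatter step assumed above,
# loosely following weighted-blended order-independent transparency
# (McGuire & Bavoil, 2013): contributions landing in the same pixel bin are
# scatter-added after being scaled by a weight that favors nearby surfels.
# The exact weighting function, the extra `num_pixels` argument, and all names
# here are assumptions, not this module's implementation.

def _scatter_weighted_blended_oit_sketch(data, depth, center_dist_2, flat_idx,
                                         num_pixels, use_depth=True,
                                         use_center_dist=True):
    """data: [B, N, D]; depth, center_dist_2, flat_idx: [B, N].
    Returns [B, num_pixels, D]; the extra bin at index num_pixels collects
    out-of-view surfels and is dropped."""
    B, N, D = data.shape
    weight = torch.ones_like(depth)
    if use_depth:
        # Emphasize surfels closer to the camera (illustrative falloff)
        weight = weight / (1. + depth.clamp(min=0.)) ** 4
    if use_center_dist:
        # Emphasize surfels near the pixel center
        weight = weight * (1. - center_dist_2).clamp(min=0.)
    out = torch.zeros(B, num_pixels + 1, D, device=data.device)
    out.scatter_add_(1, flat_idx.unsqueeze(-1).expand(-1, -1, D),
                     data * weight.unsqueeze(-1))
    return out[:, :num_pixels]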