def unproject_rgb_to_mem(rgb_camB, pixB_T_camA, mem_coord, device=None): # rgb_camB is B x C x H x W # pixB_T_camA is B x 4 x 4 (pix_T_camR) # rgb lives in B pixel coords # we want everything in A memory coords # this puts each C-dim pixel in the rgb_camB # along a ray in the voxelgrid B, C, H, W = list(rgb_camB.shape) Y, X, Z = mem_coord.proto.shape xyz_memA = utils_basic.gridcloud3D(B, Z, Y, X, norm=False, device=device) # grid_z, grid_y, grid_x = meshgrid3D(B, Z, Y, X) # # these are B x Z x Y x X # # these represent the mem grid coordinates # # we need to convert these to pixel coordinates # x = torch.reshape(grid_x, [B, -1]) # y = torch.reshape(grid_y, [B, -1]) # z = torch.reshape(grid_z, [B, -1]) # # these are B x N # xyz_mem = torch.stack([x, y, z], dim=2) # not specifically related to Ref, I am just # converting grid to points here, irrespective # of the which cam it is associated to. xyz_camA = Mem2Ref(xyz_memA, mem_coord) xyz_pixB = utils_geom.apply_4x4(pixB_T_camA, xyz_camA) # this is just getting the z coordinate to divide x/Z, y/Z normalizer = torch.unsqueeze(xyz_pixB[:, :, 2], 2) EPS = 1e-6 xy_pixB = xyz_pixB[:, :, :2] / (EPS + normalizer) # this is B x N x 2 # this is the (floating point) pixel coordinate of each voxel x_pixB, y_pixB = xy_pixB[:, :, 0], xy_pixB[:, :, 1] # these are B x N if (0): # handwritten version values = torch.zeros([B, C, Z * Y * X], dtype=torch.float32) for b in range(B): values[b] = utils_samp.bilinear_sample_single( rgb_camB[b], x_pixB[b], y_pixB[b]) else: # native pytorch version, this makes the pixel between -1 to 1 y_pixB, x_pixB = utils_basic.normalize_grid2D(y_pixB, x_pixB, H, W) # since we want a 3d output, we need 5d tensors z_pixB = torch.zeros_like(x_pixB) xyz_pixB = torch.stack([x_pixB, y_pixB, z_pixB], axis=2) rgb_camB = rgb_camB.unsqueeze(2) xyz_pixB = torch.reshape(xyz_pixB, [B, Z, Y, X, 3]) values = F.grid_sample(rgb_camB, xyz_pixB, mode='nearest') values = torch.reshape(values, (B, C, Z, Y, X)) return values
def apply_4x4_to_vox(B_T_A, feat_A, mem_coord_As=None, mem_coord_Bs=None, already_mem=False, binary_feat=False, rigid=True): # B_T_A is B x 4 x 4 # if already_mem=False, it is a transformation between cam systems # if already_mem=True, it is a transformation between mem systems # feat_A is B x C x Z x Y x X # it represents some scene features in reference/canonical coordinates # we want to go from these coords to some target coords # since this is a backwarp, # the question to ask is: # "WHERE in the tensor do you want to sample, # to replace each voxel's current value?" # the inverse of B_T_A represents this "where"; # it transforms each coordinate in B # to the location we want to sample in A B, C, Z, Y, X = list(feat_A.shape) # we have B_T_A in input, since this follows the other utils_geom.apply_4x4 # for an apply_4x4 func, but really we need A_T_B if rigid: A_T_B = utils_geom.safe_inverse(B_T_A) else: # this op is slower but more powerful A_T_B = B_T_A.inverse() if not already_mem: cam_T_mem = mem_coord_Bs.cam_T_vox.repeat(B, 1, 1) mem_T_cam = mem_coord_As.vox_T_cam.repeat(B, 1, 1) A_T_B = utils_basic.matmul3(mem_T_cam, A_T_B, cam_T_mem) # we want to sample for each location in the bird grid xyz_B = utils_basic.gridcloud3D(B, Z, Y, X) # this is B x N x 3 # transform xyz_A = utils_geom.apply_4x4(A_T_B, xyz_B) # we want each voxel to take its value # from whatever is at these A coordinates # i.e., we are back-warping from the "A" coords # feat_B = F.grid_sample(feat_A, normalize_grid(xyz_A, Z, Y, X)) feat_B = utils_samp.resample3D(feat_A, xyz_A, binary_feat=binary_feat) # feat_B, valid = utils_samp.resample3D(feat_A, xyz_A, binary_feat=binary_feat) # return feat_B, valid return feat_B
def assemble_padded_obj_masklist(lrtlist, scorelist, Z, Y, X, coeff=1.0): # compute a binary mask in 3D for each object # we use this when computing the center-surround objectness score # lrtlist is B x N x 19 # scorelist is B x N # returns masklist shaped B x N x 1 x Z x Y x Z B, N, D = list(lrtlist.shape) assert (D == 19) masks = torch.zeros(B, N, Z, Y, X) lenlist, ref_T_objlist = utils_geom.split_lrtlist(lrtlist) # lenlist is B x N x 3 # ref_T_objlist is B x N x 4 x 4 lenlist_ = lenlist.reshape(B * N, 3) ref_T_objlist_ = ref_T_objlist.reshape(B * N, 4, 4) obj_T_reflist_ = utils_geom.safe_inverse(ref_T_objlist_) # we want a value for each location in the mem grid xyz_mem_ = utils_basic.gridcloud3D(B * N, Z, Y, X) # this is B*N x V x 3, where V = Z*Y*X xyz_ref_ = Mem2Ref(xyz_mem_, Z, Y, X) # this is B*N x V x 3 lx, ly, lz = torch.unbind(lenlist_, dim=1) # these are B*N # ref_T_obj = convert_box_to_ref_T_obj(boxes3D) # obj_T_ref = ref_T_obj.inverse() xyz_obj_ = utils_geom.apply_4x4(obj_T_reflist_, xyz_ref_) x, y, z = torch.unbind(xyz_obj_, dim=2) # these are B*N x V lx = lx.unsqueeze(1) * coeff ly = ly.unsqueeze(1) * coeff lz = lz.unsqueeze(1) * coeff # these are B*N x 1 x_valid = (x > -lx / 2.0).byte() & (x < lx / 2.0).byte() y_valid = (y > -ly / 2.0).byte() & (y < ly / 2.0).byte() z_valid = (z > -lz / 2.0).byte() & (z < lz / 2.0).byte() inbounds = x_valid.byte() & y_valid.byte() & z_valid.byte() masklist = inbounds.float() # print(masklist.shape) masklist = masklist.reshape(B, N, 1, Z, Y, X) # print(masklist.shape) # print(scorelist.shape) masklist = masklist * scorelist.view(B, N, 1, 1, 1, 1) return masklist
def crop_zoom_from_mem(mem, mem_coord, lrt, Z2, Y2, X2, sensor_camR_T_camXs=None): # mem is B x C x Z x Y x X # lrt is B x 9 it takes me from object to cam coords # sensor_camR_T_camXs takes me from cam coords to ref coords B, C, Z, Y, X = list(mem.shape) ## 2, 512, 32, 32, 32 memY, memX, memZ = mem_coord.proto.shape assert (Z == memZ) assert (Y == memY) assert (X == memX) B2, E = list(lrt.shape) ## 2, 19 # I do not particularly like this inclusion of if statement if sensor_camR_T_camXs is not None: B3, _, _ = list(sensor_camR_T_camXs.shape) assert (E == 19) assert (B == B2) if sensor_camR_T_camXs is not None: assert (B == B3) # this puts each C-dim pixel in the image # along a ray in the zoomed voxelgrid just # the grid it has no coordinate system attached to it xyz_zoom = utils_basic.gridcloud3D(B, Z2, Y2, X2, norm=False) # these represent the zoom grid coordinates # we need to convert these to mem coordinates xyz_ref = Zoom2Ref(xyz_zoom, lrt, Z2, Y2, X2, sensor_camR_T_camXs) xyz_mem = Ref2Mem(xyz_ref, mem_coord) # now it is in grid coordinates of mem # this is just like the grid sample zoom = utils_samp.sample3D(mem, xyz_mem, Z2, Y2, X2) zoom = torch.reshape(zoom, [B, C, Z2, Y2, X2]) return zoom
def get_synth_flow(unpRs, occRs, obj_lrtlist_camX0s, obj_scorelist_s, occXs, set_name, K, summ_writer, sometimes_zero=False, sometimes_real=False, do_vis=False): B, S, _, Z, Y, X = list(occXs.shape) assert (S == 2) flowX0 = get_gt_flow( obj_lrtlist_camX0s, obj_scorelist_s, utils_geom.eye_4x4s(B, S), occXs[:, 0], K=K, occ_only=False, # get the dense flow mod='X0', summ_writer=summ_writer) # we do not sample any rotations here, to keep the distribution purely # uniform across all translation angles. # (rotation ruins this, since the pivot point is at the camera) cam1_T_cam0 = [ utils_geom.get_random_rt(B, r_amount=0.0, t_amount=1.0), # large motion utils_geom.get_random_rt( B, r_amount=0.0, t_amount=0.1, # small motion sometimes_zero=sometimes_zero) ] cam1_T_cam0 = random.sample(cam1_T_cam0, k=1)[0] occ0 = occRs[:, 0] unp0 = unpRs[:, 0] occ1 = utils_vox.apply_4x4_to_vox(cam1_T_cam0, occ0) unp1 = utils_vox.apply_4x4_to_vox(cam1_T_cam0, unp0) occs = [occ0, occ1] unps = [unp0, unp1] # occ1 should be a binary thing, so let's restore that property occ1 = torch.round(occ1) if do_vis: summ_writer.summ_occs('synth/occs', occs) summ_writer.summ_unps('synth/unps', unps, occs) mem_T_cam = utils_vox.get_mem_T_ref(B, Z, Y, X) cam_T_mem = utils_vox.get_ref_T_mem(B, Z, Y, X) mem1_T_mem0 = utils_basic.matmul3(mem_T_cam, cam1_T_cam0, cam_T_mem) xyz_mem0 = utils_basic.gridcloud3D(B, Z, Y, X) xyz_mem1 = utils_geom.apply_4x4(mem1_T_mem0, xyz_mem0) xyz_mem0 = xyz_mem0.reshape(B, Z, Y, X, 3) xyz_mem1 = xyz_mem1.reshape(B, Z, Y, X, 3) flow = xyz_mem1 - xyz_mem0 # this is B x Z x Y x X x 3 flow = flow.permute(0, 4, 1, 2, 3) # this is B x 3 x Z x Y x X if do_vis: summ_writer.summ_3D_flow('synth/flow', flow, clip=2.0) occ0_e = utils_samp.backwarp_using_3D_flow(occ1, flow, binary_feat=True) unp0_e = utils_samp.backwarp_using_3D_flow(unp1, flow) if do_vis: summ_writer.summ_occs('synth/occs_stab', [occ0, occ0_e]) summ_writer.summ_unps('synth/unps_stab', [unp0, unp0_e], [occ0, occ0_e]) occs = torch.stack(occs, dim=1) unps = torch.stack(unps, dim=1) is_synth = 1 if sometimes_real and set_name == 'train': is_synth = random.randint(0, 1) occs = [occRs, occs][is_synth] unps = [unpRs, unps][is_synth] flow = [flowX0, flow][is_synth] cam1_T_cam0 = [utils_geom.eye_4x4(B), cam1_T_cam0][is_synth] return occs, unps, flow, cam1_T_cam0, is_synth
def get_gt_flow(obj_lrtlist_camRs, obj_scorelist, camRs_T_camXs, occR, K=2, occ_only=True, mod='', vis=True, summ_writer=None): # this constructs the flow field according to the given # box trajectories (obj_lrtlist_camRs) (collected from a moving camR) # and egomotion (encoded in camRs_T_camXs) # (so they do not take into account egomotion) # so, we first generate the flow for all the objects, # then in the background, put the ego flow N, B, S, D = list(obj_lrtlist_camRs.shape) assert (S == 2) # as a flow util, this expects S=2 B, _, Z, Y, X = list(occR.shape) flows = [] masks = [] for k in range(K): obj_masklistR0 = utils_vox.assemble_padded_obj_masklist( obj_lrtlist_camRs[k, :, 0:1], obj_scorelist[k, :, 0:1], Z, Y, X, coeff=1.0) # this is B x 1(N) x 1(C) x Z x Y x Z # obj_masklistR0 = obj_masklistR0.squeeze(1) # this is B x 1 x Z x Y x X obj_mask0 = obj_masklistR0.squeeze(1) # this is B x 1 x Z x Y x X camR_T_cam0 = camRs_T_camXs[:, 0] camR_T_cam1 = camRs_T_camXs[:, 1] cam0_T_camR = utils_geom.safe_inverse(camR_T_cam0) cam1_T_camR = utils_geom.safe_inverse(camR_T_cam1) # camR0_T_camR1 = camR0_T_camRs[:,1] # camR1_T_camR0 = utils_geom.safe_inverse(camR0_T_camR1) # obj_masklistA1 = utils_vox.apply_4x4_to_vox(camR1_T_camR0, obj_masklistA0) # if vis and (summ_writer is not None): # summ_writer.summ_occ('flow/obj%d_maskA0' % k, obj_masklistA0) # summ_writer.summ_occ('flow/obj%d_maskA1' % k, obj_masklistA1) if vis and (summ_writer is not None): # summ_writer.summ_occ('flow/obj%d_mask0' % k, obj_mask0) summ_writer.summ_oned('flow/obj%d_mask0' % k, torch.mean(obj_mask0, 3)) _, ref_T_objs_list = utils_geom.split_lrtlist(obj_lrtlist_camRs[k]) # this is B x S x 4 x 4 ref_T_obj0 = ref_T_objs_list[:, 0] ref_T_obj1 = ref_T_objs_list[:, 1] obj0_T_ref = utils_geom.safe_inverse(ref_T_obj0) obj1_T_ref = utils_geom.safe_inverse(ref_T_obj1) # these are B x 4 x 4 mem_T_ref = utils_vox.get_mem_T_ref(B, Z, Y, X) ref_T_mem = utils_vox.get_ref_T_mem(B, Z, Y, X) ref1_T_ref0 = utils_basic.matmul2(ref_T_obj1, obj0_T_ref) cam1_T_cam0 = utils_basic.matmul3(cam1_T_camR, ref1_T_ref0, camR_T_cam0) mem1_T_mem0 = utils_basic.matmul3(mem_T_ref, cam1_T_cam0, ref_T_mem) xyz_mem0 = utils_basic.gridcloud3D(B, Z, Y, X) xyz_mem1 = utils_geom.apply_4x4(mem1_T_mem0, xyz_mem0) xyz_mem0 = xyz_mem0.reshape(B, Z, Y, X, 3) xyz_mem1 = xyz_mem1.reshape(B, Z, Y, X, 3) # only use these displaced points within the obj mask # obj_mask03 = obj_mask0.view(B, Z, Y, X, 1).repeat(1, 1, 1, 1, 3) obj_mask0 = obj_mask0.view(B, Z, Y, X, 1) # # xyz_mem1[(obj_mask03 < 1.0).bool()] = xyz_mem0 # cond = (obj_mask03 < 1.0).float() cond = (obj_mask0 > 0.0).float() xyz_mem1 = cond * xyz_mem1 + (1.0 - cond) * xyz_mem0 flow = xyz_mem1 - xyz_mem0 flow = flow.permute(0, 4, 1, 2, 3) obj_mask0 = obj_mask0.permute(0, 4, 1, 2, 3) # flow is centered on frame0, so we use occ0 to mask it if occ_only: flow = flow * occR # if vis and k==0: if vis: summ_writer.summ_3D_flow('flow/gt_%d' % k, flow, clip=4.0) masks.append(obj_mask0) flows.append(flow) camR_T_cam0 = camRs_T_camXs[:, 0] camR_T_cam1 = camRs_T_camXs[:, 1] cam0_T_camR = utils_geom.safe_inverse(camR_T_cam0) cam1_T_camR = utils_geom.safe_inverse(camR_T_cam1) mem_T_ref = utils_vox.get_mem_T_ref(B, Z, Y, X) ref_T_mem = utils_vox.get_ref_T_mem(B, Z, Y, X) cam1_T_cam0 = utils_basic.matmul2(cam1_T_camR, camR_T_cam0) mem1_T_mem0 = utils_basic.matmul3(mem_T_ref, cam1_T_cam0, ref_T_mem) xyz_mem0 = utils_basic.gridcloud3D(B, Z, Y, X) xyz_mem1 = utils_geom.apply_4x4(mem1_T_mem0, xyz_mem0) xyz_mem0 = xyz_mem0.reshape(B, Z, Y, X, 3) xyz_mem1 = xyz_mem1.reshape(B, Z, Y, X, 3) flow = xyz_mem1 - xyz_mem0 flow = flow.permute(0, 4, 1, 2, 3) if occ_only: flow = flow * occXs[:, 0] bkg_flow = flow # allow zero motion in the bkg any_mask = torch.max(torch.stack(masks, axis=0), axis=0)[0] masks.append(1.0 - any_mask) flows.append(bkg_flow) flows = torch.stack(flows, axis=0) masks = torch.stack(masks, axis=0) masks = masks.repeat(1, 1, 3, 1, 1, 1) flow = utils_basic.reduce_masked_mean(flows, masks, dim=0) if vis: summ_writer.summ_3D_flow('flow/gt_complete', flow, clip=4.0) # flow is shaped B x 3 x D x H x W return flow