class TouchEmbed2D(nn.Module):
    def __init__(self):
        """The idea here is that the featnet is frozen; using some number X of images it
        forms the features for the object, which are then used to fetch the nearby
        embeddings for metric learning.
        """
        super(TouchEmbed2D, self).__init__()
        if hyp.do_feat:
            print('using the visual feat net to generate visual feature tensor')
            self.featnet = FeatNet(input_dim=4)  # passing occXs and occXs * unpXs

        if hyp.do_touch_feat:
            print('using 2d backbone network to generate features from sensor depth image')
            self.backbone_2D = VGGNet.Feat2d(touch_emb_dim=hyp.feat_dim, do_bn=hyp.do_bn)
            # if I need to add the 3d encoder-decoder I need to add the next line
            # self.touch_featnet = FeatNet(input_dim=1)  # just passing occRs here

        if hyp.do_touch_forward:
            # now I need to pass this through the bottle3D architecture to predict 1-d vectors
            print('using context net to turn 3d context grid into 1d feature tensor')
            self.context_net = bottle3D.Bottle3D(in_channel=hyp.feat_dim,
                                                 pred_dim=hyp.feat_dim)

        if hyp.do_touch_occ:
            print('this should not be turned on')
            from IPython import embed; embed()
            self.touch_occnet = OccNet()

        # if hyp.do_touch_embML:
        #     self.touch_embnet3D = EmbNet3D()  # metric learning for making touch feature tensor same as visual feature tensor

        if hyp.do_freeze_feat:
            print('freezing visual features')
            self.featnet = self.featnet.eval()
            assert self.featnet.training == False, "since I am not training FeatNet it should be false"

        if hyp.do_freeze_touch_feat:
            print('freezing backbone_2D')
            self.backbone_2D = self.backbone_2D.eval()

        if hyp.do_freeze_touch_forward:
            print('freezing context net')
            self.context_net = self.context_net.eval()

        # hyperparams for embedding training; not really needed as of now
        if hyp.do_touch_embML:
            print('Instantiating Contrastive ML loss')
            self.num_pos_samples = hyp.emb_3D_num_samples
            self.batch_k = 2
            self.n_negatives = 1
            assert (self.num_pos_samples > 0)
            self.sampler = utils_misc.DistanceWeightedSampling(batch_k=self.batch_k,
                                                               normalize=False,
                                                               num_neg_samples=self.n_negatives)
            self.criterion = utils_misc.MarginLoss()
            self.beta = 1.2

        if hyp.do_moc or hyp.do_eval_recall:
            print('Instantiating MOC net')
            self.key_touch_featnet = VGGNet.Feat2d(touch_emb_dim=hyp.feat_dim,
                                                   do_bn=hyp.do_bn)
            key_weights = self.backbone_2D.state_dict()
            self.key_touch_featnet.load_state_dict(key_weights)

            self.key_context_net = bottle3D.Bottle3D(in_channel=hyp.feat_dim,
                                                     pred_dim=hyp.feat_dim)
            key_context_weights = self.context_net.state_dict()
            self.key_context_net.load_state_dict(key_context_weights)

            # check that the two networks indeed have the same weights
            p1 = get_params(self.backbone_2D)
            p2 = get_params(self.key_touch_featnet)
            assert check_equal(p1, p2),\
                "initially both the touch networks should have same weights"

            cp1 = get_params(self.context_net)
            cp2 = get_params(self.key_context_net)
            assert check_equal(cp1, cp2),\
                "initially both the context networks should have same weights"

            self.moc_ml_net = MOCTrainingTouch(dict_len=hyp.dict_len,
                                               num_neg_samples=hyp.num_neg_samples)

    def forward(self, feed, moc_init_done=False, debug=False):
        summ_writer = utils_improc.Summ_writer(
            writer=feed['writer'],
            global_step=feed['global_step'],
            set_name=feed['set_name'],
            fps=8)
        writer = feed['writer']
        global_step = feed['global_step']
        total_loss = torch.tensor(0.0).cuda()

        ### ... All things sensor ... ###
        sensor_rgbs = feed['sensor_imgs']
        sensor_depths = feed['sensor_depths']
        center_sensor_H, center_sensor_W = sensor_depths[0][0].shape[-2] // 2, sensor_depths[0][0].shape[-1] // 2
        ### ... All things sensor end ... ###

        # 1. Form the memory tensor using the feat net and the visual images.
        # check what all you need for this and create only those things

        ## .... Input images .... ##
        rgb_camRs = feed['rgb_camRs']
        rgb_camXs = feed['rgb_camXs']
        ## .... Input images end .... ##

        ## ... Hyperparams ... ##
        B, H, W, V, S = hyp.B, hyp.H, hyp.W, hyp.V, hyp.S
        __p = lambda x: pack_seqdim(x, B)
        __u = lambda x: unpack_seqdim(x, B)
        PH, PW = hyp.PH, hyp.PW
        Z, Y, X = hyp.Z, hyp.Y, hyp.X
        Z2, Y2, X2 = int(Z/2), int(Y/2), int(X/2)
        ## ... Hyperparams end ... ##

        ## .... VISUAL TRANSFORMS BEGIN .... ##
        pix_T_cams = feed['pix_T_cams']
        pix_T_cams_ = __p(pix_T_cams)
        origin_T_camRs = feed['origin_T_camRs']
        origin_T_camRs_ = __p(origin_T_camRs)
        origin_T_camXs = feed['origin_T_camXs']
        origin_T_camXs_ = __p(origin_T_camXs)

        camRs_T_camXs_ = torch.matmul(utils_geom.safe_inverse(origin_T_camRs_), origin_T_camXs_)
        camXs_T_camRs_ = utils_geom.safe_inverse(camRs_T_camXs_)
        camRs_T_camXs = __u(camRs_T_camXs_)
        camXs_T_camRs = __u(camXs_T_camRs_)

        pix_T_cams_ = utils_geom.pack_intrinsics(pix_T_cams_[:, 0, 0], pix_T_cams_[:, 1, 1],
                                                 pix_T_cams_[:, 0, 2], pix_T_cams_[:, 1, 2])
        pix_T_camRs_ = torch.matmul(pix_T_cams_, camXs_T_camRs_)
        pix_T_camRs = __u(pix_T_camRs_)
        ## ... VISUAL TRANSFORMS END ... ##

        ## ... SENSOR TRANSFORMS BEGIN ... ##
        sensor_origin_T_camXs = feed['sensor_extrinsics']
        sensor_origin_T_camXs_ = __p(sensor_origin_T_camXs)
        sensor_origin_T_camRs = feed['sensor_origin_T_camRs']
        sensor_origin_T_camRs_ = __p(sensor_origin_T_camRs)

        sensor_camRs_T_origin_ = utils_geom.safe_inverse(sensor_origin_T_camRs_)
        sensor_camRs_T_camXs_ = torch.matmul(utils_geom.safe_inverse(sensor_origin_T_camRs_),
                                             sensor_origin_T_camXs_)
        sensor_camXs_T_camRs_ = utils_geom.safe_inverse(sensor_camRs_T_camXs_)
        sensor_camRs_T_camXs = __u(sensor_camRs_T_camXs_)
        sensor_camXs_T_camRs = __u(sensor_camXs_T_camRs_)

        sensor_pix_T_cams = feed['sensor_intrinsics']
        sensor_pix_T_cams_ = __p(sensor_pix_T_cams)
        sensor_pix_T_cams_ = utils_geom.pack_intrinsics(sensor_pix_T_cams_[:, 0, 0], sensor_pix_T_cams_[:, 1, 1],
                                                        sensor_pix_T_cams_[:, 0, 2], sensor_pix_T_cams_[:, 1, 2])
        sensor_pix_T_camRs_ = torch.matmul(sensor_pix_T_cams_, sensor_camXs_T_camRs_)
        sensor_pix_T_camRs = __u(sensor_pix_T_camRs_)
        ## .... SENSOR TRANSFORMS END .... ##

        ## .... Visual input point clouds .... ##
        xyz_camXs = feed['xyz_camXs']
        xyz_camXs_ = __p(xyz_camXs)
        xyz_camRs_ = utils_geom.apply_4x4(camRs_T_camXs_, xyz_camXs_)  # (40, 4, 4) (B*S, N, 3)
        xyz_camRs = __u(xyz_camRs_)
        assert all([torch.allclose(xyz_camR, inp_xyz_camR) for xyz_camR, inp_xyz_camR in zip(
            xyz_camRs, feed['xyz_camRs']
        )]), "computation of xyz_camR here and those computed in input do not match"
        ## .... Visual input point clouds end .... ##

        ## ... Sensor input point clouds ... ##
        sensor_xyz_camXs = feed['sensor_xyz_camXs']
        sensor_xyz_camXs_ = __p(sensor_xyz_camXs)
        sensor_xyz_camRs_ = utils_geom.apply_4x4(sensor_camRs_T_camXs_, sensor_xyz_camXs_)
        sensor_xyz_camRs = __u(sensor_xyz_camRs_)
        assert all([torch.allclose(sensor_xyz, inp_sensor_xyz) for sensor_xyz, inp_sensor_xyz in zip(
            sensor_xyz_camRs, feed['sensor_xyz_camRs']
        )]), "the sensor_xyz_camRs computed in forward do not match those computed in input"

        ## ... visual occupancy computation: voxelize the pointclouds from above ... ##
        occRs_ = utils_vox.voxelize_xyz(xyz_camRs_, Z, Y, X)
        occXs_ = utils_vox.voxelize_xyz(xyz_camXs_, Z, Y, X)
        occRs_half_ = utils_vox.voxelize_xyz(xyz_camRs_, Z2, Y2, X2)
        occXs_half_ = utils_vox.voxelize_xyz(xyz_camXs_, Z2, Y2, X2)
        ## ... visual occupancy computation end ... NOTE: no unpacking ##

        ## .. visual occupancy computation for sensor inputs .. ##
        sensor_occRs_ = utils_vox.voxelize_xyz(sensor_xyz_camRs_, Z, Y, X)
        sensor_occXs_ = utils_vox.voxelize_xyz(sensor_xyz_camXs_, Z, Y, X)
        sensor_occRs_half_ = utils_vox.voxelize_xyz(sensor_xyz_camRs_, Z2, Y2, X2)
        sensor_occXs_half_ = utils_vox.voxelize_xyz(sensor_xyz_camXs_, Z2, Y2, X2)

        ## ... unproject rgb images ... ##
        unpRs_ = utils_vox.unproject_rgb_to_mem(__p(rgb_camXs), Z, Y, X, pix_T_camRs_)
        unpXs_ = utils_vox.unproject_rgb_to_mem(__p(rgb_camXs), Z, Y, X, pix_T_cams_)
        ## ... unproject rgb finish ... NOTE: no unpacking ##

        ## ... Make depth images ... ##
        depth_camXs_, valid_camXs_ = utils_geom.create_depth_image(pix_T_cams_, xyz_camXs_, H, W)
        dense_xyz_camXs_ = utils_geom.depth2pointcloud(depth_camXs_, pix_T_cams_)
        dense_xyz_camRs_ = utils_geom.apply_4x4(camRs_T_camXs_, dense_xyz_camXs_)
        inbound_camXs_ = utils_vox.get_inbounds(dense_xyz_camRs_, Z, Y, X).float()
        inbound_camXs_ = torch.reshape(inbound_camXs_, [B*S, 1, H, W])
        valid_camXs = __u(valid_camXs_) * __u(inbound_camXs_)
        ## ... Make depth images end ... ##

        ## ... Make sensor depth images ... ##
        sensor_depth_camXs_, sensor_valid_camXs_ = utils_geom.create_depth_image(sensor_pix_T_cams_,
                                                                                 sensor_xyz_camXs_, H, W)
        sensor_dense_xyz_camXs_ = utils_geom.depth2pointcloud(sensor_depth_camXs_, sensor_pix_T_cams_)
        sensor_dense_xyz_camRs_ = utils_geom.apply_4x4(sensor_camRs_T_camXs_, sensor_dense_xyz_camXs_)
        sensor_inbound_camXs_ = utils_vox.get_inbounds(sensor_dense_xyz_camRs_, Z, Y, X).float()
        sensor_inbound_camXs_ = torch.reshape(sensor_inbound_camXs_, [B*hyp.sensor_S, 1, H, W])
        sensor_valid_camXs = __u(sensor_valid_camXs_) * __u(sensor_inbound_camXs_)
        ### .. Done making sensor depth images .. ##

        ### ... Sanity check ... Write to tensorboard ... ###
        summ_writer.summ_oneds('2D_inputs/depth_camXs', torch.unbind(__u(depth_camXs_), dim=1))
        summ_writer.summ_oneds('2D_inputs/valid_camXs', torch.unbind(valid_camXs, dim=1))
        summ_writer.summ_rgbs('2D_inputs/rgb_camXs', torch.unbind(rgb_camXs, dim=1))
        summ_writer.summ_rgbs('2D_inputs/rgb_camRs', torch.unbind(rgb_camRs, dim=1))
        summ_writer.summ_occs('3d_inputs/occXs', torch.unbind(__u(occXs_), dim=1), reduce_axes=[2])
        summ_writer.summ_unps('3d_inputs/unpXs', torch.unbind(__u(unpXs_), dim=1),
                              torch.unbind(__u(occXs_), dim=1))

        # A different approach for viewing occRs of sensors
        sensor_occRs = __u(sensor_occRs_)
        vis_sensor_occRs = torch.max(sensor_occRs, dim=1, keepdim=True)[0]
        # summ_writer.summ_occs('3d_inputs/sensor_occXs', torch.unbind(__u(sensor_occXs_), dim=1),
        #                       reduce_axes=[2])
        summ_writer.summ_occs('3d_inputs/sensor_occRs', torch.unbind(vis_sensor_occRs, dim=1), reduce_axes=[2])

        ### ... code for visualizing sensor depths and sensor rgbs ...
### # summ_writer.summ_oneds('2D_inputs/depths_sensor', torch.unbind(sensor_depths, dim=1)) # summ_writer.summ_rgbs('2D_inputs/rgbs_sensor', torch.unbind(sensor_rgbs, dim=1)) # summ_writer.summ_oneds('2D_inputs/validXs_sensor', torch.unbind(sensor_valid_camXs, dim=1)) if summ_writer.save_this: unpRs_ = utils_vox.unproject_rgb_to_mem(__p(rgb_camXs), Z, Y, X, matmul2(pix_T_cams_, camXs_T_camRs_)) unpRs = __u(unpRs_) occRs_ = utils_vox.voxelize_xyz(xyz_camRs_, Z, Y, X) summ_writer.summ_occs('3d_inputs/occRs', torch.unbind(__u(occRs_), dim=1), reduce_axes=[2]) summ_writer.summ_unps('3d_inputs/unpRs', torch.unbind(unpRs, dim=1),\ torch.unbind(__u(occRs_), dim=1)) ### ... Sanity check ... Writing to tensoboard complete ... ### results = list() mask_ = None ### ... Visual featnet part .... ### if hyp.do_feat: featXs_input = torch.cat([__u(occXs_), __u(occXs_)*__u(unpXs_)], dim=2) # B, S, 4, H, W, D featXs_input_ = __p(featXs_input) freeXs_ = utils_vox.get_freespace(__p(xyz_camXs), occXs_half_) freeXs = __u(freeXs_) visXs = torch.clamp(__u(occXs_half_) + freeXs, 0.0, 1.0) if type(mask_) != type(None): assert(list(mask_.shape)[2:5] == list(featXs_input.shape)[2:5]) featXs_, validXs_, _ = self.featnet(featXs_input_, summ_writer, mask=occXs_) # total_loss += feat_loss # Note no need of loss validXs, featXs = __u(validXs_), __u(featXs_) # unpacked into B, S, C, D, H, W # bring everything to ref_frame validRs = utils_vox.apply_4x4_to_voxs(camRs_T_camXs, validXs) visRs = utils_vox.apply_4x4_to_voxs(camRs_T_camXs, visXs) featRs = utils_vox.apply_4x4_to_voxs(camRs_T_camXs, featXs) # This is now in memory coordinates emb3D_e = torch.mean(featRs[:, 1:], dim=1) # context, or the features of the scene emb3D_g = featRs[:, 0] # this is to predict, basically I will pass emb3D_e as input and hope to predict emb3D_g vis3D_e = torch.max(validRs[:, 1:], dim=1)[0] * torch.max(visRs[:, 1:], dim=1)[0] vis3D_g = validRs[:, 0] * visRs[:, 0] #### ... I do not think I need this ... #### results = {} # # if hyp.do_eval_recall: # # results['emb3D_e'] = emb3D_e # # results['emb3D_g'] = emb3D_g # #### ... Check if you need the above summ_writer.summ_feats('3D_feats/featXs_input', torch.unbind(featXs_input, dim=1), pca=True) summ_writer.summ_feats('3D_feats/featXs_output', torch.unbind(featXs, dim=1), pca=True) summ_writer.summ_feats('3D_feats/featRs_output', torch.unbind(featRs, dim=1), pca=True) summ_writer.summ_feats('3D_feats/validRs', torch.unbind(validRs, dim=1), pca=False) summ_writer.summ_feat('3D_feats/vis3D_e', vis3D_e, pca=False) summ_writer.summ_feat('3D_feats/vis3D_g', vis3D_g, pca=False) # I need to aggregate the features and detach to prevent the backward pass on featnet featRs = torch.mean(featRs, dim=1) featRs = featRs.detach() # ... HERE I HAVE THE VISUAL FEATURE TENSOR ... WHICH IS MADE USING 5 EVENLY SPACED VIEWS # # FOR THE TOUCH PART, I HAVE THE OCC and THE AIM IS TO PREDICT FEATURES FROM THEM # if hyp.do_touch_feat: # 1. Pass all the sensor depth images through the backbone network input_sensor_depths = __p(sensor_depths) sensor_features_ = self.backbone_2D(input_sensor_depths) # should normalize these feature tensors sensor_features_ = l2_normalize(sensor_features_, dim=1) sensor_features = __u(sensor_features_) assert torch.allclose(torch.norm(sensor_features_, dim=1), torch.Tensor([1.0]).cuda()),\ "normalization has no effect on you huh." 
if hyp.do_eval_recall: results['sensor_features'] = sensor_features_ results['sensor_depths'] = input_sensor_depths results['object_img'] = rgb_camRs results['sensor_imgs'] = __p(sensor_rgbs) # if moco is used do the same procedure as above but with a different network # if hyp.do_moc or hyp.do_eval_recall: # 1. Pass all the sensor depth images through the key network key_input_sensor_depths = copy.deepcopy(__p(sensor_depths)) # bx1024x1x16x16->(2048x1x16x16) self.key_touch_featnet.eval() with torch.no_grad(): key_sensor_features_ = self.key_touch_featnet(key_input_sensor_depths) key_sensor_features_ = l2_normalize(key_sensor_features_, dim=1) key_sensor_features = __u(key_sensor_features_) assert torch.allclose(torch.norm(key_sensor_features_, dim=1), torch.Tensor([1.0]).cuda()),\ "normalization has no effect on you huh." # doing the same procedure for moco but with a different network end # # do you want to do metric learning voxel point based using visual features and sensor features if hyp.do_touch_embML and not hyp.do_touch_forward: # trial 1: I do not pass the above obtained features through some encoder decoder in 3d # So compute the location is ref_frame which the center of these depth images will occupy # at all of these locations I will sample the from the visual tensor. It forms the positive pairs # negatives are simply everything except the positive sensor_depths_centers_x = center_sensor_W * torch.ones((hyp.B, hyp.sensor_S)) sensor_depths_centers_x = sensor_depths_centers_x.cuda() sensor_depths_centers_y = center_sensor_H * torch.ones((hyp.B, hyp.sensor_S)) sensor_depths_centers_y = sensor_depths_centers_y.cuda() sensor_depths_centers_z = sensor_depths[:, :, 0, center_sensor_H, center_sensor_W] # Next use Pixels2Camera to unproject all of these together. # merge the batch and the sequence dimension sensor_depths_centers_x = sensor_depths_centers_x.reshape(-1, 1, 1) # BxHxW as required by Pixels2Camera sensor_depths_centers_y = sensor_depths_centers_y.reshape(-1, 1, 1) sensor_depths_centers_z = sensor_depths_centers_z.reshape(-1, 1, 1) fx, fy, x0, y0 = utils_geom.split_intrinsics(sensor_pix_T_cams_) sensor_depths_centers_in_camXs_ = utils_geom.Pixels2Camera(sensor_depths_centers_x, sensor_depths_centers_y, sensor_depths_centers_z, fx, fy, x0, y0) # finally use apply4x4 to get the locations in ref_cam sensor_depths_centers_in_ref_cam_ = utils_geom.apply_4x4(sensor_camRs_T_camXs_, sensor_depths_centers_in_camXs_) # NOTE: convert them to memory coordinates, the name is xyz so I presume it returns xyz but talk to ADAM sensor_depths_centers_in_mem_ = utils_vox.Ref2Mem(sensor_depths_centers_in_ref_cam_, Z2, Y2, X2) sensor_depths_centers_in_mem = sensor_depths_centers_in_mem_.reshape(hyp.B, hyp.sensor_S, -1) if debug: print('assert that you are not entering here') from IPython import embed; embed() # form a (0, 1) volume here at these locations and see if it resembles a cup dim1 = X2 * Y2 * Z2 dim2 = X2 * Y2 dim3 = X2 binary_voxel_grid = torch.zeros((hyp.B, X2, Y2, Z2)) # NOTE: Z is the leading dimension rounded_idxs = torch.round(sensor_depths_centers_in_mem) flat_idxs = dim2 * rounded_idxs[0, :, 0] + dim3 * rounded_idxs[0, :, 1] + rounded_idxs[0, :, 2] flat_idxs1 = dim2 * rounded_idxs[1, :, 0] + dim3 * rounded_idxs[1, :, 1] + rounded_idxs[1, :, 2] flat_idxs1 = flat_idxs1 + dim1 flat_idxs1 = flat_idxs1.long() flat_idxs = flat_idxs.long() flattened_grid = binary_voxel_grid.flatten() flattened_grid[flat_idxs] = 1. flattened_grid[flat_idxs1] = 1. 
binary_voxel_grid = flattened_grid.view(B, X2, Y2, Z2) assert binary_voxel_grid[0].sum() == len(torch.unique(flat_idxs)), "some indexes are missed here" assert binary_voxel_grid[1].sum() == len(torch.unique(flat_idxs1)), "some indexes are missed here" # o3d.io.write_voxel_grid("forward_pass_save/grid0.ply", binary_voxel_grid[0]) # o3d.io.write_voxel_grid("forward_pass_save/grid1.ply", binary_voxel_grid[0]) # need to save these voxels save_voxel(binary_voxel_grid[0].cpu().numpy(), "forward_pass_save/grid0.binvox") save_voxel(binary_voxel_grid[1].cpu().numpy(), "forward_pass_save/grid1.binvox") from IPython import embed; embed() # use grid sample to get the visual touch tensor at these locations, NOTE: visual tensor features shape is (B, C, N) visual_tensor_features = utils_samp.bilinear_sample3D(featRs, sensor_depths_centers_in_mem[:, :, 0], sensor_depths_centers_in_mem[:, :, 1], sensor_depths_centers_in_mem[:, :, 2]) visual_feature_tensor = visual_tensor_features.permute(0, 2, 1) # pack it visual_feature_tensor_ = __p(visual_feature_tensor) C = list(visual_feature_tensor.shape)[-1] print('C=', C) # do the metric learning this is the same as before. # the code is basically copied from embnet3d.py but some changes are being made very minor emb_vec = torch.stack((sensor_features_, visual_feature_tensor_), dim=1).view(B*self.num_samples*self.batch_k, C) y = torch.stack([torch.range(0,self.num_samples*B-1), torch.range(0,self.num_samples*B-1)], dim=1).view(self.num_samples*B*self.batch_k) a_indices, anchors, positives, negatives, _ = self.sampler(emb_vec) # I need to write my own version of margin loss since the negatives and anchors may not be same dim d_ap = torch.sqrt(torch.sum((positives - anchors)**2, dim=1) + 1e-8) pos_loss = torch.clamp(d_ap - beta + self._margin, min=0.0) # TODO: expand the dims of anchors and tile them and compute the negative loss # do the pair count where you average by contributors only # this is your total loss # Further idea is to check what volumetric locations do each of the depth images corresponds to # unproject the entire depth image and convert to ref. and then sample. if hyp.do_touch_forward: ## ... Begin code for getting crops from visual memory ... ## sensor_depths_centers_x = center_sensor_W * torch.ones((hyp.B, hyp.sensor_S)) sensor_depths_centers_x = sensor_depths_centers_x.cuda() sensor_depths_centers_y = center_sensor_H * torch.ones((hyp.B, hyp.sensor_S)) sensor_depths_centers_y = sensor_depths_centers_y.cuda() sensor_depths_centers_z = sensor_depths[:, :, 0, center_sensor_H, center_sensor_W] # Next use Pixels2Camera to unproject all of these together. 
# merge the batch and the sequence dimension sensor_depths_centers_x = sensor_depths_centers_x.reshape(-1, 1, 1) sensor_depths_centers_y = sensor_depths_centers_y.reshape(-1, 1, 1) sensor_depths_centers_z = sensor_depths_centers_z.reshape(-1, 1, 1) fx, fy, x0, y0 = utils_geom.split_intrinsics(sensor_pix_T_cams_) sensor_depths_centers_in_camXs_ = utils_geom.Pixels2Camera(sensor_depths_centers_x, sensor_depths_centers_y, sensor_depths_centers_z, fx, fy, x0, y0) sensor_depths_centers_in_world_ = utils_geom.apply_4x4(sensor_origin_T_camXs_, sensor_depths_centers_in_camXs_) # not used by the algorithm ## this will be later used for visualization hence saving it here for now sensor_depths_centers_in_ref_cam_ = utils_geom.apply_4x4(sensor_camRs_T_camXs_, sensor_depths_centers_in_camXs_) # not used by the algorithm sensor_depths_centers_in_camXs = __u(sensor_depths_centers_in_camXs_).squeeze(2) # There has to be a better way to do this, for each of the cameras in the batch I want a box of size (ch, cw, cd) # TODO: rotation is the deviation of the box from the axis aligned do I want this tB, tN, _ = list(sensor_depths_centers_in_camXs.shape) # 2, 512, _ boxlist = torch.zeros(tB, tN, 9) # 2, 512, 9 boxlist[:, :, :3] = sensor_depths_centers_in_camXs # this lies on the object boxlist[:, :, 3:6] = torch.FloatTensor([hyp.contextW, hyp.contextH, hyp.contextD]) # convert the boxlist to lrtlist and to cuda # the rt here transforms the from box coordinates to camera coordinates box_lrtlist = utils_geom.convert_boxlist_to_lrtlist(boxlist) # Now I will use crop_zoom_from_mem functionality to get the features in each of the boxes # I will do it for each of the box separately as required by the api context_grid_list = list() for m in range(box_lrtlist.shape[1]): curr_box = box_lrtlist[:, m, :] context_grid = utils_vox.crop_zoom_from_mem(featRs, curr_box, 8, 8, 8, sensor_camRs_T_camXs[:, m, :, :]) context_grid_list.append(context_grid) context_grid_list = torch.stack(context_grid_list, dim=1) context_grid_list_ = __p(context_grid_list) ## ... till here I believe I have not introduced any randomness, so the points are still in ## ... End code for getting crops around this center of certain height, width and depth ... ## ## ... Begin code for passing the context grid through 3D CNN to obtain a vector ... ## sensor_cam_locs = feed['sensor_locs'] # these are in origin coordinates sensor_cam_quats = feed['sensor_quats'] # this too in in world_coordinates sensor_cam_locs_ = __p(sensor_cam_locs) sensor_cam_quats_ = __p(sensor_cam_quats) sensor_cam_locs_in_R_ = utils_geom.apply_4x4(sensor_camRs_T_origin_, sensor_cam_locs_.unsqueeze(1)).squeeze(1) # TODO TODO TODO confirm that this is right? 
TODO TODO TODO get_r_mat = lambda cam_quat: transformations.quaternion_matrix_py(cam_quat) rot_mat_Xs_ = torch.from_numpy(np.stack(list(map(get_r_mat, sensor_cam_quats_.cpu().numpy())))).to(sensor_cam_locs_.device).float() rot_mat_Rs_ = torch.bmm(sensor_camRs_T_origin_, rot_mat_Xs_) get_quat = lambda r_mat: transformations.quaternion_from_matrix_py(r_mat) sensor_quats_in_R_ = torch.from_numpy(np.stack(list(map(get_quat, rot_mat_Rs_.cpu().numpy())))).to(sensor_cam_locs_.device).float() pred_features_ = self.context_net(context_grid_list_,\ sensor_cam_locs_in_R_, sensor_quats_in_R_) # normalize pred_features_ = l2_normalize(pred_features_, dim=1) pred_features = __u(pred_features_) # if doing moco I have to pass the inputs through the key(slow) network as well # if hyp.do_moc or hyp.do_eval_recall: key_context_grid_list_ = copy.deepcopy(context_grid_list_) key_sensor_cam_locs_in_R_ = copy.deepcopy(sensor_cam_locs_in_R_) key_sensor_quats_in_R_ = copy.deepcopy(sensor_quats_in_R_) self.key_context_net.eval() with torch.no_grad(): key_pred_features_ = self.key_context_net(key_context_grid_list_,\ key_sensor_cam_locs_in_R_, key_sensor_quats_in_R_) # normalize, normalization is very important why though key_pred_features_ = l2_normalize(key_pred_features_, dim=1) key_pred_features = __u(key_pred_features_) # end passing of the input through the slow network this is necessary for moco # ## ... End code for passing the context grid through 3D CNN to obtain a vector ... ## ## ... Begin code for doing metric learning between pred_features and sensor features ... ## # 1. Subsample both based on the number of positive samples if hyp.do_touch_embML: assert(hyp.do_touch_forward) assert(hyp.do_touch_feat) perm = torch.randperm(len(pred_features_)) ## 1024 chosen_sensor_feats_ = sensor_features_[perm[:self.num_pos_samples*hyp.B]] chosen_pred_feats_ = pred_features_[perm[:self.num_pos_samples*B]] # 2. form the emb_vec and get pos and negative samples for the batch emb_vec = torch.stack((chosen_sensor_feats_, chosen_pred_feats_), dim=1).view(hyp.B*self.num_pos_samples*self.batch_k, -1) y = torch.stack([torch.range(0, self.num_pos_samples*B-1), torch.range(0, self.num_pos_samples*B-1)],\ dim=1).view(B*self.num_pos_samples*self.batch_k) # (0, 0, 1, 1, ..., 255, 255) a_indices, anchors, positives, negatives, _ = self.sampler(emb_vec) # 3. Compute the loss, ML loss and the l2 distance betwee the embeddings margin_loss, _ = self.criterion(anchors, positives, negatives, self.beta, y[a_indices]) total_loss = utils_misc.add_loss('embtouch/emb_touch_ml_loss', total_loss, margin_loss, hyp.emb_3D_ml_coeff, summ_writer) # the l2 loss between the embeddings l2_loss = torch.nn.functional.mse_loss(chosen_sensor_feats_, chosen_pred_feats_) total_loss = utils_misc.add_loss('embtouch/emb_l2_loss', total_loss, l2_loss, hyp.emb_3D_l2_coeff, summ_writer) ## ... End code for doing metric learning between pred_features and sensor_features ... ## ## ... Begin code for doing moc inspired ML between pred_features and sensor_features ... ## if hyp.do_moc and moc_init_done: moc_loss = self.moc_ml_net(sensor_features_, key_sensor_features_,\ pred_features_, key_pred_features_, summ_writer) total_loss += moc_loss ## ... End code for doing moc inspired ML between pred_features and sensor_feature ... ## ## ... add code for filling up results needed for eval recall ... 
## if hyp.do_eval_recall and moc_init_done: results['context_features'] = pred_features_ results['sensor_depth_centers_in_world'] = sensor_depths_centers_in_world_ results['sensor_depths_centers_in_ref_cam'] = sensor_depths_centers_in_ref_cam_ results['object_name'] = feed['object_name'] # I will do precision recall here at different recall values and summarize it using tensorboard recalls = [1, 5, 10, 50, 100, 200] # also should not include any gradients because of this # fast_sensor_emb_e = sensor_features_ # fast_context_emb_e = pred_features_ # slow_sensor_emb_g = key_sensor_features_ # slow_context_emb_g = key_context_features_ fast_sensor_emb_e = sensor_features_.clone().detach() fast_context_emb_e = pred_features_.clone().detach() # I will do multiple eval recalls here slow_sensor_emb_g = key_sensor_features_.clone().detach() slow_context_emb_g = key_pred_features_.clone().detach() # assuming the above thing goes well fast_sensor_emb_e = fast_sensor_emb_e.cpu().numpy() fast_context_emb_e = fast_context_emb_e.cpu().numpy() slow_sensor_emb_g = slow_sensor_emb_g.cpu().numpy() slow_context_emb_g = slow_context_emb_g.cpu().numpy() # now also move the vis to numpy and plot it using matplotlib vis_e = __p(sensor_rgbs) vis_g = __p(sensor_rgbs) np_vis_e = vis_e.cpu().detach().numpy() np_vis_e = np.transpose(np_vis_e, [0, 2, 3, 1]) np_vis_g = vis_g.cpu().detach().numpy() np_vis_g = np.transpose(np_vis_g, [0, 2, 3, 1]) # bring it back to original color np_vis_g = ((np_vis_g+0.5) * 255).astype(np.uint8) np_vis_e = ((np_vis_e+0.5) * 255).astype(np.uint8) # now compare fast_sensor_emb_e with slow_context_emb_g # since I am doing positive against this fast_sensor_emb_e_list = [fast_sensor_emb_e, np_vis_e] slow_context_emb_g_list = [slow_context_emb_g, np_vis_g] prec, vis, chosen_inds_and_neighbors_inds = compute_precision( fast_sensor_emb_e_list, slow_context_emb_g_list, recalls=recalls ) # finally plot the nearest neighbour retrieval and move ahead if feed['global_step'] % 1 == 0: plot_nearest_neighbours(vis, step=feed['global_step'], save_dir='/home/gauravp/eval_results', name='fast_sensor_slow_context') # plot the precisions at different recalls for pr, re in enumerate(recalls): summ_writer.summ_scalar(f'evrefast_sensor_slow_context/recall@{re}',\ prec[pr]) # now compare fast_context_emb_e with slow_sensor_emb_g fast_context_emb_e_list = [fast_context_emb_e, np_vis_e] slow_sensor_emb_g_list = [slow_sensor_emb_g, np_vis_g] prec, vis, chosen_inds_and_neighbors_inds = compute_precision( fast_context_emb_e_list, slow_sensor_emb_g_list, recalls=recalls ) if feed['global_step'] % 1 == 0: plot_nearest_neighbours(vis, step=feed['global_step'], save_dir='/home/gauravp/eval_results', name='fast_context_slow_sensor') # plot the precisions at different recalls for pr, re in enumerate(recalls): summ_writer.summ_scalar(f'evrefast_context_slow_sensor/recall@{re}',\ prec[pr]) # now finally compare both the fast, I presume we want them to go closer too fast_sensor_list = [fast_sensor_emb_e, np_vis_e] fast_context_list = [fast_context_emb_e, np_vis_g] prec, vis, chosen_inds_and_neighbors_inds = compute_precision( fast_sensor_list, fast_context_list, recalls=recalls ) if feed['global_step'] % 1 == 0: plot_nearest_neighbours(vis, step=feed['global_step'], save_dir='/home/gauravp/eval_results', name='fast_sensor_fast_context') for pr, re in enumerate(recalls): summ_writer.summ_scalar(f'evrefast_sensor_fast_context/recall@{re}',\ prec[pr]) ## ... done code for filling up results needed for eval recall ... 
        ##

        summ_writer.summ_scalar('loss', total_loss.cpu().item())
        return total_loss, results, [key_sensor_features_, key_pred_features_]
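# --- Illustrative sketch (not from this repository) ---
# TouchEmbed2D builds slow "key" copies of backbone_2D and context_net, loads the fast
# weights into them, and asserts the parameters match at init time. The momentum (EMA)
# update that keeps the key networks trailing the fast ones is assumed to live in the
# training loop; the helper below is only a minimal sketch of that update. The function
# name and the momentum value are assumptions, not code taken from this codebase.
import torch


@torch.no_grad()
def momentum_update(fast_net, key_net, momentum=0.999):
    """EMA update: key <- momentum * key + (1 - momentum) * fast."""
    for fast_param, key_param in zip(fast_net.parameters(), key_net.parameters()):
        key_param.data.mul_(momentum).add_(fast_param.data, alpha=1.0 - momentum)

# Hypothetical usage inside a training step:
#   loss, results, _ = model(feed, moc_init_done=True)
#   loss.backward(); optimizer.step()
#   momentum_update(model.backbone_2D, model.key_touch_featnet)
#   momentum_update(model.context_net, model.key_context_net)
# --- end sketch ---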
class MujocoOffline(nn.Module):
    def __init__(self, config):
        super(MujocoOffline, self).__init__()
        self.config = config
        if self.config.do_feat:
            print('------- adding featnet --------')
            self.featnet = FeatNet(self.config)
        if self.config.do_occ:
            print('------- adding occnet ---------')
            self.occnet = OccNet(self.config)
        if self.config.do_view:
            print('------- adding viewnet --------')
            self.viewnet = ViewNet(self.config)
        if self.config.do_det:
            print('------- adding detnet ---------')
            self.detnet = DetNet(self.config)

        # coordinate range
        self.coord_cam_front = Coord(-0.5, 0.5, -0.5, 0.5, 0.2, 1.2, 0.0, -0.4)
        self.coord_mem = Coord(-0.5, 0.5, -0.5, 0.5, -0.5, 0.5, 0.0, -0.4)
        MH, MW, MD = self.config.Y, self.config.X, self.config.Z
        MH2, MW2, MD2 = int(MH / 2), int(MW / 2), int(MD / 2)

        # voxel size
        mem_protos = VoxProto([MH, MW, MD])
        halfmem_protos = VoxProto([MH2, MW2, MD2])

        # combine
        self.mem_coord_cams = VoxCoord(self.coord_cam_front, mem_protos)
        self.mem_coord_Rs = VoxCoord(self.coord_mem, mem_protos)
        self.halfmem_coord_cams = VoxCoord(self.coord_cam_front, halfmem_protos)
        self.halfmem_coord_Rs = VoxCoord(self.coord_mem, halfmem_protos)

        self.feat_mem_coord_cams = None  # self.halfmem_coord_Rs
        self.feat_mem_coord_Rs = None
        self.is_learned_cluster_centers = False

    def save_local_variables(self):
        output = dict()
        return output

    def unproject(self, cam_rgbd_inputs, cam_info_inputs):
        rgb_camXs, xyz_camXs = cam_rgbd_inputs
        pix_T_cams, origin_T_camXs, origin_T_camRs = cam_info_inputs

        B, H, W, V, S = self.config.B, self.config.H, self.config.W, self.config.V, self.config.S
        PH, PW = self.config.PH, self.config.PW  # this is the size of the predicted image
        # the next are the memory dimensions, do not know why this naming though

        # merge sequence and batch dimensions
        __p = lambda x: utils.basic.pack_seqdim(x, B)
        # unmerge sequence and batch dimensions
        __u = lambda x: utils.basic.unpack_seqdim(x, B)

        pix_T_cams_ = __p(pix_T_cams)  # packing here the (B, S) => (B*S)
        # intrinsic matrix packed and unpacked end
        origin_T_camRs_ = __p(origin_T_camRs)
        origin_T_camXs_ = __p(origin_T_camXs)
        # origin_T_camXs unpacked and packed end

        # completed getting inputs, now combining them
        # 1.
Converts from camX to camR which is Adam's coordinate system # get from camX_T_camR and camR_T_camX and pack unpack it camRs_T_camXs_ = torch.matmul(utils.geom.safe_inverse(origin_T_camRs_), origin_T_camXs_) camXs_T_camRs_ = utils.geom.safe_inverse(camRs_T_camXs_) camRs_T_camXs = __u(camRs_T_camXs_) camXs_T_camRs = __u(camXs_T_camRs_) # end of camX_T_camR and camR_T_camX and pack unpack it # goes directly from camR to image in each camera image frame pix_T_cams_ = utils.geom.pack_intrinsics(pix_T_cams_[:, 0, 0], pix_T_cams_[:, 1, 1], pix_T_cams_[:, 0, 2], pix_T_cams_[:, 1, 2]) pix_T_camRs_ = torch.matmul(pix_T_cams_, camXs_T_camRs_) pix_T_camRs = __u(pix_T_camRs_) # end of computation for matrix which goes from camR to each camera image frame # pointclouds in each camera frame xyz_camXs_ = __p(xyz_camXs) # pointclouds converted to camR coordinate system xyz_camRs_ = utils.geom.apply_4x4(camRs_T_camXs_, xyz_camXs_) xyz_camRs = __u(xyz_camRs_) # TODO: visualize the point cloud here and check that it makes sense # get occupancy maps from pointclouds # QUESTION: what is the space you are discretizing, I mean the extent of the space occRs_ = utils.vox.voxelize_xyz(xyz_camRs_, self.mem_coord_Rs) occXs_ = utils.vox.voxelize_xyz(xyz_camXs_, self.mem_coord_cams) occRs_half_ = utils.vox.voxelize_xyz(xyz_camRs_, self.halfmem_coord_Rs) occXs_half_ = utils.vox.voxelize_xyz(xyz_camXs_, self.halfmem_coord_cams) occRs = __u(occRs_) occXs = __u(occXs_) occRs_half = __u(occRs_half_) occXs_half = __u(occXs_half_) # unproject depth images, This is done for the color images not the depths ## rgb unprojection, bilinearly samples and fills the grid my_device = rgb_camXs.device unpRs_ = utils.vox.unproject_rgb_to_mem(__p(rgb_camXs), pix_T_camRs_, self.mem_coord_Rs, device=my_device) unpXs_ = utils.vox.unproject_rgb_to_mem(__p(rgb_camXs), pix_T_cams_, self.mem_coord_cams, device=my_device) unpRs = __u(unpRs_) unpXs = __u(unpXs_) unpRs_half_ = utils.vox.unproject_rgb_to_mem(__p(rgb_camXs), pix_T_camRs_, self.halfmem_coord_Rs, device=my_device) unpRs_half = __u(unpRs_half_) unp_visRs = utils.improc.get_unps_vis(unpRs_half, occRs_half) unp_visRs = torch.mean(unp_visRs, dim=1) # NOTE: still do not know why is this required or where is this used for that matter depth_camXs_, valid_camXs_ = utils.geom.create_depth_image( pix_T_cams_, xyz_camXs_, H, W) dense_xyz_camXs_ = utils.geom.depth2pointcloud(depth_camXs_, pix_T_cams_) dense_xyz_camRs_ = utils.geom.apply_4x4(camRs_T_camXs_, dense_xyz_camXs_) # this is B*S x H*W x 3 inbound_camXs_ = utils.vox.get_inbounds(dense_xyz_camRs_, self.mem_coord_cams).float() inbound_camXs_ = torch.reshape( inbound_camXs_, [B * S, 1, H, W ]) # NOTE: Here there is a difference in tensorflow code inbound_camXs = __u(inbound_camXs_) depth_camXs = __u(depth_camXs_) valid_camXs = __u(valid_camXs_) * inbound_camXs return depth_camXs, valid_camXs, camRs_T_camXs, camXs_T_camRs, unpXs, unpRs, occXs, occRs, occXs_half, occRs_half, unp_visRs def predict_forward(self, feed): cam_rgbd_inputs = (feed["rgb_camXs"], feed["xyz_camXs"]) cam_info_inputs = (feed["pix_T_cams"], feed["origin_T_camXs"], feed["origin_T_camRs"]) depth_camXs, valid_camXs, camRs_T_camXs, camXs_T_camRs, unpXs, unpRs, occXs, occRs, occXs_half, occRs_half = self.unproject( cam_rgbd_inputs, cam_info_inputs) B = self.config.B __p = lambda x: utils.basic.pack_seqdim(x, B) # unmerge sequence and batch dimensions __u = lambda x: utils.basic.unpack_seqdim(x, B) if self.config.do_feat: rgb_camXs, xyz_camXs = cam_rgbd_inputs featXs_input = 
torch.cat([occXs, occXs * unpXs], dim=2) # B, S, 4, H, W, D featXs_input_ = __p(featXs_input) freeXs_ = utils.vox.get_freespace(__p(xyz_camXs), __p(occXs_half), self.halfmem_coord_cams) freeXs = __u(freeXs_) visXs = torch.clamp(occXs_half + freeXs, 0.0, 1.0) #if type(mask_) != type(None): # featXs_input: B x NVIEWS x 4 x 64 x 64 x 64 assert (list(occXs.shape)[3:6] == list(featXs_input.shape)[3:6]) featXs_, validXs_, feat_loss = self.featnet( featXs_input_, mask=__p(occXs), set_num=feed['set_num']) assert feat_loss.item( ) == 0.0, "there is nothing to guide featnet by itself" # for each view features are being predicted, NOTE that nothing is brought into common view yet validXs, featXs = __u(validXs_), __u(featXs_) #### .... BEGIN Converting everything to ref frame .... #### validRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, validXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) visRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, visXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) featRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, featXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) if self.feat_mem_coord_Rs == None: self.feat_mem_coord_Rs = self.halfmem_coord_Rs #### .... featRs_without_target_view contains features from all the views #### .... warped and brought into common frame and aggregated .... Using #### .... features occupancy and target view should be predicted .... #### # B x 32 x H x W x D featsRs_without_target_view = torch.mean(featRs[:, 1:], dim=1) if self.config.do_view: assert self.config.do_feat PH, PW = self.config.PH, self.config.PW sy = float(PH) / float(self.config.H) sx = float(PW) / float(self.config.W) assert (sx == 0.5) assert (sy == 0.5) # projpix_T_cams, are the intrinsics for the projection, just scale the true intrinsics pix_T_cams = feed["pix_T_cams"] projpix_T_cams = __u( utils.geom.scale_intrinsics(__p(pix_T_cams), sx, sy)) # now I will project the predicted feats to target view (warp) feat_projtarget_view = utils.vox.apply_pixX_T_memR_to_voxR( projpix_T_cams[:, 0], camXs_T_camRs[:, 0], self.halfmem_coord_Rs, featsRs_without_target_view, self.config.view_depth, PH, PW) rgb_X0 = utils.basic.downsample( rgb_camXs[:, 0], 2) ## NOTE: this is the ground truth # rgb_e: b x 3 x 64 x 64 view_loss, rgb_e, emb2D_e = self.viewnet(feat_projtarget_view, rgb_X0, set_num=feed['set_num']) #crop object features bbox_in_ref_cam = feed['bbox_in_ref_cam'] # based on the batch size this would be B, N, 8, 3 min_bounds = bbox_in_ref_cam[:, :, 0, :] max_bounds = bbox_in_ref_cam[:, :, -1, :] lengths = torch.abs(max_bounds - min_bounds) center = (max_bounds + min_bounds) * 0.5 # now form the box and then covert to lrt list B = self.config.B # since i have only one box N = 1 # number of objects # 9 is cx, cy, cz, lx, ly, lz, rx, ry, rz boxlist = torch.zeros(B, N, 9) # NOTE: Note: I am assuming here that N = 1 !!!!!! boxlist[:, :, :3] = center #.unsqueeze(1) boxlist[:, :, 3:6] = lengths #.unsqueeze(1) # convert it to lrt list, it contains box length and rt to go # from box coordinates to ref coordinate system. box_lrtlist = utils.geom.convert_boxlist_to_lrtlist(boxlist) # now this is already in the ref coordinate system which was not # the case with my previous use of the crop_zoom_from_mem func. 
# Hence I had previously included camR_T_camXs which is not req here _, _, box_dim = box_lrtlist.shape presumably_object_tensor = utils.vox.crop_zoom_from_mem( featsRs_without_target_view, self.feat_mem_coord_Rs, torch.reshape(box_lrtlist[:, :, :], [B * N, box_dim]), 32, 32, 32) _, C, H, W, D = presumably_object_tensor.shape presumably_object_tensor = torch.reshape( presumably_object_tensor.permute([0, 2, 3, 4, 1]), [B, N, H, W, D, C]) # NOTE: As of now I am not doing backprop through this Tensor so # no need to keep it in gpu anymore results = dict() results['object_tensor'] = presumably_object_tensor.cpu().detach( ).numpy() results[ 'featsRs_without_target_view'] = featsRs_without_target_view.permute( [0, 2, 3, 4, 1]).cpu().detach().numpy() results['rgb_e'] = rgb_e.permute(0, 2, 3, 1).cpu().detach().numpy() # Add the plot of this to tensorboard, and also think how can you # visualize if the correct thing is being returned to you. return results def predict_forward_bbox_detector(self, feed): # here I assume that this function will be called only during inference, so all the sequences should be used results = dict() cam_rgbd_inputs = (feed['rgb_camXs'], feed['xyz_camXs']) cam_info_inputs = (feed['pix_T_cams'], feed['origin_T_camXs'], feed['origin_T_camRs']) B = self.config.B __p = lambda x: utils.basic.pack_seqdim(x, B) __u = lambda x: utils.basic.unpack_seqdim(x, B) # if self.config.do_det: # # NOTE: this does not contain a valid box # gt_boxes_corners = feed['bbox_in_ref_cam'] ## fill in some random values # gt_boxesRMem_corners = utils.vox.Ref2Mem(gt_boxes_corners, self.halfmem_coord_Rs) # gt_boxesRMem_corners = gt_boxesRMem_corners.unsqueeze(1) # this indicates the number of boxes # # # I have corners in mem now, I will convert it to boxlist # gt_boxesRMem_theta = utils.geom.convert_corners_to_axis_aligned_boxlist(gt_boxesRMem_corners) # scores = torch.ones(self.config.B, 1).float().to(gt_boxesRMem_theta.device) depth_camXs, valid_camXs, camRs_T_camXs, camXs_T_camRs, unpXs, unpRs, occXs, occRs, occXs_half, occRs_half, unp_visRs = self.unproject( cam_rgbd_inputs, cam_info_inputs) # put the model in eval mode here self.featnet.eval() assert self.featnet.training == False, "should have batch norm switched off" self.detnet.eval() assert self.detnet.training == False, "should have batch norm switched off here" if self.config.do_feat: rgb_camXs, xyz_camXs = cam_rgbd_inputs featXs_input = torch.cat([occXs, occXs * unpXs], dim=2) # B, S, 4, H, W, D featXs_input_ = __p(featXs_input) freeXs_ = utils.vox.get_freespace(__p(xyz_camXs), __p(occXs_half), self.halfmem_coord_cams) freeXs = __u(freeXs_) visXs = torch.clamp(occXs_half + freeXs, 0.0, 1.0) #if type(mask_) != type(None): # featXs_input: B x NVIEWS x 4 x 64 x 64 x 64 assert (list(occXs.shape)[3:6] == list(featXs_input.shape)[3:6]) with torch.no_grad(): assert self.featnet.training == False featXs_, validXs_, feat_loss = self.featnet( featXs_input_, mask=__p(occXs), set_num=feed['set_num']) validXs, featXs = __u(validXs_), __u(featXs_) validRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, validXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) visRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, visXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) featRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, featXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) if self.feat_mem_coord_Rs == None: self.feat_mem_coord_Rs = self.halfmem_coord_Rs # since only using for 
prediction all the views can be used? think about this features = torch.mean(featRs, dim=1) # across the sequence dimension if self.config.do_det: self.axboxlist_memR = gt_boxesRMem_theta.clone() self.scorelist = scores # TODO: some parts of this network are confusing to me, understand it thoroughly with torch.no_grad(): assert self.detnet.training == False detect_loss, boxlist_memR_e, scorelist_e, tidlist_e, sco, ove = self.detnet( self.axboxlist_memR, self.scorelist, features, summ_writer=None) # now that I have the box list I need to convert it into corners and return # convert it to ref image and return, mem to adam to ref cam # adam_T_ref = feed['adam_T_camRs'].squeeze(1) # ref_T_adam = torch.inverse(adam_T_ref) pred_box_corners_mem = utils.geom.transform_boxes_to_corners( boxlist_memR_e) pred_box_corners_adam = utils.geom.apply_4x4_to_corners( utils.coordTcoord.get_ref_T_mem(B, self.halfmem_coord_Rs), pred_box_corners_mem) #pred_box_corners_adam = utils.vox.Mem2Ref(pred_box_corners_mem, self.halfmem_coord_Rs) # pred_box_ref = utils.geom.apply_4x4_to_corners(ref_T_adam, pred_box_corners_adam) results['predicted_boxes_adam'] = pred_box_corners_adam results['scorelist'] = scorelist_e results['tidlist'] = tidlist_e return results def convert_objects_to_features(self, feed): results = self.predict_forward(feed) return results['object_tensor'] def dump_one_batch(self, feed): import pickle import copy feed_copy = dict() i = 0 for key in feed: if key in ['record', 'writer', 'global_step']: continue if torch.is_tensor(feed[key]): tensor_np = feed[key].cpu() feed_copy[key] = tensor_np else: feed_copy[key] = feed[key] i += 1 #if i > 1: # break with open("tmp/feed.pkl", "wb") as f: pickle.dump(feed_copy, f) import ipdb ipdb.set_trace() def forward(self, feed): # feed is the input here, let's see what it has results = dict() #self.dump_one_batch(feed) # Whenever forward is called, this is instantiated which creates summ_writer object # save this is True if global_step % log_freq == 0 summ_writer = utils.improc.Summ_writer(config=self.config, writer=feed['writer'], global_step=feed['global_step'], set_name=feed['set_name'], fps=8) writer = feed['writer'] #global_step = feed['global_step'] total_loss = torch.tensor(0.0).cuda() cam_rgbd_inputs = (feed["rgb_camXs"], feed["xyz_camXs"]) cam_info_inputs = (feed["pix_T_cams"], feed["origin_T_camXs"], feed["origin_T_camRs"]) depth_camXs, valid_camXs, camRs_T_camXs, camXs_T_camRs, unpXs, unpRs, occXs, occRs, occXs_half, occRs_half, unp_visRs = self.unproject( cam_rgbd_inputs, cam_info_inputs) B = self.config.B __p = lambda x: utils.basic.pack_seqdim(x, B) # unmerge sequence and batch dimensions __u = lambda x: utils.basic.unpack_seqdim(x, B) #### ... VISUALIZE what we got ... 
#### # prepare stuff for the detector if self.config.do_det: gt_boxesR_corners = feed['bbox_in_ref_cam'] gt_boxesR_corners_ = __p(gt_boxesR_corners) # convert the corners into memory coordinates gt_boxesRMem_corners_ = utils.vox.Ref2Mem(gt_boxesR_corners_, self.halfmem_coord_Rs) gt_boxesRMem_corners = __u(gt_boxesRMem_corners_) #.unsqueeze(1) gt_boxesRMem_theta = utils.geom.convert_corners_to_axis_aligned_boxlist( gt_boxesRMem_corners) # finally get the scores, I am assuming 1 since all boxes are visible for me and all boxes for me is 1 scores = torch.ones(B, 1).float().to(gt_boxesRMem_theta.device) if not feed['set_num'] == 1: rgb_camXs, xyz_camXs = cam_rgbd_inputs rgb_camRs = feed["rgb_camRs"] summ_writer.summ_oneds('2D_inputs/depth_camXs', torch.unbind(depth_camXs, dim=1)) summ_writer.summ_oneds('2D_inputs/valid_camXs', torch.unbind(valid_camXs, dim=1)) summ_writer.summ_rgbs('2D_inputs/rgb_camXs', torch.unbind(rgb_camXs, dim=1)) summ_writer.summ_rgbs('2D_inputs/rgb_camRs', torch.unbind(rgb_camRs, dim=1)) summ_writer.summ_occs('3d_inputs/occXs', torch.unbind(occXs, dim=1), reduce_axes=[2]) summ_writer.summ_unps('3d_inputs/unpXs', torch.unbind(unpXs, dim=1), torch.unbind(occXs, dim=1)) if summ_writer.save_this: # why compute again? #unpRs_ = utils.vox.unproject_rgb_to_mem(__p(rgb_camXs), utils.basic.matmul2(pix_T_cams_, camXs_T_camRs_), self.mem_coord_Rs) #unpRs = __u(unpRs_) #occRs_ = utils.vox.voxelize_xyz(xyz_camRs_, self.mem_coord_Rs) summ_writer.summ_occs('3d_inputs/occRs', torch.unbind(occRs, dim=1), reduce_axes=[2]) summ_writer.summ_unps('3d_inputs/unpRs', torch.unbind(unpRs, dim=1), torch.unbind(occRs, dim=1)) else: rgb_camXs, xyz_camXs = cam_rgbd_inputs rgb_camRs = feed["rgb_camRs"] summ_writer.summ_oneds('2D_inputs_val/depth_camXs', torch.unbind(depth_camXs, dim=1)) summ_writer.summ_oneds('2D_inputs_val/valid_camXs', torch.unbind(valid_camXs, dim=1)) summ_writer.summ_rgbs('2D_inputs_val/rgb_camXs', torch.unbind(rgb_camXs, dim=1)) summ_writer.summ_rgbs('2D_inputs_val/rgb_camRs', torch.unbind(rgb_camRs, dim=1)) summ_writer.summ_occs('3d_inputs_val/occXs', torch.unbind(occXs, dim=1), reduce_axes=[2]) summ_writer.summ_unps('3d_inputs_val/unpXs', torch.unbind(unpXs, dim=1), torch.unbind(occXs, dim=1)) if summ_writer.save_this: #unpRs_ = utils.vox.unproject_rgb_to_mem(__p(rgb_camXs), Z, Y, X, utils.basic.matmul2(pix_T_cams_, camXs_T_camRs_)) #unpRs = __u(unpRs_) #occRs_ = utils.vox.voxelize_xyz(xyz_camRs_, Z, Y, X) summ_writer.summ_occs('3d_inputs_val/occRs', torch.unbind(occRs, dim=1), reduce_axes=[2]) summ_writer.summ_unps('3d_inputs_val/unpRs', torch.unbind(unpRs, dim=1), torch.unbind(occRs, dim=1)) # the idea behind view-pred is form memory with the remaining views project it to target view and # then use this memory to predict the target image # idea behind occ_prediction is use the memory to predict occupancy in ref view and compare it # with the ground truth occupancy in the ref view if self.config.do_feat: rgb_camXs, xyz_camXs = cam_rgbd_inputs featXs_input = torch.cat([occXs, occXs * unpXs], dim=2) # B, S, 4, H, W, D featXs_input_ = __p(featXs_input) freeXs_ = utils.vox.get_freespace(__p(xyz_camXs), __p(occXs_half), self.halfmem_coord_cams) freeXs = __u(freeXs_) visXs = torch.clamp(occXs_half + freeXs, 0.0, 1.0) #if type(mask_) != type(None): # featXs_input: B x NVIEWS x 4 x 64 x 64 x 64 assert (list(occXs.shape)[3:6] == list(featXs_input.shape)[3:6]) featXs_, validXs_, feat_loss = self.featnet( featXs_input_, summ_writer, mask=__p(occXs), set_num=feed['set_num']) total_loss 
+= feat_loss assert feat_loss.item( ) == 0.0, "there is nothing to guide featnet by itself" # for each view features are being predicted, NOTE that nothing is brought into common view yet validXs, featXs = __u(validXs_), __u(featXs_) #### .... BEGIN Converting everything to ref frame .... #### validRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, validXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) visRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, visXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) featRs = utils.vox.apply_4x4_to_voxs( camRs_T_camXs, featXs, mem_coord_As=self.halfmem_coord_cams, mem_coord_Bs=self.halfmem_coord_Rs) if self.feat_mem_coord_Rs == None: self.feat_mem_coord_Rs = self.halfmem_coord_Rs #### .... END converting everything to ref frame .... #### ### ... Remember _e added at the end means it is estimated ... ### vis3D_e = torch.max(validRs[:, 1:], dim=1)[0] * torch.max( visRs[:, 1:], dim=1)[0] ### ... only thing which is using _e is below visualization ... ### if not feed['set_num'] == 1: summ_writer.summ_feats('3D_feats/featXs_input', torch.unbind(featXs_input, dim=1), pca=True) summ_writer.summ_feats('3D_feats/featXs_output', torch.unbind(featXs, dim=1), pca=True) summ_writer.summ_feats('3D_feats/featRs_output', torch.unbind(featRs, dim=1), pca=True) summ_writer.summ_feats('3D_feats/validRs', torch.unbind(validRs, dim=1), pca=False) summ_writer.summ_feat('3D_feats/vis3D_e', vis3D_e, pca=False) else: summ_writer.summ_feats('3D_feats_val/featXs_input', torch.unbind(featXs_input, dim=1), pca=True) summ_writer.summ_feats('3D_feats_val/featXs_output', torch.unbind(featXs, dim=1), pca=True) summ_writer.summ_feats('3D_feats_val/featRs_output', torch.unbind(featRs, dim=1), pca=True) summ_writer.summ_feats('3D_feats_val/validRs', torch.unbind(validRs, dim=1), pca=False) summ_writer.summ_feat('3D_feats_val/vis3D_e', vis3D_e, pca=False) #### .... featRs_without_target_view contains features from all the views #### .... warped and brought into common frame and aggregated .... Using #### .... features occupancy and target view should be predicted .... 
#### featsRs_without_target_view = torch.mean(featRs[:, 1:], dim=1) if self.config.do_generate_data or (self.config.do_validation and feed['set_num'] == 1): featRs_with_target_view = torch.mean(featRs, dim=1) if self.config.do_occ and self.config.occ_do_cheap: occRs_sup, freeRs_sup, freeXs = utils.vox.prep_occs_supervision( xyz_camXs, occRs_half, occXs_half, camRs_T_camXs, self.halfmem_coord_Rs, self.halfmem_coord_cams, agg=True) if feed['set_num'] != 1: summ_writer.summ_occ('occ_sup/occ_sup', occRs_sup, reduce_axes=[2]) summ_writer.summ_occ('occ_sup/free_sup', freeRs_sup, reduce_axes=[2]) summ_writer.summ_occs('occ_sup/freeXs_sup', torch.unbind(freeXs, dim=1), reduce_axes=[2]) summ_writer.summ_occs('occ_sup/occXs_sup', torch.unbind(occXs_half, dim=1), reduce_axes=[2]) else: summ_writer.summ_occ('occ_sup_val/occ_sup', occRs_sup, reduce_axes=[2]) summ_writer.summ_occ('occ_sup_val/free_sup', freeRs_sup, reduce_axes=[2]) summ_writer.summ_occs('occ_sup_val/freeXs_sup', torch.unbind(freeXs, dim=1), reduce_axes=[2]) summ_writer.summ_occs('occ_sup_val/occXs_sup', torch.unbind(occXs_half, dim=1), reduce_axes=[2]) occ_loss, occRs_pred_ = self.occnet(featsRs_without_target_view, occRs_sup, freeRs_sup, torch.max(validRs[:, 1:], dim=1)[0], summ_writer, set_num=feed['set_num']) occRs_pred = __u(occRs_pred_) total_loss += occ_loss if self.config.do_view: assert self.config.do_feat # we warped the features into canonical view which is featR # now we resample to target view which is view (0) and decode # be sure not to pass in the features of the view to decode # use featRs_without_target_view as the features in the canonical view PH, PW = self.config.PH, self.config.PW sy = float(PH) / float(self.config.H) sx = float(PW) / float(self.config.W) assert (sx == 0.5) assert (sy == 0.5) # projpix_T_cams, are the intrinsics for the projection, just scale the true intrinsics pix_T_cams = feed["pix_T_cams"] projpix_T_cams = __u( utils.geom.scale_intrinsics(__p(pix_T_cams), sx, sy)) # now I will project the predicted feats to target view (warp) feat_projtarget_view = utils.vox.apply_pixX_T_memR_to_voxR( projpix_T_cams[:, 0], camXs_T_camRs[:, 0], self.halfmem_coord_Rs, featsRs_without_target_view, self.config.view_depth, PH, PW) rgb_X0 = utils.basic.downsample( rgb_camXs[:, 0], 2) ## NOTE: this is the ground truth view_loss, rgb_e, emb2D_e = self.viewnet(feat_projtarget_view, rgb_X0, summ_writer, set_num=feed['set_num']) total_loss += view_loss if self.config.do_det: emb3D_e_R = featsRs_without_target_view emb3D_g_R = featRs[:, 0, :, :, :, :] # 0 serves as the target view, which want to predict from the context self.axboxlist_memR = gt_boxesRMem_theta.clone() self.scorelist = scores # TODO: some parts of this network are confusing to me, understand it thoroughly detect_loss, boxlist_memR_e, scorelist_e, tidlist_e, sco, ove = self.detnet( self.axboxlist_memR, self.scorelist, emb3D_e_R, summ_writer) total_loss += detect_loss # unprojecting again from memory to ref cam coordinates boxlist_camR_e = utils.vox.convert_boxlist_memR_to_camR( boxlist_memR_e, self.halfmem_coord_Rs) boxlist_camR_g = utils.vox.convert_boxlist_memR_to_camR( self.axboxlist_memR, self.halfmem_coord_Rs) # from gt_corners see that boxlist_camR_g here is fine corners_max_g = torch.max(gt_boxesR_corners_, axis=1)[0] corners_min_g = torch.min(gt_boxesR_corners_, axis=1)[0] comp_center = corners_min_g + (corners_max_g - corners_min_g) / 2.0 comp_lengths = corners_max_g - corners_min_g from_above_camR = boxlist_camR_g.clone() assert np.allclose( 
from_above_camR[:, :, :3].squeeze(1).cpu().numpy(), comp_center.cpu().numpy(), atol=1e-5) assert np.allclose(from_above_camR[:, :, 3:6].squeeze(1).cpu().numpy(), comp_lengths.cpu().numpy(), atol=1e-5) summ_writer.summ_box_mem_on_mem( 'detnet/gt_boxesR_mem', unp_visRs, self.axboxlist_memR, self.scorelist, torch.ones([self.config.B, 1], dtype=torch.int32)) try: summ_writer.summ_box_mem_on_mem( 'detnet/pred_boxesR_mem', unp_visRs, boxlist_memR_e, scorelist_e, torch.ones_like(scorelist_e, dtype=torch.int32)) except Exception as e: print('------ will handle this later -------') ### plotting of the detection boxes #B, C, P = list(gt_boxesR_corners.shape) gt_box_camR = gt_boxesR_corners #.reshape(B, 1, C, P) origin_T_camRef = feed['origin_T_camRefs'][:, 0] camRef_T_origin = torch.inverse(origin_T_camRef) gt_box_camXs = utils.geom.apply_4x4_to_corners( camRef_T_origin, gt_box_camR) gt_ref_img = summ_writer.summ_box_by_corners( 'detnet/gt_box', feed['rgb_camRs'].squeeze(1), gt_box_camXs, self.scorelist, torch.ones([self.config.B, 1], dtype=torch.int32), feed['pix_T_cams'][:, 0, :, :], only_return=True) # plot gt on each input view gt_imgs = list() for plt_i in range(self.config.S): view = utils.geom.apply_4x4_to_corners( camXs_T_camRs[:, plt_i, :, :], gt_box_camR) gt_imgs.append( summ_writer.summ_box_by_corners( f'detnet/gt_view_{plt_i}', feed['rgb_camXs'][:, plt_i, :, :, :], view, self.scorelist, torch.ones([self.config.B, 1], dtype=torch.int32), feed['pix_T_cams'][:, plt_i, :, :], only_return=True)) gt_imgs.append(gt_ref_img) # pred_box_corners_adam = utils.geom.transform_boxes_to_corners(boxlist_camR_e) pred_box_corners_mem = utils.geom.transform_boxes_to_corners( boxlist_memR_e) pred_box_corners_adam = utils.geom.apply_4x4_to_corners( utils.coordTcoord.get_ref_T_mem(B, self.halfmem_coord_Rs), pred_box_corners_mem) pred_box_ref = utils.geom.apply_4x4_to_corners( camRef_T_origin, pred_box_corners_adam) pred_ref_img = summ_writer.summ_box_by_corners( 'detnet/pred_boxes', feed['rgb_camRs'].squeeze(1), pred_box_ref, scorelist_e.detach(), torch.ones_like(scorelist_e, dtype=torch.int32), feed['pix_T_cams'][:, 0, :, :], only_return=True) # plot pred on each input view pred_ims_list = list() for plt_i in range(self.config.S): view = utils.geom.apply_4x4_to_corners( camXs_T_camRs[:, plt_i, :, :], pred_box_corners_adam) pred_ims_list.append( summ_writer.summ_box_by_corners( f'detnet/pred_view_{plt_i}', feed['rgb_camXs'][:, plt_i, :, :, :], view, scorelist_e.detach(), torch.ones_like(scorelist_e, dtype=torch.int32), feed['pix_T_cams'][:, plt_i, :, :], only_return=True)) pred_ims_list.append(pred_ref_img) gt_imgs = torch.cat(gt_imgs, dim=0) pred_ims_list = torch.cat(pred_ims_list, dim=0) gt_grid = make_grid(gt_imgs, nrow=1) pred_img_grid = make_grid(pred_ims_list, nrow=1) summ_writer.summ_rgb('detnet/gt_grid', gt_grid.unsqueeze(0)) summ_writer.summ_rgb('detnet/pred_img_grid', pred_img_grid.unsqueeze(0)) ### plotting ends # overlap = [max(0, min(e0[i], e1[i]) - max(s0[i], s1[i])) for i in range(3)] # intersection = reduce(lambda x,y:x*y, overlap) # union = pow(box0[3], 3) + pow(box1[3], 3) - intersection # print(f'iou is : {intersection/union}') scorelist_g = self.scorelist[0:1].detach().cpu().numpy() boxlist_e = boxlist_camR_e[0:1].detach().cpu().numpy() boxlist_g = boxlist_camR_g[0:1].detach().cpu().numpy() scorelist_e = scorelist_e[0:1].detach().cpu().numpy() boxlist_e, boxlist_g, scorelist_e, scorelist_g = utils.evaluate.drop_invalid_boxes( boxlist_e, boxlist_g, scorelist_e, scorelist_g) ious = [0.3, 
                    0.4, 0.5, 0.6, 0.7]
            maps, precisions_avg, scores_pred_val, ious_found = utils.evaluate.get_mAP(
                boxlist_e, scorelist_e, boxlist_g, ious)
            results['maps'] = maps
            results['ious'] = ious
            for ind, overlap in enumerate(ious):
                summ_writer.summ_scalar('ap/%.2f_iou' % overlap, maps[ind])
                summ_writer.summ_scalar('precision/%.2f_iou' % overlap, precisions_avg[ind])

        # do all of this only when data generation or validation is active;
        # validation is only run every fixed number of steps
        if self.config.do_generate_data or (self.config.do_validation and feed['set_num'] == 1):
            # means I am executing the validation part of the code
            # Here I have the box in reference, and memory is also in ref_frame
            # I should enter here while generating data
            bbox_in_ref_cam = feed['bbox_in_ref_cam']
            # based on the batch size this would be B, N, 3
            min_bounds = bbox_in_ref_cam[:, :, 0]
            max_bounds = bbox_in_ref_cam[:, :, -1]

            lengths = torch.abs(max_bounds - min_bounds)
            center = (max_bounds + min_bounds) * 0.5

            # now form the box and then convert to lrt list
            B, N = self.config.B, 1  # since I have only one box
            # 9 is cx, cy, cz, lx, ly, lz, rx, ry, rz
            boxlist = torch.zeros(B, N, 9)
            # NOTE: I am assuming here that N = 1 !!!!!!
            boxlist[:, :, :3] = center    # .unsqueeze(1)
            boxlist[:, :, 3:6] = lengths  # .unsqueeze(1)

            # convert it to lrt list; it contains the box lengths and the rt to go
            # from box coordinates to the ref coordinate system.
            box_lrtlist = utils.geom.convert_boxlist_to_lrtlist(boxlist)

            # now this is already in the ref coordinate system, which was not
            # the case with my previous use of the crop_zoom_from_mem func.
            # Hence I had previously included camR_T_camXs, which is not required here.
            presumably_object_tensor = utils.vox.crop_zoom_from_mem(
                featRs_with_target_view, self.feat_mem_coord_Rs,
                box_lrtlist[:, 0, :], 32, 32, 32)

            # NOTE: As of now I am not doing backprop through this tensor, so
            # there is no need to keep it on the gpu anymore
            results['object_tensor'] = presumably_object_tensor.detach().cpu()
            results['record_name'] = feed['record']

            # Add a plot of this to tensorboard, and think about how to verify
            # that the correct thing is being returned.
            summ_writer.summ_feats('crop_feats_val/object_tensor',
                                   tuple([presumably_object_tensor]), pca=True)

            # if hyp.do_metric_learning:
            #     B, _, _, _, _ = presumably_object_tensor.shape
            #     assert B >= 2, "Metric learner requires one positive and at least one negative example to train"
            #     metric_loss, _ = self.metric_learner(presumably_object_tensor, feed["object_id"])
            #     total_loss += metric_loss
            #     summ_writer.summ_scalar('metric_learn/metric_loss', metric_loss.cpu().item())

        summ_writer.summ_scalar('loss', total_loss.cpu().item())
        return total_loss, results
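# --- Illustrative sketch (not from this repository) ---
# The detection evaluation above calls utils.evaluate.get_mAP over the IoU thresholds
# [0.3, 0.4, 0.5, 0.6, 0.7]. For reference, this is a minimal sketch of the axis-aligned
# 3D IoU such an evaluation typically relies on, assuming boxes are given as the first
# six entries (cx, cy, cz, lx, ly, lz) of the 9-dof boxlist format used in this file,
# with rotation ignored. The function name is illustrative only.
import numpy as np


def axis_aligned_iou_3d(box_a, box_b):
    """IoU of two axis-aligned 3D boxes given as (cx, cy, cz, lx, ly, lz)."""
    a_min = np.array(box_a[:3]) - np.array(box_a[3:6]) / 2.0
    a_max = np.array(box_a[:3]) + np.array(box_a[3:6]) / 2.0
    b_min = np.array(box_b[:3]) - np.array(box_b[3:6]) / 2.0
    b_max = np.array(box_b[:3]) + np.array(box_b[3:6]) / 2.0
    # per-axis overlap, clamped at zero when the boxes do not intersect
    overlap = np.clip(np.minimum(a_max, b_max) - np.maximum(a_min, b_min), 0.0, None)
    intersection = overlap.prod()
    union = np.prod(a_max - a_min) + np.prod(b_max - b_min) - intersection
    return float(intersection / union) if union > 0 else 0.0

# Example: a unit box shifted by 0.1 along x against a unit ground-truth box
# gives IoU = 0.9 / 1.1 ~= 0.818, which would count as a detection at the 0.5
# and 0.7 thresholds above:
#   axis_aligned_iou_3d([0.1, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1])
# --- end sketch ---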