def __getitem__(self, idx):

        # BGR image
        filename = self.image_files[idx]
        im = cv2.imread(filename)
        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = add_noise(im)
        im_tensor = torch.from_numpy(im) / 255.0

        im_tensor_bgr = im_tensor.clone()
        im_tensor_bgr = im_tensor_bgr.permute(2, 0, 1)

        im_tensor -= self._pixel_mean
        image_blob = im_tensor.permute(2, 0, 1)

        # Label
        labels_filename = filename.replace('image_color', 'annotation')
        foreground_labels = util_.imread_indexed(labels_filename)
        foreground_labels = self.process_label(foreground_labels)
        label_blob = torch.from_numpy(foreground_labels).unsqueeze(0)

        index = filename.find('OSD')
        sample = {
            'image_color': image_blob,
            'image_color_bgr': im_tensor_bgr,
            'label': label_blob,
            'filename': filename[index + 4:]
        }

        # Depth image
        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            pcd_filename = filename.replace('image_color', 'pcd')
            pcd_filename = pcd_filename.replace('png', 'pcd')
            pcloud = pcl.load(pcd_filename).to_array()
            pcloud[np.isnan(pcloud)] = 0
            xyz_img = pcloud.reshape((self._height, self._width, 3))
            depth_blob = torch.from_numpy(xyz_img).permute(2, 0, 1)
            sample['depth'] = depth_blob

        # # Depth image (alternative: pcl replaced with open3d)
        # if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
        #     pcd_filename = filename.replace('image_color', 'pcd')
        #     pcd_filename = pcd_filename.replace('png', 'pcd')

        #     # keep NaN points so the organized H x W layout survives
        #     pcloud = o3d.io.read_point_cloud(pcd_filename,
        #                                      remove_nan_points=False)
        #     pcloud = np.asarray(pcloud.points, dtype=np.float32)
        #     pcloud[np.isnan(pcloud)] = 0
        #     xyz_img = pcloud.reshape((self._height, self._width, 3))
        #     depth_blob = torch.from_numpy(xyz_img).permute(2, 0, 1)
        #     sample['depth'] = depth_blob

        return sample
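A quick sanity check on the sample above is to undo the normalization: image_color is BGR in [0, 1] with the per-channel mean subtracted, so adding the mean back and rescaling recovers a displayable image. A minimal sketch, assuming pixel_mean is the same BGR mean tensor as the dataset's self._pixel_mean:

# Hedged sketch, not part of the dataset class: invert the normalization
# applied to 'image_color' above.
def denormalize(image_color, pixel_mean):
    im = image_color.permute(1, 2, 0)       # CHW -> HWC
    im = (im + pixel_mean) * 255.0          # add the mean back, rescale
    return im.clamp(0, 255).byte().numpy()  # uint8 BGR image for cv2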
Example #2
    def __getitem__(self, idx):

        # BGR image
        filename = str(self.image_paths[idx])
        im = cv2.imread(filename)
        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = add_noise(im)
        im_tensor = torch.from_numpy(im) / 255.0

        im_tensor_bgr = im_tensor.clone()
        im_tensor_bgr = im_tensor_bgr.permute(2, 0, 1)

        im_tensor -= self._pixel_mean
        image_blob = im_tensor.permute(2, 0, 1)

        # Label
        labels_filename = filename.replace('rgb', 'label')
        foreground_labels = util_.imread_indexed(labels_filename)
        # mask table as background
        foreground_labels[foreground_labels == 1] = 0
        if 'table' in labels_filename:
            foreground_labels[foreground_labels == 2] = 0
        foreground_labels = self.process_label(foreground_labels)
        label_blob = torch.from_numpy(foreground_labels).unsqueeze(0)

        index = filename.find('OCID')
        sample = {
            'image_color': image_blob,
            'image_color_bgr': im_tensor_bgr,
            'label': label_blob,
            'filename': filename[index + 5:]
        }

        # Depth image
        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            pcd_filename = filename.replace('rgb', 'pcd')
            pcd_filename = pcd_filename.replace('png', 'pcd')
            pcloud = pcl.load(pcd_filename).to_array()
            pcloud[np.isnan(pcloud)] = 0
            xyz_img = pcloud.reshape((self._height, self._width, 3))
            depth_blob = torch.from_numpy(xyz_img).permute(2, 0, 1)
            sample['depth'] = depth_blob

        return sample
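Either of the datasets above is typically consumed through a torch DataLoader. A minimal usage sketch, assuming dataset is an instance of one of these classes with cfg already configured:

# Hedged usage sketch; the constructor arguments are assumptions.
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=4)
for sample in loader:
    image = sample['image_color']  # (B, 3, H, W), mean-subtracted BGR
    label = sample['label']        # (B, 1, H, W), instance labels
    if 'depth' in sample:
        xyz = sample['depth']      # (B, 3, H, W), organized XYZ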
Example #3
    def _get_image_blob(self, color_file, depth_file, scale_ind):

        # rgba
        rgba = pad_im(cv2.imread(color_file, cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        if im_scale != 1.0:
            im = cv2.resize(im,
                            None,
                            None,
                            fx=im_scale,
                            fy=im_scale,
                            interpolation=cv2.INTER_LINEAR)
        height = im.shape[0]
        width = im.shape[1]

        # chromatic transform
        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = add_noise(im)
        im_tensor = torch.from_numpy(im) / 255.0
        im_tensor -= self._pixel_mean
        image_blob = im_tensor.permute(2, 0, 1).float()

        # depth image
        im_depth = pad_im(cv2.imread(depth_file, cv2.IMREAD_UNCHANGED), 16)
        if im_scale != 1.0:
            im_depth = cv2.resize(im_depth,
                                  None,
                                  None,
                                  fx=im_scale,
                                  fy=im_scale,
                                  interpolation=cv2.INTER_NEAREST)
        im_depth = im_depth.astype('float') / 1000.0

        return image_blob, im_depth, im_scale, height, width
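The CHROMATIC/ADD_NOISE gating used here recurs throughout these examples: each augmentation fires with probability 0.9, and only in TRAIN mode. A stand-alone helper capturing that pattern (a sketch; the cfg names mirror the flags above):

import numpy as np

# Hedged helper: apply transform with probability 1 - p_skip when the
# corresponding cfg flag is enabled and we are in training mode.
def maybe_augment(im, transform, enabled, p_skip=0.1):
    if enabled and cfg.MODE == 'TRAIN' and np.random.rand() > p_skip:
        return transform(im)
    return im

# e.g. im = maybe_augment(im, chromatic_transform, cfg.TRAIN.CHROMATIC)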
Example #4
def _get_image_blob(roidb, scale_ind, num_classes, backgrounds,
                    intrinsic_matrix, db_inds_syn, is_syn):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    im_scales = []
    roidb_syn = []

    for i in range(num_images):

        if is_syn:
            # depth raw
            filename = cfg.TRAIN.SYNROOT + '{:06d}-depth.png'.format(
                db_inds_syn[i])
            im_depth_raw = pad_im(cv2.imread(filename, cv2.IMREAD_UNCHANGED),
                                  16)

            # rgba
            filename = cfg.TRAIN.SYNROOT + '{:06d}-color.png'.format(
                db_inds_syn[i])
            rgba = pad_im(cv2.imread(filename, cv2.IMREAD_UNCHANGED), 16)

            # sample a background image
            ind = np.random.randint(len(backgrounds), size=1)[0]
            filename = backgrounds[ind]
            background = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
            try:
                background = cv2.resize(background,
                                        (rgba.shape[1], rgba.shape[0]),
                                        interpolation=cv2.INTER_LINEAR)
            except Exception:
                if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'NORMAL':
                    background = np.zeros((rgba.shape[0], rgba.shape[1]),
                                          dtype=np.uint16)
                else:
                    background = np.zeros((rgba.shape[0], rgba.shape[1], 3),
                                          dtype=np.uint8)
                print('bad background image')

            if cfg.INPUT != 'DEPTH' and cfg.INPUT != 'NORMAL' and len(
                    background.shape) != 3:
                background = np.zeros((rgba.shape[0], rgba.shape[1], 3),
                                      dtype=np.uint8)
                print('bad background image')

            # add background
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'NORMAL':
                im_depth_raw[I[0], I[1]] = background[I[0], I[1]] / 10
            else:
                im[I[0], I[1], :] = background[I[0], I[1], :3]
        else:
            # depth raw
            im_depth_raw = pad_im(
                cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)

            # rgba
            rgba = pad_im(cv2.imread(roidb[i]['image'], cv2.IMREAD_UNCHANGED),
                          16)
            if rgba.shape[2] == 4:
                im = np.copy(rgba[:, :, :3])
                alpha = rgba[:, :, 3]
                I = np.where(alpha == 0)
                im[I[0], I[1], :] = 0
            else:
                im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            im = chromatic_transform(im)

        if cfg.TRAIN.ADD_NOISE:
            im = add_noise(im)

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        im = cv2.resize(im_orig,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(
            im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:, :, np.newaxis], (1, 1, 3))

        if cfg.TRAIN.ADD_NOISE:
            im_depth = add_noise(im_depth)

        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig,
                              None,
                              None,
                              fx=im_scale,
                              fy=im_scale,
                              interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals
        if cfg.INPUT == 'NORMAL':
            depth = im_depth_raw.astype(np.float32, copy=True) / 1000.0
            fx = intrinsic_matrix[0, 0] * im_scale
            fy = intrinsic_matrix[1, 1] * im_scale
            cx = intrinsic_matrix[0, 2] * im_scale
            cy = intrinsic_matrix[1, 2] * im_scale
            nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0,
                                           cfg.GPU_ID)
            im_normal = 127.5 * nmap + 127.5
            im_normal = im_normal.astype(np.uint8)
            im_normal = im_normal[:, :, (2, 1, 0)]
            im_normal = cv2.bilateralFilter(im_normal, 9, 75, 75)
            if roidb[i]['flipped']:
                im_normal = im_normal[:, ::-1, :]

            im_orig = im_normal.astype(np.float32, copy=True)
            im_orig -= cfg.PIXEL_MEANS
            im_normal = cv2.resize(im_orig,
                                   None,
                                   None,
                                   fx=im_scale,
                                   fy=im_scale,
                                   interpolation=cv2.INTER_LINEAR)
            processed_ims_normal.append(im_normal)
            blob_normal = im_list_to_blob(processed_ims_normal, 3)
        else:
            blob_normal = []

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)

    return blob, blob_depth, blob_normal, im_scales
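The im_list_to_blob calls above pad a list of images to a common size and stack them into one array, in the style of the Fast R-CNN utility. For reference, a minimal version consistent with the call sites here (a sketch; the project's own helper may differ in layout):

import numpy as np

# Hedged reference sketch: pad each H x W x C image to the largest H and W
# in the list and stack into an (N, maxH, maxW, C) float32 blob.
def im_list_to_blob(ims, num_channels):
    max_shape = np.array([im.shape[:2] for im in ims]).max(axis=0)
    blob = np.zeros((len(ims), max_shape[0], max_shape[1], num_channels),
                    dtype=np.float32)
    for i, im in enumerate(ims):
        blob[i, :im.shape[0], :im.shape[1], :] = im
    return blob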
Example #5
def _get_image_blob(roidb, scale_ind):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    im_scales = []
    if cfg.TRAIN.GAN:
        processed_ims_rescale = []

    for i in range(num_images):
        # meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        K = meta_data['intrinsic_matrix'].astype(np.float32, copy=True)
        fx = K[0, 0]
        fy = K[1, 1]
        cx = K[0, 2]
        cy = K[1, 2]

        # depth raw
        im_depth_raw = pad_im(
            cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)
        height = im_depth_raw.shape[0]
        width = im_depth_raw.shape[1]

        # rgba
        rgba = pad_im(cv2.imread(roidb[i]['image'], cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            label = pad_im(cv2.imread(roidb[i]['label'], cv2.IMREAD_UNCHANGED),
                           16)
            im = chromatic_transform(im, label)

        # mask the color image according to depth
        if cfg.EXP_DIR == 'rgbd_scene':
            I = np.where(im_depth_raw == 0)
            im[I[0], I[1], :] = 0

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]

        if cfg.TRAIN.GAN:
            im_orig = im.astype(np.float32, copy=True) / 127.5 - 1
            im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
            im_rescale = cv2.resize(im_orig,
                                    None,
                                    None,
                                    fx=im_scale,
                                    fy=im_scale,
                                    interpolation=cv2.INTER_LINEAR)
            processed_ims_rescale.append(im_rescale)

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        im = cv2.resize(im_orig,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(
            im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:, :, np.newaxis], (1, 1, 3))

        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig,
                              None,
                              None,
                              fx=im_scale,
                              fy=im_scale,
                              interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals
        depth = im_depth_raw.astype(np.float32, copy=True) / float(
            meta_data['factor_depth'])
        nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0, cfg.GPU_ID)
        im_normal = 127.5 * nmap + 127.5
        im_normal = im_normal.astype(np.uint8)
        im_normal = im_normal[:, :, (2, 1, 0)]
        if roidb[i]['flipped']:
            im_normal = im_normal[:, ::-1, :]

        im_orig = im_normal.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_normal = cv2.resize(im_orig,
                               None,
                               None,
                               fx=im_scale,
                               fy=im_scale,
                               interpolation=cv2.INTER_LINEAR)
        processed_ims_normal.append(im_normal)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)
    blob_normal = im_list_to_blob(processed_ims_normal, 3)
    if cfg.TRAIN.GAN:
        blob_rescale = im_list_to_blob(processed_ims_rescale, 3)
    else:
        blob_rescale = []

    return blob, blob_rescale, blob_depth, blob_normal, im_scales
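Several of these loaders turn a metric depth map plus camera intrinsics into per-pixel 3D coordinates (the input to gpu_normals above, and the backproject call in a later example). A vectorized pinhole backprojection, as a sketch:

import numpy as np

# Hedged sketch: backproject a metric depth map (H, W) to an organized
# XYZ image (H, W, 3) with pinhole intrinsics fx, fy, cx, cy.
def backproject(depth, fx, fy, cx, cy):
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    x = (u - cx) * depth / fx
    y = (v - cy) * depth / fy
    return np.stack((x, y, depth), axis=2).astype(np.float32)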
Example #6
    def _render_item(self):

        height = cfg.TRAIN.SYN_HEIGHT
        width = cfg.TRAIN.SYN_WIDTH
        fx = self._intrinsic_matrix[0, 0]
        fy = self._intrinsic_matrix[1, 1]
        px = self._intrinsic_matrix[0, 2]
        py = self._intrinsic_matrix[1, 2]
        zfar = 6.0
        znear = 0.25
        bound = 0.1
        qt = np.zeros((7, ), dtype=np.float32)
        image_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
        seg_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            pc_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
        else:
            pc_tensor = None
        cfg.renderer.set_projection_matrix(width, height, fx, fy, px, py,
                                           znear, zfar)
        classes = np.array(cfg.TRAIN.CLASSES)

        # sample target object
        cls_indexes = []
        cls_target = np.random.randint(len(cfg.TRAIN.CLASSES), size=1)[0]
        cls_indexes.append(cfg.TRAIN.CLASSES[cls_target])

        # sample target pose
        poses_all = []
        cls = int(cls_indexes[0])
        if self.pose_indexes[cls] >= len(self.pose_lists[cls]):
            self.pose_indexes[cls] = 0
            self.pose_lists[cls] = np.random.permutation(
                np.arange(len(self.eulers)))
        roll = self.eulers[self.pose_lists[cls][
            self.pose_indexes[cls]]][0] + 15 * np.random.randn()
        pitch = self.eulers[self.pose_lists[cls][
            self.pose_indexes[cls]]][1] + 15 * np.random.randn()
        yaw = self.eulers[self.pose_lists[cls][
            self.pose_indexes[cls]]][2] + 15 * np.random.randn()
        qt[3:] = euler2quat(roll * math.pi / 180.0, pitch * math.pi / 180.0,
                            yaw * math.pi / 180.0)
        self.pose_indexes[cls] += 1

        qt[0] = np.random.uniform(-bound, bound)
        qt[1] = np.random.uniform(-bound, bound)
        qt[2] = np.random.uniform(cfg.TRAIN.SYN_TNEAR, cfg.TRAIN.SYN_TFAR)

        # render target
        poses_all.append(qt.copy())
        cfg.renderer.set_poses(poses_all)
        cfg.renderer.set_light_pos(np.random.uniform(-0.5, 0.5, 3))
        intensity = np.random.uniform(0.8, 2)
        light_color = intensity * np.random.uniform(0.9, 1.1, 3)
        cfg.renderer.set_light_color(light_color)

        cfg.renderer.render(cls_indexes, image_tensor, seg_tensor)
        image_tensor = image_tensor.flip(0)
        seg_tensor = seg_tensor.flip(0)

        seg = torch.sum(seg_tensor[:, :, :3], dim=2)
        mask = (seg != 0).cpu().numpy()

        # sample an occluder
        cls_indexes.append(0)
        poses_all.append(np.zeros((7, ), dtype=np.float32))
        while 1:

            while 1:
                cls_occ = np.random.randint(len(self._classes_all), size=1)[0]
                if cls_occ != cls_indexes[0]:
                    cls_indexes[1] = cls_occ
                    break

            # sample poses
            cls = int(cls_indexes[1])
            if self.pose_indexes[cls] >= len(self.pose_lists[cls]):
                self.pose_indexes[cls] = 0
                self.pose_lists[cls] = np.random.permutation(
                    np.arange(len(self.eulers)))
            roll = self.eulers[self.pose_lists[cls][
                self.pose_indexes[cls]]][0] + 15 * np.random.randn()
            pitch = self.eulers[self.pose_lists[cls][
                self.pose_indexes[cls]]][1] + 15 * np.random.randn()
            yaw = self.eulers[self.pose_lists[cls][
                self.pose_indexes[cls]]][2] + 15 * np.random.randn()
            qt[3:] = euler2quat(roll * math.pi / 180.0,
                                pitch * math.pi / 180.0, yaw * math.pi / 180.0)
            self.pose_indexes[cls] += 1

            # translation, sample an object nearby
            object_id = 0
            extent = np.mean(self._extents_all[cls, :])

            flag = np.random.randint(0, 2)
            if flag == 0:
                flag = -1
            qt[0] = poses_all[object_id][
                0] + flag * extent * np.random.uniform(0.3, 0.5)
            if np.absolute(qt[0]) > bound:
                qt[0] = poses_all[object_id][
                    0] - flag * extent * np.random.uniform(0.3, 0.5)

            flag = np.random.randint(0, 2)
            if flag == 0:
                flag = -1
            qt[1] = poses_all[object_id][
                1] + flag * extent * np.random.uniform(0.3, 0.5)
            if np.absolute(qt[1]) > bound:
                qt[1] = poses_all[object_id][
                    1] - flag * extent * np.random.uniform(0.3, 0.5)

            qt[2] = poses_all[object_id][2] - extent * np.random.uniform(
                1.0, 2.0)
            if qt[2] < cfg.TRAIN.SYN_TNEAR:
                qt[2] = poses_all[object_id][2] + extent * np.random.uniform(
                    1.0, 2.0)

            poses_all[1] = qt
            cfg.renderer.set_poses(poses_all)

            # rendering
            cfg.renderer.set_light_pos(np.random.uniform(-0.5, 0.5, 3))
            intensity = np.random.uniform(0.8, 2)
            light_color = intensity * np.random.uniform(0.9, 1.1, 3)
            cfg.renderer.set_light_color(light_color)
            cfg.renderer.render(cls_indexes,
                                image_tensor,
                                seg_tensor,
                                pc2_tensor=pc_tensor)

            seg_tensor = seg_tensor.flip(0)
            if pc_tensor is not None:
                pc_tensor = pc_tensor.flip(0)
            im_label = seg_tensor.cpu().numpy()
            im_label = im_label[:, :, (2, 1, 0)] * 255
            im_label = np.round(im_label).astype(np.uint8)
            im_label = np.clip(im_label, 0, 255)
            im_label_only, im_label = self.process_label_image(im_label)

            # compute occlusion percentage
            mask_target = (im_label == cls_indexes[0] + 1).astype(np.int32)

            per_occ = 1.0 - np.sum(mask & mask_target) / np.sum(mask)
            if per_occ < 0.5:
                break

        # RGB to BGR order
        image_tensor = image_tensor.flip(0)
        im = image_tensor.cpu().numpy()
        im = np.clip(im, 0, 1)
        im = im[:, :, (2, 1, 0)] * 255
        im = im.astype(np.uint8)

        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            # XYZ coordinates in camera frame
            im_depth = pc_tensor.cpu().numpy()
            im_depth = im_depth[:, :, :3]

        label_blob = np.zeros((self.num_classes, height, width),
                              dtype=np.float32)
        for i in range(self.num_classes):
            I = np.where(im_label == classes[i] + 1)
            if len(I[0]) > 0:
                label_blob[i, I[0], I[1]] = 1.0

        # foreground mask
        seg = seg_tensor[:, :, 2] + 256 * seg_tensor[:, :, 1] + 256 * 256 * seg_tensor[:, :, 0]
        mask = (seg != 0).unsqueeze(2).repeat((1, 1, 3)).float().cpu()
        '''
        import matplotlib.pyplot as plt
        fig = plt.figure()
        ax = fig.add_subplot(3, 2, 1)
        plt.imshow(im[:, :, (2, 1, 0)])
        ax = fig.add_subplot(3, 2, 2)
        plt.imshow(im_label)
        print(per_occ)
        ax = fig.add_subplot(3, 2, 3)
        plt.imshow(im_depth[:, :, 0])
        ax = fig.add_subplot(3, 2, 4)
        plt.imshow(im_depth[:, :, 1])
        ax = fig.add_subplot(3, 2, 5)
        plt.imshow(im_depth[:, :, 2])
        plt.show()
        '''

        # chromatic transform
        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = add_noise(im)
        im_tensor = torch.from_numpy(im) / 255.0
        im_tensor -= self._pixel_mean

        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            im_depth_tensor = torch.from_numpy(im_depth).float()
            if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                    1) > 0.1:
                im_depth_tensor = add_noise_depth(im_depth_tensor).float()
        else:
            im_depth_tensor = im_tensor.clone()

        # poses and boxes only for the target object
        pose_blob = np.zeros((1, 9), dtype=np.float32)
        gt_boxes = np.zeros((1, 5), dtype=np.float32)

        pose_blob[0, 0] = 1
        pose_blob[0, 1] = cls_target
        pose_blob[0, 2:6] = poses_all[0][3:]
        pose_blob[0, 6:] = poses_all[0][:3]

        # compute box
        x3d = np.ones((4, self._points_all.shape[1]), dtype=np.float32)
        x3d[0, :] = self._points_all[cls_target, :, 0]
        x3d[1, :] = self._points_all[cls_target, :, 1]
        x3d[2, :] = self._points_all[cls_target, :, 2]
        RT = np.zeros((3, 4), dtype=np.float32)
        RT[:3, :3] = quat2mat(pose_blob[0, 2:6])
        RT[:, 3] = pose_blob[0, 6:]
        x2d = np.matmul(self._intrinsic_matrix, np.matmul(RT, x3d))
        x2d[0, :] = np.divide(x2d[0, :], x2d[2, :])
        x2d[1, :] = np.divide(x2d[1, :], x2d[2, :])

        gt_boxes[0, 0] = np.min(x2d[0, :])
        gt_boxes[0, 1] = np.min(x2d[1, :])
        gt_boxes[0, 2] = np.max(x2d[0, :])
        gt_boxes[0, 3] = np.max(x2d[1, :])
        gt_boxes[0, 4] = cls_target

        # construct the meta data
        K = self._intrinsic_matrix
        Kinv = np.linalg.pinv(K)
        meta_data_blob = np.zeros(18, dtype=np.float32)
        meta_data_blob[0:9] = K.flatten()
        meta_data_blob[9:18] = Kinv.flatten()

        is_syn = 1
        im_info = np.array(
            [im.shape[0], im.shape[1], cfg.TRAIN.SCALES_BASE[0], is_syn],
            dtype=np.float32)
        pose_result = pose_blob.copy()

        # im is pytorch tensor in gpu
        sample = {
            'image_color': im_tensor,
            'image_depth': im_depth_tensor,
            'meta_data': meta_data_blob,
            'label_blob': label_blob,
            'mask': mask,
            'poses': pose_blob,
            'extents': self._extents,
            'points': self._point_blob,
            'gt_boxes': gt_boxes,
            'poses_result': pose_result,
            'im_info': im_info
        }

        return sample
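The gt_boxes computation above is a generic project-and-bound step: transform the model points by [R|t], project with the intrinsics, and take the 2D extremes. Factored into a helper for clarity (a hedged sketch, not the repo's API):

import numpy as np

# points: (N, 3) model points; K: 3x3 intrinsics; returns [x1, y1, x2, y2].
def project_box(K, R, t, points):
    x = K @ (R @ points.T + t.reshape(3, 1))  # (3, N) projected homogeneous
    x = x[:2] / x[2]                          # perspective divide
    return np.array([x[0].min(), x[1].min(), x[0].max(), x[1].max()],
                    dtype=np.float32)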
Example #7
    def __getitem__(self, idx):

        # Get scene directory; the crop does not use background
        scene_idx = idx // self.NUM_VIEWS_PER_SCENE
        scene_dir = self.scene_dirs[scene_idx]

        # Get view number
        view_num = idx % self.NUM_VIEWS_PER_SCENE
        if cfg.TRAIN.SYN_CROP:
            view_num += 2

        # Label
        foreground_labels_filename = os.path.join(
            scene_dir, 'segmentation_%05d.png' % view_num)
        foreground_labels = util_.imread_indexed(foreground_labels_filename)
        # mask table as background
        foreground_labels[foreground_labels == 1] = 0
        foreground_labels = self.process_label(foreground_labels)

        # BGR image
        filename = os.path.join(scene_dir, 'rgb_%05d.jpeg' % view_num)
        im = cv2.imread(filename)

        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            # Depth image
            depth_img_filename = os.path.join(scene_dir,
                                              'depth_%05d.png' % view_num)
            depth_img = cv2.imread(
                depth_img_filename, cv2.IMREAD_ANYDEPTH
            )  # This reads a 16-bit single-channel image. Shape: [H x W]
            xyz_img = self.process_depth(depth_img)
        else:
            xyz_img = None

        # crop
        if cfg.TRAIN.SYN_CROP:
            im, foreground_labels, xyz_img = self.pad_crop_resize(
                im, foreground_labels, xyz_img)
            foreground_labels = self.process_label(foreground_labels)

        # sample labels
        if cfg.TRAIN.EMBEDDING_SAMPLING:
            foreground_labels = self.sample_pixels(
                foreground_labels, cfg.TRAIN.EMBEDDING_SAMPLING_NUM)

        label_blob = torch.from_numpy(foreground_labels).unsqueeze(0)
        sample = {'label': label_blob}

        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = add_noise(im)
        im_tensor = torch.from_numpy(im) / 255.0
        im_tensor -= self._pixel_mean
        image_blob = im_tensor.permute(2, 0, 1)
        sample['image_color'] = image_blob

        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            depth_blob = torch.from_numpy(xyz_img).permute(2, 0, 1)
            sample['depth'] = depth_blob

        return sample
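process_depth itself is not shown, but the comment above pins down its input: a 16-bit single-channel depth PNG. A plausible minimal version, assuming millimeter units and a pinhole backprojection (the signature, the scale, and the intrinsics handling are all assumptions):

import numpy as np

# Hedged sketch of process_depth: 16-bit depth (assumed millimeters) to an
# organized XYZ image in meters via pinhole backprojection.
def process_depth(depth_img, fx, fy, cx, cy):
    depth_m = depth_img.astype(np.float32) / 1000.0
    h, w = depth_m.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    x = (u - cx) * depth_m / fx
    y = (v - cy) * depth_m / fy
    return np.stack((x, y, depth_m), axis=2).astype(np.float32)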
Example #8
    def load(self, filename_color, filename_depth, intrinsics):
        if filename_depth is None:
            background_depth = np.zeros((3, self._height, self._width),
                                        dtype=np.float32)
            mask_depth = np.zeros((self._height, self._width),
                                  dtype=np.float32)

        # only for rgb cases
        if filename_depth is None and np.random.rand(1) < cfg.TRAIN.SYN_BACKGROUND_CONSTANT_PROB:
            # constant background image
            background_color = np.ones((self._height, self._width, 3),
                                       dtype=np.uint8)
            color = np.random.randint(256, size=3)
            background_color[:, :, 0] = color[0]
            background_color[:, :, 1] = color[1]
            background_color[:, :, 2] = color[2]
        else:
            background_color = cv2.imread(filename_color, cv2.IMREAD_UNCHANGED)

            if filename_depth is not None:
                background_depth = cv2.imread(filename_depth,
                                              cv2.IMREAD_UNCHANGED)

            try:
                # randomly crop a region as background
                bw = background_color.shape[1]
                bh = background_color.shape[0]
                x1 = npr.randint(0, int(bw / 3))
                y1 = npr.randint(0, int(bh / 3))
                x2 = npr.randint(int(2 * bw / 3), bw)
                y2 = npr.randint(int(2 * bh / 3), bh)
                background_color = background_color[y1:y2, x1:x2]
                background_color = cv2.resize(background_color,
                                              (self._width, self._height),
                                              interpolation=cv2.INTER_LINEAR)
                if len(background_color.shape) != 3:
                    background_color = cv2.cvtColor(background_color,
                                                    cv2.COLOR_GRAY2RGB)

                if filename_depth is not None:
                    background_depth = background_depth[y1:y2, x1:x2]
                    background_depth = cv2.resize(
                        background_depth, (self._width, self._height),
                        interpolation=cv2.INTER_NEAREST)
                    background_depth = self.backproject(
                        background_depth, intrinsics, self.depth_factor)

            except Exception:
                background_color = np.zeros((self._height, self._width, 3),
                                            dtype=np.uint8)
                print('bad background_color image', filename_color)
                if filename_depth is not None:
                    background_depth = np.zeros((self._height, self._width, 3),
                                                dtype=np.float32)
                    print('bad depth background image')

            if len(background_color.shape) != 3:
                background_color = np.zeros((self._height, self._width, 3),
                                            dtype=np.uint8)
                print('bad background_color image', filename_color)

            if filename_depth is not None:
                if len(background_depth.shape) != 3:
                    background_depth = np.zeros((self._height, self._width, 3),
                                                dtype=np.float32)
                    print('bad depth background image')

                z_im = background_depth[:, :, 2]
                mask_depth = z_im > 0.0
                mask_depth = mask_depth.astype(np.float32)

                if np.random.rand(1) > 0.1:
                    background_depth = add_noise_depth(background_depth)

                background_depth = background_depth.transpose(2, 0, 1).astype(
                    np.float32)

            if np.random.rand(1) > 0.1:
                background_color = chromatic_transform(background_color)

        if np.random.rand(1) > 0.1:
            background_color = add_noise(background_color)

        background_color = background_color.astype(np.float32)

        if self.subtract_mean:
            background_color -= self._pixel_mean
        background_color = background_color.transpose(2, 0, 1) / 255.0

        sample = {
            'background_color': background_color,
            'background_depth': background_depth,
            'mask_depth': mask_depth
        }

        return sample
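The crop above samples the top-left corner from the first third of the image and the bottom-right corner from the last third, so every crop keeps at least a third of each dimension. The same sampling as a stand-alone helper (a sketch):

import numpy.random as npr

# Hedged helper mirroring the background crop sampling above.
def random_crop_region(h, w):
    x1 = npr.randint(0, w // 3)
    y1 = npr.randint(0, h // 3)
    x2 = npr.randint(2 * w // 3, w)
    y2 = npr.randint(2 * h // 3, h)
    return x1, y1, x2, y2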
Example #9
    def _compose_item(self):

        height = cfg.TRAIN.SYN_HEIGHT
        width = cfg.TRAIN.SYN_WIDTH
        classes_all = np.array(range(len(self._classes_all)))
        mask_depth_cuda = torch.cuda.FloatTensor(1, height, width).fill_(0)

        # sample target objects
        if cfg.TRAIN.SYN_SAMPLE_OBJECT:
            maxnum = np.minimum(self.num_classes - 1, cfg.TRAIN.SYN_MAX_OBJECT)
            num = np.random.randint(cfg.TRAIN.SYN_MIN_OBJECT, maxnum + 1)
            perm = np.random.permutation(np.arange(self.num_classes - 1))
            indexes_target = perm[:num] + 1
        else:
            num = self.num_classes - 1
            indexes_target = np.arange(num) + 1
        num_target = num
        cls_indexes = [cfg.TRAIN.CLASSES[i] - 1 for i in indexes_target]

        # sample poses
        im_color = np.zeros((height, width, 3), dtype=np.uint8)
        im_label = np.zeros((height, width), dtype=np.uint8)
        im_label_all = np.zeros((height, width), dtype=np.uint8)
        gt_boxes = np.zeros((self.num_classes, 5), dtype=np.float32)
        for i in range(num):

            # select image
            cls = int(cls_indexes[i])
            if self.pose_indexes_real[cls] >= len(self.pose_lists_real[cls]):
                self.pose_indexes_real[cls] = 0
                self.pose_lists_real[cls] = np.random.permutation(
                    np.arange(len(self.pose_lists_real[cls])))
            index_image = self.pose_lists_real[cls][
                self.pose_indexes_real[cls]]
            self.pose_indexes_real[cls] += 1

            # read image
            filename = self.pose_images[cls][index_image]
            im = cv2.imread(filename, cv2.IMREAD_UNCHANGED)

            # read mask
            filename_mask = filename[:-4] + '_mask.pbm'
            mask = cv2.imread(filename_mask, cv2.IMREAD_UNCHANGED)
            mask = np.array(mask == 0).astype(np.uint8)
            kernel = np.ones((20, 20), np.uint8)
            mask = cv2.erode(mask, kernel, iterations=1)

            while 1:
                # rescale the image
                rescale_factor = np.random.uniform(0.1, 0.3)
                affine_1 = np.eye(3, dtype=np.float32)
                affine_1[0, 0] = rescale_factor * affine_1[0, 0]
                affine_1[1, 1] = rescale_factor * affine_1[1, 1]

                # translation to center
                delta_x = np.random.uniform(0.25, 0.5)
                delta_y = np.random.uniform(0.25, 0.5)
                M_translation = np.float32([[1, 0, delta_x * width],
                                            [0, 1, delta_y * height]])
                affine_2 = np.eye(3, dtype=np.float32)
                affine_2[:2, :] = M_translation

                # rotation
                degree = np.random.uniform(-180.0, 180.0)
                M_rotation = cv2.getRotationMatrix2D((width / 2, height / 2),
                                                     degree, 1)
                affine_3 = np.eye(3, dtype=np.float32)
                affine_3[:2, :] = M_rotation

                # translation again
                delta_x = np.random.uniform(-0.4, 0.4)
                delta_y = np.random.uniform(-0.4, 0.4)
                M_translation_1 = np.float32([[1, 0, delta_x * width],
                                              [0, 1, delta_y * height]])
                affine_4 = np.eye(3, dtype=np.float32)
                affine_4[:2, :] = M_translation_1

                # all together
                affine = np.dot(affine_4,
                                np.dot(affine_3, np.dot(affine_2, affine_1)))
                im_final = cv2.warpAffine(im, affine[:2, :], (width, height))
                mask_final = cv2.warpAffine(mask, affine[:2, :],
                                            (width, height))

                index_foreground = np.where(mask_final == 1)
                if len(index_foreground[0]) > 0:
                    break

            # paste object and label
            index = np.where((mask_final == 1) & (im_label_all == 0))
            im_color[index[0], index[1], :] = im_final[index[0], index[1], :]

            cls_ind = np.where(np.array(cfg.TRAIN.CLASSES) == cls + 1)[0]
            im_label[index[0], index[1]] = cls_ind

            gt_boxes[i, 0] = np.min(index_foreground[1])
            gt_boxes[i, 1] = np.min(index_foreground[0])
            gt_boxes[i, 2] = np.max(index_foreground[1])
            gt_boxes[i, 3] = np.max(index_foreground[0])
            gt_boxes[i, 4] = cls_ind

            cls_ind = np.where(classes_all == cls + 1)[0]
            im_label_all[index[0], index[1]] = cls_ind
            '''
            import matplotlib.pyplot as plt
            fig = plt.figure()
            im = im.astype(np.uint8)
            ax = fig.add_subplot(2, 3, 1)
            plt.imshow(im[:, :, (2, 1, 0)])
            ax.set_title('color')
            ax = fig.add_subplot(2, 3, 2)
            plt.imshow(im_final[:, :, (2, 1, 0)])
            ax.set_title('final')
            ax = fig.add_subplot(2, 3, 3)
            plt.imshow(mask)
            ax.set_title('mask')
            ax = fig.add_subplot(2, 3, 4)
            plt.imshow(mask_final)
            ax.set_title('mask final')
            ax = fig.add_subplot(2, 3, 5)
            plt.imshow(im_color[:, :, (2, 1, 0)])

            for j in range(gt_boxes.shape[0]):
                if gt_boxes[j, 4] == 0:
                    continue
                x1 = gt_boxes[j, 0]
                y1 = gt_boxes[j, 1]
                x2 = gt_boxes[j, 2]
                y2 = gt_boxes[j, 3]
                plt.gca().add_patch(
                    plt.Rectangle((x1, y1), x2-x1, y2-y1, fill=False, edgecolor='g', linewidth=3, clip_on=False))

            ax = fig.add_subplot(2, 3, 6)
            plt.imshow(im_label_all)
            plt.show()
            #'''

        # foreground mask
        seg = torch.from_numpy((im_label_all != 0).astype(np.float32))
        mask = seg.unsqueeze(0).repeat((3, 1, 1)).float().cuda()
        im = im_color

        # chromatic transform
        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)

        im_cuda = torch.from_numpy(im).cuda().float() / 255.0
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im_cuda = add_noise_cuda(im_cuda)
        im_cuda -= self._pixel_mean
        im_cuda = im_cuda.permute(2, 0, 1)

        # label blob
        classes = np.array(range(self.num_classes))
        label_blob = np.zeros((self.num_classes, self._height, self._width),
                              dtype=np.float32)
        label_blob[0, :, :] = 1.0
        for i in range(1, self.num_classes):
            I = np.where(im_label == classes[i])
            if len(I[0]) > 0:
                label_blob[i, I[0], I[1]] = 1.0
                label_blob[0, I[0], I[1]] = 0.0

        # construct the meta data
        K = self._intrinsic_matrix
        K[2, 2] = 1
        Kinv = np.linalg.pinv(K)
        meta_data_blob = np.zeros(18, dtype=np.float32)
        meta_data_blob[0:9] = K.flatten()
        meta_data_blob[9:18] = Kinv.flatten()

        # no vertex regression target and poses
        pose_blob = np.zeros((self.num_classes, 9), dtype=np.float32)
        vertex_targets = np.zeros((3 * self.num_classes, height, width),
                                  dtype=np.float32)
        vertex_weights = np.zeros((3 * self.num_classes, height, width),
                                  dtype=np.float32)
        im_info = np.array(
            [im.shape[0], im.shape[1], cfg.TRAIN.SCALES_BASE[0], 1],
            dtype=np.float32)

        sample = {
            'image_color': im_cuda,
            'image_depth': im_cuda,
            'label': label_blob,
            'mask': mask,
            'mask_depth': mask_depth_cuda,
            'meta_data': meta_data_blob,
            'poses': pose_blob,
            'extents': self._extents,
            'points': self._point_blob,
            'symmetry': self._symmetry,
            'gt_boxes': gt_boxes,
            'im_info': im_info
        }

        if cfg.TRAIN.VERTEX_REG:
            sample['vertex_targets'] = vertex_targets
            sample['vertex_weights'] = vertex_weights

        return sample
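The label_blob loop above builds a one-hot (num_classes, H, W) volume with channel 0 as background. When every value of im_label is a valid class index in [0, num_classes), the same tensor falls out of one vectorized indexing step (a sketch):

import numpy as np

# Hedged vectorized equivalent of the label_blob loop, assuming im_label
# only contains class indices in [0, num_classes).
label_blob = np.eye(num_classes, dtype=np.float32)[im_label].transpose(2, 0, 1)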
Example #10
    def _render_item(self):

        height = cfg.TRAIN.SYN_HEIGHT
        width = cfg.TRAIN.SYN_WIDTH
        fx = self._intrinsic_matrix[0, 0]
        fy = self._intrinsic_matrix[1, 1]
        px = self._intrinsic_matrix[0, 2]
        py = self._intrinsic_matrix[1, 2]
        zfar = 6.0
        znear = 0.01

        # sample target objects
        if cfg.TRAIN.SYN_SAMPLE_OBJECT:
            maxnum = np.minimum(self.num_classes - 1, cfg.TRAIN.SYN_MAX_OBJECT)
            num = np.random.randint(cfg.TRAIN.SYN_MIN_OBJECT, maxnum + 1)
            perm = np.random.permutation(np.arange(self.num_classes - 1))
            indexes_target = perm[:num] + 1
        else:
            num = self.num_classes - 1
            indexes_target = np.arange(num) + 1
        num_target = num
        cls_indexes = [cfg.TRAIN.CLASSES[i] - 1 for i in indexes_target]

        # sample other objects as distractors
        if cfg.TRAIN.SYN_SAMPLE_DISTRACTOR:
            num_other = min(5, self._num_classes_other)
            num_selected = np.random.randint(0, num_other + 1)
            perm = np.random.permutation(np.arange(self._num_classes_other))
            indexes = perm[:num_selected]
            for i in range(num_selected):
                cls_indexes.append(self._classes_other[indexes[i]] - 1)
        else:
            num_selected = 0

        # sample poses
        num = num_target + num_selected
        poses_all = []
        for i in range(num):
            qt = np.zeros((7, ), dtype=np.float32)
            # rotation
            cls = int(cls_indexes[i])
            if self.pose_indexes[cls] >= len(self.pose_lists[cls]):
                self.pose_indexes[cls] = 0
                self.pose_lists[cls] = np.random.permutation(
                    np.arange(len(self.eulers)))
            yaw = self.eulers[self.pose_lists[cls][
                self.pose_indexes[cls]]][0] + 15 * np.random.randn()
            pitch = self.eulers[self.pose_lists[cls][
                self.pose_indexes[cls]]][1] + 15 * np.random.randn()
            pitch = np.clip(pitch, -90, 90)
            roll = self.eulers[self.pose_lists[cls][
                self.pose_indexes[cls]]][2] + 15 * np.random.randn()
            qt[3:] = euler2quat(yaw * math.pi / 180.0, pitch * math.pi / 180.0,
                                roll * math.pi / 180.0, 'syxz')
            self.pose_indexes[cls] += 1

            # translation
            bound = cfg.TRAIN.SYN_BOUND
            if i == 0 or i >= num_target or np.random.rand(1) > 0.5:
                qt[0] = np.random.uniform(-bound, bound)
                qt[1] = np.random.uniform(-bound, bound)
                qt[2] = np.random.uniform(cfg.TRAIN.SYN_TNEAR,
                                          cfg.TRAIN.SYN_TFAR)
            else:
                # sample an object nearby
                object_id = np.random.randint(0, i, size=1)[0]
                extent = 2 * np.mean(self._extents_all[cls + 1, :])

                flag = np.random.randint(0, 2)
                if flag == 0:
                    flag = -1
                qt[0] = poses_all[object_id][
                    0] + flag * extent * np.random.uniform(1.0, 1.5)
                if np.absolute(qt[0]) > bound:
                    qt[0] = poses_all[object_id][
                        0] - flag * extent * np.random.uniform(1.0, 1.5)
                if np.absolute(qt[0]) > bound:
                    qt[0] = np.random.uniform(-bound, bound)

                flag = np.random.randint(0, 2)
                if flag == 0:
                    flag = -1
                qt[1] = poses_all[object_id][
                    1] + flag * extent * np.random.uniform(1.0, 1.5)
                if np.absolute(qt[1]) > bound:
                    qt[1] = poses_all[object_id][
                        1] - flag * extent * np.random.uniform(1.0, 1.5)
                if np.absolute(qt[1]) > bound:
                    qt[1] = np.random.uniform(-bound, bound)

                qt[2] = poses_all[object_id][2] - extent * np.random.uniform(
                    2.0, 4.0)
                if qt[2] < cfg.TRAIN.SYN_TNEAR:
                    qt[2] = poses_all[object_id][
                        2] + extent * np.random.uniform(2.0, 4.0)

            poses_all.append(qt)
        cfg.renderer.set_poses(poses_all)

        # sample lighting
        cfg.renderer.set_light_pos(np.random.uniform(-0.5, 0.5, 3))

        intensity = np.random.uniform(0.8, 2)
        light_color = intensity * np.random.uniform(0.9, 1.1, 3)
        cfg.renderer.set_light_color(light_color)

        # rendering
        cfg.renderer.set_projection_matrix(width, height, fx, fy, px, py,
                                           znear, zfar)
        image_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
        seg_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
        pc_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
        cfg.renderer.render(cls_indexes,
                            image_tensor,
                            seg_tensor,
                            pc2_tensor=pc_tensor)
        image_tensor = image_tensor.flip(0)
        seg_tensor = seg_tensor.flip(0)
        pc_tensor = pc_tensor.flip(0)

        # foreground mask
        seg = seg_tensor[:, :, 2] + 256 * seg_tensor[:, :, 1] + 256 * 256 * seg_tensor[:, :, 0]
        mask = (seg != 0).unsqueeze(0).repeat((3, 1, 1)).float()

        # RGB to BGR order
        im = image_tensor.cpu().numpy()
        im = np.clip(im, 0, 1)
        im = im[:, :, (2, 1, 0)] * 255
        im = im.astype(np.uint8)

        # XYZ coordinates in camera frame
        im_depth = pc_tensor.cpu().numpy()
        im_depth = im_depth[:, :, :3]
        im_depth_return = im_depth[:, :, 2].copy()

        im_label = seg_tensor.cpu().numpy()
        im_label = im_label[:, :, (2, 1, 0)] * 255
        im_label = np.round(im_label).astype(np.uint8)
        im_label = np.clip(im_label, 0, 255)
        im_label, im_label_all = self.process_label_image(im_label)

        centers = np.zeros((num, 2), dtype=np.float32)
        rcenters = cfg.renderer.get_centers()
        for i in range(num):
            centers[i, 0] = rcenters[i][1] * width
            centers[i, 1] = rcenters[i][0] * height
        centers = centers[:num_target, :]
        '''
        import matplotlib.pyplot as plt
        fig = plt.figure()
        ax = fig.add_subplot(3, 2, 1)
        plt.imshow(im[:, :, (2, 1, 0)])
        for i in range(num_target):
            plt.plot(centers[i, 0], centers[i, 1], 'yo')
        ax = fig.add_subplot(3, 2, 2)
        plt.imshow(im_label)
        ax = fig.add_subplot(3, 2, 3)
        plt.imshow(im_depth[:, :, 0])
        ax = fig.add_subplot(3, 2, 4)
        plt.imshow(im_depth[:, :, 1])
        ax = fig.add_subplot(3, 2, 5)
        plt.imshow(im_depth[:, :, 2])
        plt.show()
        #'''

        # chromatic transform
        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)

        im_cuda = torch.from_numpy(im).cuda().float() / 255.0
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im_cuda = add_noise_cuda(im_cuda)
        im_cuda -= self._pixel_mean
        im_cuda = im_cuda.permute(2, 0, 1)

        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':

            # depth mask
            z_im = im_depth[:, :, 2]
            mask_depth = z_im > 0.0
            mask_depth = mask_depth.astype('float')
            mask_depth_cuda = torch.from_numpy(mask_depth).cuda().float()
            mask_depth_cuda.unsqueeze_(0)

            im_cuda_depth = torch.from_numpy(im_depth).cuda().float()
            if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                    1) > 0.1:
                im_cuda_depth = add_noise_depth_cuda(im_cuda_depth)
            im_cuda_depth = im_cuda_depth.permute(2, 0, 1)
        else:
            im_cuda_depth = im_cuda.clone()
            mask_depth_cuda = torch.cuda.FloatTensor(1, height, width).fill_(0)

        # label blob
        classes = np.array(range(self.num_classes))
        label_blob = np.zeros((self.num_classes, self._height, self._width),
                              dtype=np.float32)
        label_blob[0, :, :] = 1.0
        for i in range(1, self.num_classes):
            I = np.where(im_label == classes[i])
            if len(I[0]) > 0:
                label_blob[i, I[0], I[1]] = 1.0
                label_blob[0, I[0], I[1]] = 0.0

        # poses and boxes
        pose_blob = np.zeros((self.num_classes, 9), dtype=np.float32)
        gt_boxes = np.zeros((self.num_classes, 5), dtype=np.float32)
        for i in range(num_target):
            cls = int(indexes_target[i])
            pose_blob[i, 0] = 1
            pose_blob[i, 1] = cls
            T = poses_all[i][:3]
            qt = poses_all[i][3:]

            # egocentric to allocentric
            qt_allocentric = egocentric2allocentric(qt, T)
            if qt_allocentric[0] < 0:
                qt_allocentric = -1 * qt_allocentric
            pose_blob[i, 2:6] = qt_allocentric
            pose_blob[i, 6:] = T

            # compute box
            x3d = np.ones((4, self._points_all.shape[1]), dtype=np.float32)
            x3d[0, :] = self._points_all[cls, :, 0]
            x3d[1, :] = self._points_all[cls, :, 1]
            x3d[2, :] = self._points_all[cls, :, 2]
            RT = np.zeros((3, 4), dtype=np.float32)
            RT[:3, :3] = quat2mat(qt)
            RT[:, 3] = T
            x2d = np.matmul(self._intrinsic_matrix, np.matmul(RT, x3d))
            x2d[0, :] = np.divide(x2d[0, :], x2d[2, :])
            x2d[1, :] = np.divide(x2d[1, :], x2d[2, :])

            gt_boxes[i, 0] = np.min(x2d[0, :])
            gt_boxes[i, 1] = np.min(x2d[1, :])
            gt_boxes[i, 2] = np.max(x2d[0, :])
            gt_boxes[i, 3] = np.max(x2d[1, :])
            gt_boxes[i, 4] = cls

        # construct the meta data
        """
        format of the meta_data
        intrinsic matrix: meta_data[0 ~ 8]
        inverse intrinsic matrix: meta_data[9 ~ 17]
        """
        K = self._intrinsic_matrix
        K[2, 2] = 1
        Kinv = np.linalg.pinv(K)
        meta_data_blob = np.zeros(18, dtype=np.float32)
        meta_data_blob[0:9] = K.flatten()
        meta_data_blob[9:18] = Kinv.flatten()

        # vertex regression target
        if cfg.TRAIN.VERTEX_REG:
            vertex_targets, vertex_weights = self._generate_vertex_targets(
                im_label, indexes_target, centers, poses_all, classes,
                self.num_classes)
        elif cfg.TRAIN.VERTEX_REG_DELTA and (cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD'):
            vertex_targets, vertex_weights = self._generate_vertex_deltas(
                im_label, indexes_target, centers, poses_all, classes,
                self.num_classes, im_depth)
        else:
            vertex_targets = []
            vertex_weights = []

        im_info = np.array(
            [im.shape[0], im.shape[1], cfg.TRAIN.SCALES_BASE[0], 1],
            dtype=np.float32)

        sample = {
            'image_color': im_cuda,
            'image_depth': im_cuda_depth,
            'im_depth': im_depth_return,
            'label': label_blob,
            'mask': mask,
            'mask_depth': mask_depth_cuda,
            'meta_data': meta_data_blob,
            'poses': pose_blob,
            'extents': self._extents,
            'points': self._point_blob,
            'symmetry': self._symmetry,
            'gt_boxes': gt_boxes,
            'im_info': im_info
        }

        if cfg.TRAIN.VERTEX_REG or cfg.TRAIN.VERTEX_REG_DELTA:
            sample['vertex_targets'] = vertex_targets
            sample['vertex_weights'] = vertex_weights

        return sample
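The 2D centers are read back from the renderer here, but under a pinhole model they are simply the projected translations, which makes a convenient cross-check (a sketch; K is the 3x3 intrinsic matrix):

import numpy as np

# Hedged cross-check for get_centers(): project the i-th translation.
t = poses_all[i][:3]
c = K @ t
center_2d = c[:2] / c[2]  # pixel coordinates; compare against centers[i]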
Example #11
    def __getitem__(self, idx):

        sample = self.data[idx]  # (idx: [rgb, d, seg])
        rgb_path = sample[0]
        depth_path = sample[1]
        segmentation_path = sample[2]

        # _, ax = plt.subplots(1, 3)
        # ax[0].imshow(rgb)
        # ax[1].imshow(depth)
        # ax[2].imshow(segmentation)
        # plt.show()

        foreground_labels_filename = segmentation_path
        foreground_labels = util_.imread_indexed(foreground_labels_filename)
        # mask table as background
        foreground_labels[foreground_labels == 1] = 0
        foreground_labels = self.process_label(foreground_labels)

        # BGR image
        filename = rgb_path
        im = cv2.imread(filename)

        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            # Depth image
            depth_img_filename = depth_path
            depth_img = cv2.imread(
                depth_img_filename, cv2.IMREAD_ANYDEPTH
            )  # This reads a 16-bit single-channel image. Shape: [H x W]
            xyz_img = self.process_depth(depth_img)
        else:
            xyz_img = None

        # crop
        if cfg.TRAIN.SYN_CROP:
            im, foreground_labels, xyz_img = self.pad_crop_resize(
                im, foreground_labels, xyz_img)
            foreground_labels = self.process_label(foreground_labels)

        # sample labels
        if cfg.TRAIN.EMBEDDING_SAMPLING:
            foreground_labels = self.sample_pixels(
                foreground_labels, cfg.TRAIN.EMBEDDING_SAMPLING_NUM)

        label_blob = torch.from_numpy(foreground_labels).unsqueeze(0)
        sample = {'label': label_blob}

        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = add_noise(im)

        im_tensor = torch.from_numpy(im) / 255.0
        im_tensor -= self._pixel_mean
        image_blob = im_tensor.permute(2, 0, 1)
        sample['image_color'] = image_blob

        if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
            depth_blob = torch.from_numpy(xyz_img).permute(2, 0, 1)
            sample['depth'] = depth_blob

        return sample
Example #12
def _get_image_blob(roidb, scale_ind):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    im_scales = []
    for i in range(num_images):
        # meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        K = meta_data['intrinsic_matrix'].astype(np.float32, copy=True)
        fx = K[0, 0]
        fy = K[1, 1]
        cx = K[0, 2]
        cy = K[1, 2]

        # depth raw
        im_depth_raw = pad_im(cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)
        height = im_depth_raw.shape[0]
        width = im_depth_raw.shape[1]

        # rgba
        rgba = pad_im(cv2.imread(roidb[i]['image'], cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:,:,:3])
            alpha = rgba[:,:,3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 255
        else:
            im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            im = chromatic_transform(im)

        # mask the color image according to depth
        if cfg.EXP_DIR == 'rgbd_scene':
            I = np.where(im_depth_raw == 0)
            im[I[0], I[1], :] = 0

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:,:,np.newaxis], (1,1,3))

        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals
        depth = im_depth_raw.astype(np.float32, copy=True) / float(meta_data['factor_depth'])
        nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0, cfg.GPU_ID)
        im_normal = 127.5 * nmap + 127.5
        im_normal = im_normal.astype(np.uint8)
        im_normal = im_normal[:, :, (2, 1, 0)]
        if roidb[i]['flipped']:
            im_normal = im_normal[:, ::-1, :]

        im_orig = im_normal.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_normal = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
        processed_ims_normal.append(im_normal)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)
    blob_normal = im_list_to_blob(processed_ims_normal, 3)

    return blob, blob_depth, blob_normal, im_scales
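The normal maps in these loaders are stored as uint8 via n' = 127.5 * n + 127.5, mapping unit-vector components from [-1, 1] into [0, 255]. Decoding reverses the affine map (a sketch):

import numpy as np

# Hedged sketch: recover approximately unit normals from the uint8 encoding
# used above; renormalize to absorb quantization error.
n = (im_normal.astype(np.float32) - 127.5) / 127.5
n /= np.maximum(np.linalg.norm(n, axis=2, keepdims=True), 1e-8)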