Example #1
    def _write_background_images(self):

        cache_file = os.path.join(self._cache_path, self._name + '_backgrounds.pkl')
        if os.path.exists(cache_file):
            with open(cache_file, 'rb') as fid:
                self._backgrounds = cPickle.load(fid)

            if self._name != 'lov_train':
                cache_file_lov = os.path.join(self._cache_path, 'lov_train_backgrounds.pkl')
                if os.path.exists(cache_file_lov):
                    with open(cache_file_lov, 'rb') as fid:
                        backgrounds_lov = cPickle.load(fid)
                        self._backgrounds = self._backgrounds + backgrounds_lov

            print '{} backgrounds loaded from {}, {} images'.format(self._name, cache_file, len(self._backgrounds))
            return

        print "building background images"

        outdir = os.path.join(self._cache_path, self._name + '_backgrounds')
        if not os.path.exists(outdir):
            os.mkdir(outdir)

        num = 1000
        perm = np.random.permutation(np.arange(len(self._roidb)))
        perm = perm[:num]
        print len(perm)

        backgrounds = [None]*num
        kernel = np.ones((50, 50), np.uint8)
        for i in xrange(num):
            index = perm[i]
            # rgba
            rgba = pad_im(cv2.imread(self._roidb[index]['image'], cv2.IMREAD_UNCHANGED), 16)
            if rgba.shape[2] == 4:
                im = np.copy(rgba[:,:,:3])
                alpha = rgba[:,:,3]
                I = np.where(alpha == 0)
                im[I[0], I[1], :] = 0
            else:
                im = rgba

            # generate background image
            mask = pad_im(cv2.imread(self._roidb[index]['label'], cv2.IMREAD_UNCHANGED), 16)
            index = np.where(mask > 0)
            mask[index[0], index[1]] = 1
            mask = cv2.dilate(mask, kernel)
            background = cv2.inpaint(im, mask, 3, cv2.INPAINT_TELEA)

            # write the image
            filename = os.path.join(outdir, '%04d.jpg' % i)
            cv2.imwrite(filename, background)
            backgrounds[i] = filename

        self._backgrounds = backgrounds
        print "build background images finished"

        with open(cache_file, 'wb') as fid:
            cPickle.dump(backgrounds, fid, cPickle.HIGHEST_PROTOCOL)
        print 'wrote backgrounds to {}'.format(cache_file)
Example #2
    def callback_rgbd(self, rgb, depth):

        if depth.encoding == '32FC1':
            depth_cv = self.cv_bridge.imgmsg_to_cv2(depth)
        elif depth.encoding == '16UC1':
            depth_cv = self.cv_bridge.imgmsg_to_cv2(depth).copy().astype(np.float32)
            depth_cv /= 1000.0
        else:
            rospy.logerr_throttle(
                1, 'Unsupported depth type. Expected 16UC1 or 32FC1, got {}'.format(
                    depth.encoding))
            return

        im = self.cv_bridge.imgmsg_to_cv2(rgb, 'bgr8')

        # rescale image if necessary
        if cfg.TEST.SCALES_BASE[0] != 1:
            im_scale = cfg.TEST.SCALES_BASE[0]
            im = pad_im(cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR), 16)
            depth_cv = pad_im(cv2.resize(depth_cv, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_NEAREST), 16)

        with lock:
            self.im = im.copy()
            self.depth = depth_cv.copy()
            self.rgb_frame_id = rgb.header.frame_id
            self.rgb_frame_stamp = rgb.header.stamp
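
For context, callback_rgbd expects time-synchronized RGB and depth messages. A minimal wiring sketch, assuming rospy and message_filters (ROS 1); the topic names below are placeholders that depend on the camera driver:

import rospy
import message_filters
from sensor_msgs.msg import Image

def wire_rgbd(node):
    # feed loosely time-synchronized (rgb, depth) pairs to node.callback_rgbd
    rgb_sub = message_filters.Subscriber('/camera/rgb/image_color', Image)
    depth_sub = message_filters.Subscriber('/camera/depth/image', Image)
    ts = message_filters.ApproximateTimeSynchronizer(
        [rgb_sub, depth_sub], queue_size=10, slop=0.1)
    ts.registerCallback(node.callback_rgbd)
    return ts  # keep a reference so the synchronizer is not garbage collected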
Example #3
    def _get_image_blob(self, color_file, depth_file, scale_ind):

        # rgba
        rgba = pad_im(cv2.imread(color_file, cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        if im_scale != 1.0:
            im = cv2.resize(im,
                            None,
                            None,
                            fx=im_scale,
                            fy=im_scale,
                            interpolation=cv2.INTER_LINEAR)
        height = im.shape[0]
        width = im.shape[1]

        # chromatic transform
        if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = chromatic_transform(im)
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im = add_noise(im)
        im_tensor = torch.from_numpy(im) / 255.0
        im_tensor -= self._pixel_mean
        image_blob = im_tensor.permute(2, 0, 1).float()

        # depth image
        im_depth = pad_im(cv2.imread(depth_file, cv2.IMREAD_UNCHANGED), 16)
        if im_scale != 1.0:
            im_depth = cv2.resize(im_depth,
                                  None,
                                  None,
                                  fx=im_scale,
                                  fy=im_scale,
                                  interpolation=cv2.INTER_NEAREST)
        im_depth = im_depth.astype('float') / 1000.0

        return image_blob, im_depth, im_scale, height, width
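
The returned image_blob is CHW, scaled to [0, 1], and mean-subtracted. A minimal sketch to invert that preprocessing for visualization, assuming pixel_mean is the same per-channel mean tensor used above (the helper name is ours):

import torch

def blob_to_image(image_blob, pixel_mean):
    # undo _get_image_blob's color path: CHW -> HWC, add the mean back, rescale
    im = image_blob.permute(1, 2, 0) + pixel_mean
    return (im * 255.0).clamp(0, 255).byte().cpu().numpy()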
Example #4
def read_input_data(src_path_prefix, color, depth):
    rgba = pad_im(cv2.imread(src_path_prefix + color, cv2.IMREAD_UNCHANGED),
                  16)
    if rgba.shape[2] == 4:
        im = np.copy(rgba[:, :, :3])
        alpha = rgba[:, :, 3]
        I = np.where(alpha == 0)
        im[I[0], I[1], :] = 0
    else:
        im = rgba

    depth_cv = cv2.imread(src_path_prefix + depth, cv2.IMREAD_ANYDEPTH)
    return im, depth_cv
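
pad_im is used throughout these examples but never defined in them. A minimal sketch consistent with how it is called (pad the bottom and right edges so height and width become multiples of the given factor, 16 above); the real implementation may differ:

import numpy as np

def pad_im(im, factor=16, value=0):
    # pad bottom/right so height and width become multiples of `factor`
    h, w = im.shape[:2]
    pad_h = (factor - h % factor) % factor
    pad_w = (factor - w % factor) % factor
    pad = ((0, pad_h), (0, pad_w)) + ((0, 0),) * (im.ndim - 2)
    return np.pad(im, pad, mode='constant', constant_values=value)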
Example #5
def _get_label_blob(roidb, intrinsic_matrix, num_classes, db_inds_syn,
                    im_scales, extents, is_syn):
    """ build the label blob """

    num_images = len(roidb)
    processed_depth = []
    processed_label = []
    processed_meta_data = []
    if cfg.TRAIN.VERTEX_REG_2D or cfg.TRAIN.VERTEX_REG_3D:
        processed_vertex_targets = []
        processed_vertex_weights = []
        pose_blob = np.zeros((0, 13), dtype=np.float32)
    else:
        pose_blob = []

    if not cfg.TRAIN.SEGMENTATION:
        assert len(im_scales) == 1, "Single batch only"
        assert len(roidb) == 1, "Single batch only"
        # gt boxes: (x1, y1, x2, y2, cls)
        gt_boxes = np.zeros((0, 5), dtype=np.float32)
        pose_blob = np.zeros((0, 13), dtype=np.float32)
    else:
        gt_boxes = []

    for i in xrange(num_images):
        im_scale = im_scales[i]

        if is_syn:
            filename = cfg.TRAIN.SYNROOT + '{:06d}-meta.mat'.format(
                db_inds_syn[i])
            meta_data = scipy.io.loadmat(filename)

            filename = cfg.TRAIN.SYNROOT + '{:06d}-depth.png'.format(
                db_inds_syn[i])
            im_depth = pad_im(cv2.imread(filename, cv2.IMREAD_UNCHANGED), 16)

            # read label image
            filename = cfg.TRAIN.SYNROOT + '{:06d}-label.png'.format(
                db_inds_syn[i])
            im = pad_im(cv2.imread(filename, cv2.IMREAD_UNCHANGED), 16)
        else:
            meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
            im_depth = pad_im(
                cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)

            # read label image
            im = pad_im(cv2.imread(roidb[i]['label'], cv2.IMREAD_UNCHANGED),
                        16)
        meta_data['cls_indexes'] = meta_data['cls_indexes'].flatten()
        height = im.shape[0]
        width = im.shape[1]

        # mask the label image according to depth
        if cfg.INPUT == 'DEPTH':
            I = np.where(im_depth == 0)
            if len(im.shape) == 2:
                im[I[0], I[1]] = 0
            else:
                im[I[0], I[1], :] = 0
        if roidb[i]['flipped']:
            if len(im.shape) == 2:
                im = im[:, ::-1]
            else:
                im = im[:, ::-1, :]
        im = cv2.resize(im,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_NEAREST)

        # process annotation if training for two classes
        if num_classes == 2:
            I = np.where(im == roidb[i]['cls_index'])
            im[:, :] = 0
            im[I[0], I[1]] = 1
            ind = np.where(
                meta_data['cls_indexes'] == roidb[i]['cls_index'])[0]
            meta_data['cls_indexes'] = np.ones((1, ), dtype=np.float32)
            if len(meta_data['poses'].shape) == 3:
                meta_data['poses'] = meta_data['poses'][:, :, ind]
            meta_data['center'] = meta_data['center'][ind, :]
            meta_data['box'] = meta_data['box'][ind, :]

        im_cls, im_labels = _process_label_image(im, roidb[i]['class_colors'],
                                                 roidb[i]['class_weights'])
        processed_label.append(im_cls)

        # bounding boxes
        if not cfg.TRAIN.SEGMENTATION:
            boxes = meta_data['box'].copy()
            if roidb[i]['flipped']:
                oldx1 = boxes[:, 0].copy()
                oldx2 = boxes[:, 2].copy()
                boxes[:, 0] = width - oldx2 - 1
                boxes[:, 2] = width - oldx1 - 1
            gt_box = np.concatenate(
                (boxes * im_scales[0], meta_data['cls_indexes'][:,
                                                                np.newaxis]),
                axis=1)
            gt_boxes = np.concatenate((gt_boxes, gt_box), axis=0)

            poses = meta_data['poses']
            if len(poses.shape) == 2:
                poses = np.reshape(poses, (3, 4, 1))
            if roidb[i]['flipped']:
                poses = _flip_poses(poses, meta_data['intrinsic_matrix'],
                                    width)

            num = poses.shape[2]
            qt = np.zeros((num, 13), dtype=np.float32)
            for j in xrange(num):
                R = poses[:, :3, j]
                T = poses[:, 3, j]

                qt[j, 0] = i
                qt[j, 1] = meta_data['cls_indexes'][j]
                qt[j, 2:6] = 0  # fill box later
                qt[j, 6:10] = mat2quat(R)
                qt[j, 10:] = T

            pose_blob = np.concatenate((pose_blob, qt), axis=0)

        # vertex regression targets and weights
        if cfg.TRAIN.VERTEX_REG_2D or cfg.TRAIN.VERTEX_REG_3D:
            poses = meta_data['poses']
            if len(poses.shape) == 2:
                poses = np.reshape(poses, (3, 4, 1))
            if roidb[i]['flipped']:
                poses = _flip_poses(poses, meta_data['intrinsic_matrix'],
                                    width)

            if cfg.TRAIN.VERTEX_REG_3D:
                vertmap = meta_data['vertmap']
                if roidb[i]['flipped']:
                    vertmap = vertmap[:, ::-1, :]
                vertmap = cv2.resize(vertmap,
                                     None,
                                     None,
                                     fx=im_scale,
                                     fy=im_scale,
                                     interpolation=cv2.INTER_LINEAR)
            else:
                vertmap = []

            center = meta_data['center']
            if roidb[i]['flipped']:
                center[:, 0] = width - center[:, 0]

            vertex_targets, vertex_weights = \
                _generate_vertex_targets(im, meta_data['cls_indexes'], im_scale * center, poses, num_classes, vertmap, extents)
            processed_vertex_targets.append(vertex_targets)
            processed_vertex_weights.append(vertex_weights)

            num = poses.shape[2]
            qt = np.zeros((num, 13), dtype=np.float32)
            for j in xrange(num):
                R = poses[:, :3, j]
                T = poses[:, 3, j]

                qt[j, 0] = i
                qt[j, 1] = meta_data['cls_indexes'][j]
                qt[j, 2:6] = 0  # fill box later
                qt[j, 6:10] = mat2quat(R)
                qt[j, 10:] = T

            pose_blob = np.concatenate((pose_blob, qt), axis=0)

        # depth
        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]
        depth = im_depth.astype(np.float32, copy=True) / float(
            meta_data['factor_depth'])
        depth = cv2.resize(depth,
                           None,
                           None,
                           fx=im_scale,
                           fy=im_scale,
                           interpolation=cv2.INTER_LINEAR)
        processed_depth.append(depth)

        # voxelization
        # points = voxelizer.backproject_camera(im_depth, meta_data)
        # voxelizer.voxelized = False
        # voxelizer.voxelize(points)
        # RT_world = meta_data['rotation_translation_matrix']

        # compute camera poses
        # RT_live = meta_data['rotation_translation_matrix']
        # pose_world2live = se3_mul(RT_live, se3_inverse(RT_world))
        # pose_live2world = se3_inverse(pose_world2live)

        # construct the meta data
        """
        format of the meta_data
        intrinsic matrix: meta_data[0 ~ 8]
        inverse intrinsic matrix: meta_data[9 ~ 17]
        pose_world2live: meta_data[18 ~ 29]
        pose_live2world: meta_data[30 ~ 41]
        voxel step size: meta_data[42, 43, 44]
        voxel min value: meta_data[45, 46, 47]
        """
        K = np.matrix(meta_data['intrinsic_matrix']) * im_scale
        K[2, 2] = 1
        Kinv = np.linalg.pinv(K)
        mdata = np.zeros(48, dtype=np.float32)
        mdata[0:9] = K.flatten()
        mdata[9:18] = Kinv.flatten()
        # mdata[18:30] = pose_world2live.flatten()
        # mdata[30:42] = pose_live2world.flatten()
        # mdata[42] = voxelizer.step_x
        # mdata[43] = voxelizer.step_y
        # mdata[44] = voxelizer.step_z
        # mdata[45] = voxelizer.min_x
        # mdata[46] = voxelizer.min_y
        # mdata[47] = voxelizer.min_z
        if cfg.FLIP_X:
            mdata[0] = -1 * mdata[0]
            mdata[9] = -1 * mdata[9]
            mdata[11] = -1 * mdata[11]
        processed_meta_data.append(mdata)

    # construct the blobs
    height = processed_depth[0].shape[0]
    width = processed_depth[0].shape[1]
    depth_blob = np.zeros((num_images, height, width, 1), dtype=np.float32)
    height = processed_label[0].shape[0]
    width = processed_label[0].shape[1]
    label_blob = np.zeros((num_images, height, width, num_classes),
                          dtype=np.float32)
    meta_data_blob = np.zeros((num_images, 1, 1, 48), dtype=np.float32)
    if cfg.TRAIN.VERTEX_REG_2D or cfg.TRAIN.VERTEX_REG_3D:
        vertex_target_blob = np.zeros(
            (num_images, height, width, 3 * num_classes), dtype=np.float32)
        vertex_weight_blob = np.zeros(
            (num_images, height, width, 3 * num_classes), dtype=np.float32)
    else:
        vertex_target_blob = []
        vertex_weight_blob = []

    for i in xrange(num_images):
        depth_blob[i, :, :, 0] = processed_depth[i]
        label_blob[i, :, :, :] = processed_label[i]
        meta_data_blob[i, 0, 0, :] = processed_meta_data[i]
        if cfg.TRAIN.VERTEX_REG_2D or cfg.TRAIN.VERTEX_REG_3D:
            vertex_target_blob[i, :, :, :] = processed_vertex_targets[i]
            vertex_weight_blob[i, :, :, :] = processed_vertex_weights[i]

    # filter bad boxes
    if not cfg.TRAIN.SEGMENTATION:
        gt_widths = gt_boxes[:, 2] - gt_boxes[:, 0] + 1.0
        gt_heights = gt_boxes[:, 3] - gt_boxes[:, 1] + 1.0
        ind = np.where((gt_widths > 0) & (gt_heights > 0))[0]
        gt_boxes = gt_boxes[ind, :]

    return depth_blob, label_blob, meta_data_blob, vertex_target_blob, vertex_weight_blob, pose_blob, gt_boxes
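
The 48-float meta data vector follows the layout documented in the inline docstring above. A small helper to unpack one vector back into its parts (a sketch; the function name is ours):

def unpack_meta_data(mdata):
    # invert the 48-float layout documented in the docstring above
    K = mdata[0:9].reshape(3, 3)                 # intrinsic matrix
    Kinv = mdata[9:18].reshape(3, 3)             # inverse intrinsics
    pose_world2live = mdata[18:30].reshape(3, 4)
    pose_live2world = mdata[30:42].reshape(3, 4)
    voxel_step = mdata[42:45]
    voxel_min = mdata[45:48]
    return K, Kinv, pose_world2live, pose_live2world, voxel_step, voxel_min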
Example #6
def _get_image_blob(roidb, scale_ind):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    im_scales = []
    if cfg.TRAIN.GAN:
        processed_ims_rescale = []

    for i in range(num_images):
        # meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        K = meta_data['intrinsic_matrix'].astype(np.float32, copy=True)
        fx = K[0, 0]
        fy = K[1, 1]
        cx = K[0, 2]
        cy = K[1, 2]

        # depth raw
        im_depth_raw = pad_im(
            cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)
        height = im_depth_raw.shape[0]
        width = im_depth_raw.shape[1]

        # rgba
        rgba = pad_im(cv2.imread(roidb[i]['image'], cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            label = pad_im(cv2.imread(roidb[i]['label'], cv2.IMREAD_UNCHANGED),
                           16)
            im = chromatic_transform(im, label)

        # mask the color image according to depth
        if cfg.EXP_DIR == 'rgbd_scene':
            I = np.where(im_depth_raw == 0)
            im[I[0], I[1], :] = 0

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]

        if cfg.TRAIN.GAN:
            im_orig = im.astype(np.float32, copy=True) / 127.5 - 1
            im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
            im_rescale = cv2.resize(im_orig,
                                    None,
                                    None,
                                    fx=im_scale,
                                    fy=im_scale,
                                    interpolation=cv2.INTER_LINEAR)
            processed_ims_rescale.append(im_rescale)

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        im = cv2.resize(im_orig,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(
            im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:, :, np.newaxis], (1, 1, 3))

        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig,
                              None,
                              None,
                              fx=im_scale,
                              fy=im_scale,
                              interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals
        depth = im_depth_raw.astype(np.float32, copy=True) / float(
            meta_data['factor_depth'])
        nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0, cfg.GPU_ID)
        im_normal = 127.5 * nmap + 127.5
        im_normal = im_normal.astype(np.uint8)
        im_normal = im_normal[:, :, (2, 1, 0)]
        if roidb[i]['flipped']:
            im_normal = im_normal[:, ::-1, :]

        im_orig = im_normal.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_normal = cv2.resize(im_orig,
                               None,
                               None,
                               fx=im_scale,
                               fy=im_scale,
                               interpolation=cv2.INTER_LINEAR)
        processed_ims_normal.append(im_normal)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)
    blob_normal = im_list_to_blob(processed_ims_normal, 3)
    if cfg.TRAIN.GAN:
        blob_rescale = im_list_to_blob(processed_ims_rescale, 3)
    else:
        blob_rescale = []

    return blob, blob_rescale, blob_depth, blob_normal, im_scales
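
im_list_to_blob is not defined in this example. A sketch modeled on the Fast R-CNN utility of the same name, matching the two-argument call above: stack the images into an (N, Hmax, Wmax, C) float32 blob, zero-padding smaller images:

import numpy as np

def im_list_to_blob(ims, num_channels):
    # all images share the blob's max height/width; smaller ones are zero-padded
    max_shape = np.array([im.shape[:2] for im in ims]).max(axis=0)
    blob = np.zeros((len(ims), max_shape[0], max_shape[1], num_channels),
                    dtype=np.float32)
    for i, im in enumerate(ims):
        blob[i, :im.shape[0], :im.shape[1], :] = im
    return blob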
Example #7
def _get_label_blob(roidb, voxelizer, im_scales):
    """ build the label blob """

    num_images = len(roidb)
    num_classes = voxelizer.num_classes
    processed_depth = []
    processed_label = []
    processed_meta_data = []
    if cfg.TRAIN.VERTEX_REG:
        processed_vertex_targets = []
        processed_vertex_weights = []
        pose_blob = np.zeros((0, 13), dtype=np.float32)
    else:
        pose_blob = []

    for i in range(num_images):
        im_scale = im_scales[i]

        # load meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        im_depth = pad_im(cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED),
                          16)

        # read label image
        im = pad_im(cv2.imread(roidb[i]['label'], cv2.IMREAD_UNCHANGED), 16)
        height = im.shape[0]
        width = im.shape[1]
        # mask the label image according to depth
        if cfg.INPUT == 'DEPTH':
            I = np.where(im_depth == 0)
            if len(im.shape) == 2:
                im[I[0], I[1]] = 0
            else:
                im[I[0], I[1], :] = 0
        if roidb[i]['flipped']:
            if len(im.shape) == 2:
                im = im[:, ::-1]
            else:
                im = im[:, ::-1, :]
        im = cv2.resize(im,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_NEAREST)
        if num_classes == 2:
            I = np.where(im > 0)
            im[I[0], I[1]] = 1
            for j in range(len(meta_data['cls_indexes'])):
                meta_data['cls_indexes'][j] = 1
        im_cls, im_labels = _process_label_image(im, roidb[i]['class_colors'],
                                                 roidb[i]['class_weights'])
        processed_label.append(im_cls)

        # vertex regression targets and weights
        if cfg.TRAIN.VERTEX_REG:
            poses = meta_data['poses']
            if len(poses.shape) == 2:
                poses = np.reshape(poses, (3, 4, 1))

            center_targets, center_weights = _vote_centers(
                im, meta_data['cls_indexes'], im_scale * meta_data['center'],
                poses, num_classes)
            processed_vertex_targets.append(center_targets)
            processed_vertex_weights.append(center_weights)

            num = poses.shape[2]
            qt = np.zeros((num, 13), dtype=np.float32)
            for j in range(num):
                R = poses[:, :3, j]
                T = poses[:, 3, j]

                qt[j, 0] = i
                qt[j, 1] = meta_data['cls_indexes'][j, 0]
                qt[j, 2:6] = 0  # fill box later, roidb[i]['boxes'][j, :]
                qt[j, 6:10] = mat2quat(R)
                qt[j, 10:] = T

            pose_blob = np.concatenate((pose_blob, qt), axis=0)

        # depth
        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]
        depth = im_depth.astype(np.float32, copy=True) / float(
            meta_data['factor_depth'])
        depth = cv2.resize(depth,
                           None,
                           None,
                           fx=im_scale,
                           fy=im_scale,
                           interpolation=cv2.INTER_LINEAR)
        processed_depth.append(depth)

        # voxelization
        # points = voxelizer.backproject_camera(im_depth, meta_data)
        # voxelizer.voxelized = False
        # voxelizer.voxelize(points)
        # RT_world = meta_data['rotation_translation_matrix']

        # compute camera poses
        # RT_live = meta_data['rotation_translation_matrix']
        # pose_world2live = se3_mul(RT_live, se3_inverse(RT_world))
        # pose_live2world = se3_inverse(pose_world2live)

        # construct the meta data
        """
        format of the meta_data
        intrinsic matrix: meta_data[0 ~ 8]
        inverse intrinsic matrix: meta_data[9 ~ 17]
        pose_world2live: meta_data[18 ~ 29]
        pose_live2world: meta_data[30 ~ 41]
        voxel step size: meta_data[42, 43, 44]
        voxel min value: meta_data[45, 46, 47]
        """
        K = np.matrix(meta_data['intrinsic_matrix']) * im_scale
        K[2, 2] = 1
        Kinv = np.linalg.pinv(K)
        mdata = np.zeros(48, dtype=np.float32)
        mdata[0:9] = K.flatten()
        mdata[9:18] = Kinv.flatten()
        # mdata[18:30] = pose_world2live.flatten()
        # mdata[30:42] = pose_live2world.flatten()
        # mdata[42] = voxelizer.step_x
        # mdata[43] = voxelizer.step_y
        # mdata[44] = voxelizer.step_z
        # mdata[45] = voxelizer.min_x
        # mdata[46] = voxelizer.min_y
        # mdata[47] = voxelizer.min_z
        if cfg.FLIP_X:
            mdata[0] = -1 * mdata[0]
            mdata[9] = -1 * mdata[9]
            mdata[11] = -1 * mdata[11]
        processed_meta_data.append(mdata)

    # construct the blobs
    height = processed_depth[0].shape[0]
    width = processed_depth[0].shape[1]
    depth_blob = np.zeros((num_images, height, width, 1), dtype=np.float32)
    label_blob = np.zeros((num_images, height, width, num_classes),
                          dtype=np.float32)
    meta_data_blob = np.zeros((num_images, 1, 1, 48), dtype=np.float32)
    if cfg.TRAIN.VERTEX_REG:
        vertex_target_blob = np.zeros(
            (num_images, height, width, 3 * num_classes), dtype=np.float32)
        vertex_weight_blob = np.zeros(
            (num_images, height, width, 3 * num_classes), dtype=np.float32)
    else:
        vertex_target_blob = []
        vertex_weight_blob = []

    if cfg.TRAIN.GAN:
        gan_z_blob = np.random.uniform(-1, 1,
                                       [num_images, 100]).astype(np.float32)
    else:
        gan_z_blob = []

    for i in range(num_images):
        depth_blob[i, :, :, 0] = processed_depth[i]
        label_blob[i, :, :, :] = processed_label[i]
        meta_data_blob[i, 0, 0, :] = processed_meta_data[i]
        if cfg.TRAIN.VERTEX_REG:
            vertex_target_blob[i, :, :, :] = processed_vertex_targets[i]
            vertex_weight_blob[i, :, :, :] = processed_vertex_weights[i]

    return depth_blob, label_blob, meta_data_blob, vertex_target_blob, vertex_weight_blob, pose_blob, gan_z_blob
Example #8
def run_network(sess, net, imdb, images, meta_data):
    """
    :param sess: TensorFlow session
    :param net: Pretrained neural network to run inference with.
    :param imdb: TODO: Find out essential features of this object.
    :param images: [(rgb_image[0], depth_image[0]), ...]
    :param meta_data: Dictionary including camera intrinsics under 'intrinsic_matrix',
                      and scale factor under 'factor_depth' (default is 10,000).
    """

    n_images = len(images)
    segmentations = [[] for _ in range(n_images)]

    # timers
    _t = {'im_segment': Timer(), 'misc': Timer()}

    # voxelizer
    voxelizer = Voxelizer(cfg.TEST.GRID_SIZE, imdb.num_classes)
    voxelizer.setup(-3, -3, -3, 3, 3, 4)

    # construct colors
    colors = np.zeros((3 * imdb.num_classes), dtype=np.uint8)
    for i in range(imdb.num_classes):
        colors[i * 3 + 0] = imdb._class_colors[i][0]
        colors[i * 3 + 1] = imdb._class_colors[i][1]
        colors[i * 3 + 2] = imdb._class_colors[i][2]

    perm = list(range(n_images))

    if (cfg.TEST.VERTEX_REG_2D
            and cfg.TEST.POSE_REFINE) or (cfg.TEST.VERTEX_REG_3D
                                          and cfg.TEST.POSE_REG):
        import libsynthesizer
        synthesizer = libsynthesizer.Synthesizer(cfg.CAD, cfg.POSE)
        synthesizer.setup(cfg.TRAIN.SYN_WIDTH, cfg.TRAIN.SYN_HEIGHT)

    batched_detections = []

    for i in perm:

        raw_rgb, raw_depth = images[i]

        # read color image
        rgba = pad_im(raw_rgb, 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        im_depth = pad_im(raw_depth, 16)

        _t['im_segment'].tic()

        labels, probs, vertex_pred, rois, poses = im_segment_single_frame(
            sess, net, im, im_depth, meta_data, voxelizer, imdb._extents,
            imdb._points_all, imdb._symmetry, imdb.num_classes)

        detections = []

        for j in range(rois.shape[0]):
            cls_idx = int(rois[j, 1])
            if cls_idx > 0:
                # projection
                # RT = np.zeros((3, 4), dtype=np.float32)
                # RT[:3, :3] = quat2mat(poses[j, :4])
                # RT[:, 3] = poses[j, 4:7]

                # transform to world pose
                pose_t = np.zeros((6, ), dtype=np.float32)
                pose_t[:3] = poses[j, 4:7]
                # pose_t[[0,2]] = pose_t[[2,0]]

                # flip z-axis to match renderer
                pose_t[2] = -pose_t[2]
                poses[j, [1, 2]] = -poses[j, [1, 2]]

                pose_t[3:] = quat2euler(poses[j, :4], axes='sxyz')
                cls = imdb._classes[cls_idx]
                detections.append((cls, pose_t))

        batched_detections.append(detections)

        labels = unpad_im(labels, 16)
        im_scale = cfg.TEST.SCALES_BASE[0]
        # build the label image
        im_label = imdb.labels_to_image(im, labels)

        poses_new = []
        poses_icp = []
        if cfg.TEST.VERTEX_REG_2D:
            if cfg.TEST.POSE_REG:
                # pose refinement
                fx = meta_data['intrinsic_matrix'][0, 0] * im_scale
                fy = meta_data['intrinsic_matrix'][1, 1] * im_scale
                px = meta_data['intrinsic_matrix'][0, 2] * im_scale
                py = meta_data['intrinsic_matrix'][1, 2] * im_scale
                factor = meta_data['factor_depth']
                znear = 0.25
                zfar = 6.0
                poses_new = np.zeros((poses.shape[0], 7), dtype=np.float32)
                poses_icp = np.zeros((poses.shape[0], 7), dtype=np.float32)
                error_threshold = 0.01
                if cfg.TEST.POSE_REFINE:
                    labels_icp = labels.copy()
                    rois_icp = rois
                    if imdb.num_classes == 2:
                        I = np.where(labels_icp > 0)
                        labels_icp[I[0], I[1]] = imdb._cls_index
                        rois_icp = rois.copy()
                        rois_icp[:, 1] = imdb._cls_index
                    im_depth = cv2.resize(im_depth,
                                          None,
                                          None,
                                          fx=im_scale,
                                          fy=im_scale,
                                          interpolation=cv2.INTER_LINEAR)

                    parameters = np.zeros((7, ), dtype=np.float32)
                    parameters[0] = fx
                    parameters[1] = fy
                    parameters[2] = px
                    parameters[3] = py
                    parameters[4] = znear
                    parameters[5] = zfar
                    parameters[6] = factor

                    height = labels_icp.shape[0]
                    width = labels_icp.shape[1]
                    num_roi = rois_icp.shape[0]
                    channel_roi = rois_icp.shape[1]
                    synthesizer.icp_python(labels_icp, im_depth, parameters, height, width, num_roi, channel_roi, \
                                           rois_icp, poses, poses_new, poses_icp, error_threshold)

        _t['im_segment'].toc()

        _t['misc'].tic()
        labels_new = cv2.resize(labels,
                                None,
                                None,
                                fx=1.0 / im_scale,
                                fy=1.0 / im_scale,
                                interpolation=cv2.INTER_NEAREST)
        seg = {
            'labels': labels_new,
            'rois': rois,
            'poses': poses,
            'poses_refined': poses_new,
            'poses_icp': poses_icp
        }

        segmentations[i] = seg
        _t['misc'].toc()

        print('im_segment: {:d}/{:d} {:.3f}s {:.3f}s'
              .format(i, n_images, _t['im_segment'].diff, _t['misc'].diff))

        if cfg.TEST.VISUALIZE:
            img_dir = os.path.join("output", "vis")
            os.makedirs(img_dir, exist_ok=True)
            vertmap = _extract_vertmap(labels, vertex_pred, imdb._extents,
                                       imdb.num_classes)
            vis_segmentations_vertmaps_detection(
                im,
                im_depth,
                im_label,
                imdb._class_colors,
                vertmap,
                labels,
                rois,
                poses,
                poses_icp,
                meta_data['intrinsic_matrix'],
                imdb.num_classes,
                imdb._classes,
                imdb._points_all,
                f_name=os.path.join(img_dir, "%i.png" % i))

    return batched_detections
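
Each entry of batched_detections is a list of (class_name, pose_t) tuples, where pose_t packs the translation (z flipped above to match the renderer) followed by 'sxyz' Euler angles. A short usage sketch:

# iterate over the detections returned by run_network
for frame_idx, detections in enumerate(batched_detections):
    for cls, pose_t in detections:
        x, y, z = pose_t[:3]     # translation in the camera frame
        rx, ry, rz = pose_t[3:]  # Euler angles ('sxyz') produced by quat2euler
        print('frame {}: {} at ({:.3f}, {:.3f}, {:.3f})'.format(
            frame_idx, cls, x, y, z))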
Example #9
def read_label_data(src_path_prefix, meta_data, num_classes, im_scales,
                    extents, blob_height, blob_width, depth, cls, instance,
                    objects):
    """ build the label blob """
    num_images = 1
    processed_depth = []
    processed_label = []
    processed_meta_data = []
    vertex_target_blob = np.zeros(
        (num_images, blob_height, blob_width, 3 * num_classes),
        dtype=np.float32)
    vertex_weight_blob = np.zeros(
        (num_images, blob_height, blob_width, 3 * num_classes),
        dtype=np.float32)
    pose_blob = np.zeros((0, 13), dtype=np.float32)

    gt_boxes = []

    for i in xrange(num_images):
        im_scale = im_scales[i]

        meta_data['cls_indexes'] = meta_data['cls_indexes'].flatten()
        if os.path.exists(src_path_prefix + depth):
            im_depth = pad_im(
                cv2.imread(src_path_prefix + depth, cv2.IMREAD_UNCHANGED), 16)
        else:
            im_depth = np.zeros((blob_height, blob_width), dtype=np.float32)

        # read label image
        im = pad_im(cv2.imread(src_path_prefix + cls, cv2.IMREAD_UNCHANGED),
                    16)

        im = cv2.resize(im,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_NEAREST)

        # process annotation if training for two classes
        I = np.where(im == 1)
        im[:, :] = 0
        im[I[0], I[1]] = 1
        ind = np.where(meta_data['cls_indexes'] == 1)[0]
        cls_indexes_old = ind
        meta_data['cls_indexes'] = np.ones((len(ind), ), dtype=np.float32)
        if len(meta_data['poses'].shape) == 2:
            meta_data['poses'] = np.reshape(meta_data['poses'], (3, 4, 1))
        meta_data['poses'] = meta_data['poses'][:, :, ind]
        meta_data['center'] = meta_data['center'][ind, :]

        im_labels = im.copy()
        processed_label.append(im_labels.astype(np.int32))

        # vertex regression targets and weights
        poses = meta_data['poses']
        if len(poses.shape) == 2:
            poses = np.reshape(poses, (3, 4, 1))

        vertmap = []

        center = meta_data['center']

        # check if there are multiple instances of the same class
        cls_indexes = meta_data['cls_indexes']
        if len(np.unique(cls_indexes)) < len(cls_indexes):
            is_multi_instances = 1
            # read mask image
            mask_img = cv2.imread(src_path_prefix + instance,
                                  cv2.IMREAD_UNCHANGED)
            if objects:
                mask_img = linear_instance_segmentation_mask_image(
                    objects, mask_img)
            try:
                # The mask image needs to be cropped for simulation/dope data, because their masks are not black/white but are color masks.
                mask_img = mask_img[:, :, 0]
            except IndexError:
                pass
            mask = pad_im(mask_img, 16)
        else:
            is_multi_instances = 0
            mask = []

        vertex_target_blob[i, :, :, :], vertex_weight_blob[
            i, :, :, :] = _generate_vertex_targets(
                im, meta_data['cls_indexes'], im_scale * center, poses,
                num_classes, vertmap, extents, mask, is_multi_instances,
                cls_indexes_old, vertex_target_blob[i, :, :, :],
                vertex_weight_blob[i, :, :, :])

        num = poses.shape[2]
        qt = np.zeros((num, 13), dtype=np.float32)
        for j in xrange(num):
            R = poses[:, :3, j]
            T = poses[:, 3, j]

            qt[j, 0] = i
            qt[j, 1] = meta_data['cls_indexes'][j]
            qt[j, 2:6] = 0  # fill box later
            qt[j, 6:10] = mat2quat(R)
            qt[j, 10:] = T

        pose_blob = np.concatenate((pose_blob, qt), axis=0)

        # construct the meta data
        """
        format of the meta_data
        intrinsic matrix: meta_data[0 ~ 8]
        inverse intrinsic matrix: meta_data[9 ~ 17]
        pose_world2live: meta_data[18 ~ 29]
        pose_live2world: meta_data[30 ~ 41]
        voxel step size: meta_data[42, 43, 44]
        voxel min value: meta_data[45, 46, 47]
        """
        K = np.matrix(meta_data['intrinsic_matrix']) * im_scale
        K[2, 2] = 1
        Kinv = np.linalg.pinv(K)
        mdata = np.zeros(48, dtype=np.float32)
        mdata[0:9] = K.flatten()
        mdata[9:18] = Kinv.flatten()
        processed_meta_data.append(mdata)

        # depth
        depth = im_depth.astype(np.float32, copy=True) / float(
            meta_data['factor_depth'])
        depth = cv2.resize(depth,
                           None,
                           None,
                           fx=im_scale,
                           fy=im_scale,
                           interpolation=cv2.INTER_LINEAR)
        processed_depth.append(depth)

    # construct the blobs
    depth_blob = np.zeros((num_images, blob_height, blob_width, 1),
                          dtype=np.float32)
    meta_data_blob = np.zeros((num_images, 1, 1, 48), dtype=np.float32)

    for i in xrange(num_images):
        depth_blob[i, :, :, 0] = processed_depth[i]
        meta_data_blob[i, 0, 0, :] = processed_meta_data[i]

    label_blob = np.zeros((num_images, blob_height, blob_width),
                          dtype=np.int32)

    for i in xrange(num_images):
        label_blob[i, :, :] = processed_label[i]

    return depth_blob, label_blob, meta_data_blob, vertex_target_blob, vertex_weight_blob, pose_blob, gt_boxes, mask
Example #10
def _get_label_blob(roidb, voxelizer):
    """ build the label blob """

    num_images = len(roidb)
    num_classes = voxelizer.num_classes
    processed_depth = []
    processed_label = []
    processed_meta_data = []
    if cfg.TRAIN.VERTEX_REG:
        processed_vertex_targets = []
        processed_vertex_weights = []
        processed_vertex_images = []

    for i in xrange(num_images):
        # load meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        im_depth = pad_im(cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)

        # read label image
        im = pad_im(cv2.imread(roidb[i]['label'], cv2.IMREAD_UNCHANGED), 16)
        height = im.shape[0]
        width = im.shape[1]
        # mask the label image according to depth
        if cfg.INPUT == 'DEPTH':
            I = np.where(im_depth == 0)
            if len(im.shape) == 2:
                im[I[0], I[1]] = 0
            else:
                im[I[0], I[1], :] = 0
        if roidb[i]['flipped']:
            if len(im.shape) == 2:
                im = im[:, ::-1]
            else:
                im = im[:, ::-1, :]
        im_cls, im_labels = _process_label_image(im, roidb[i]['class_colors'], roidb[i]['class_weights'])
        processed_label.append(im_cls)

        # vertex regression targets and weights
        if cfg.TRAIN.VERTEX_REG:
            # read vertmap image
            vertmap = pad_im(cv2.imread(roidb[i]['vertmap'], cv2.IMREAD_UNCHANGED), 16)
            if roidb[i]['flipped']:
                vertmap = vertmap[:, ::-1, :]
            vertmap = vertmap[:, :, (2, 1, 0)]
            vertmap = vertmap.astype(np.float32) / 255.0
            vertex_targets, vertex_weights = _get_vertex_regression_labels(im_labels, vertmap, roidb[i]['class_extents'], num_classes)
            processed_vertex_targets.append(vertex_targets)
            processed_vertex_weights.append(vertex_weights)
            processed_vertex_images.append(vertmap)
            # center_targets, center_weights = _vote_centers(im, meta_data['cls_indexes'], meta_data['center'], num_classes)
            # processed_vertex_targets.append(np.concatenate((center_targets, vertex_targets), axis=2))
            # processed_vertex_weights.append(np.concatenate((center_weights, vertex_weights), axis=2))

        # depth
        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]
        depth = im_depth.astype(np.float32, copy=True) / float(meta_data['factor_depth'])
        processed_depth.append(depth)

        # voxelization
        points = voxelizer.backproject_camera(im_depth, meta_data)
        voxelizer.voxelized = False
        voxelizer.voxelize(points)
        RT_world = meta_data['rotation_translation_matrix']

        # compute camera poses
        RT_live = meta_data['rotation_translation_matrix']
        pose_world2live = se3_mul(RT_live, se3_inverse(RT_world))
        pose_live2world = se3_inverse(pose_world2live)

        # construct the meta data
        """
        format of the meta_data
        intrinsic matrix: meta_data[0 ~ 8]
        inverse intrinsic matrix: meta_data[9 ~ 17]
        pose_world2live: meta_data[18 ~ 29]
        pose_live2world: meta_data[30 ~ 41]
        voxel step size: meta_data[42, 43, 44]
        voxel min value: meta_data[45, 46, 47]
        """
        K = np.matrix(meta_data['intrinsic_matrix'])
        Kinv = np.linalg.pinv(K)
        mdata = np.zeros(48, dtype=np.float32)
        mdata[0:9] = K.flatten()
        mdata[9:18] = Kinv.flatten()
        mdata[18:30] = pose_world2live.flatten()
        mdata[30:42] = pose_live2world.flatten()
        mdata[42] = voxelizer.step_x
        mdata[43] = voxelizer.step_y
        mdata[44] = voxelizer.step_z
        mdata[45] = voxelizer.min_x
        mdata[46] = voxelizer.min_y
        mdata[47] = voxelizer.min_z
        if cfg.FLIP_X:
            mdata[0] = -1 * mdata[0]
            mdata[9] = -1 * mdata[9]
            mdata[11] = -1 * mdata[11]
        processed_meta_data.append(mdata)

    # construct the blobs
    height = processed_depth[0].shape[0]
    width = processed_depth[0].shape[1]
    depth_blob = np.zeros((num_images, height, width, 1), dtype=np.float32)
    label_blob = np.zeros((num_images, height, width, num_classes), dtype=np.float32)
    meta_data_blob = np.zeros((num_images, 1, 1, 48), dtype=np.float32)
    if cfg.TRAIN.VERTEX_REG:
        vertex_target_blob = np.zeros((num_images, height, width, 2 * num_classes), dtype=np.float32)
        vertex_weight_blob = np.zeros((num_images, height, width, 2 * num_classes), dtype=np.float32)
        vertex_image_blob = np.zeros((num_images, height, width, 3), dtype=np.float32)
    else:
        vertex_target_blob = []
        vertex_weight_blob = []
        vertex_image_blob = []

    for i in xrange(num_images):
        depth_blob[i,:,:,0] = processed_depth[i]
        label_blob[i,:,:,:] = processed_label[i]
        meta_data_blob[i,0,0,:] = processed_meta_data[i]
        if cfg.TRAIN.VERTEX_REG:
            vertex_target_blob[i,:,:,:] = processed_vertex_targets[i]
            vertex_weight_blob[i,:,:,:] = processed_vertex_weights[i]
            vertex_image_blob[i,:,:,:] = processed_vertex_images[i]
    
    return depth_blob, label_blob, meta_data_blob, vertex_target_blob, vertex_weight_blob, vertex_image_blob
Example #11
def test_net(net, imdb):

    output_dir = get_output_dir(imdb, net)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    seg_file = os.path.join(output_dir, 'segmentations.pkl')
    print imdb.name
    if os.path.exists(seg_file):
        with open(seg_file, 'rb') as fid:
            segmentations = cPickle.load(fid)
        imdb.evaluate_segmentations(segmentations, output_dir)
        return

    """Test a Fast R-CNN network on an image database."""
    num_images = len(imdb.image_index)
    segmentations = [[] for _ in xrange(num_images)]

    # timers
    _t = {'im_segment' : Timer(), 'misc' : Timer()}

    if cfg.TEST.VISUALIZE:
        perm = np.random.permutation(np.arange(num_images))
    else:
        perm = xrange(num_images)

    for i in perm:
        # read color image
        rgba = pad_im(cv2.imread(imdb.image_path_at(i), cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:,:,:3])
            alpha = rgba[:,:,3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 255
        else:
            im = rgba

        # read depth image
        im_depth = cv2.imread(imdb.depth_path_at(i), cv2.IMREAD_UNCHANGED)

        _t['im_segment'].tic()
        labels = im_segment(net, im, im_depth, imdb.num_classes)
        _t['im_segment'].toc()

        # build the label image
        im_label = imdb.labels_to_image(im, labels)

        _t['misc'].tic()
        seg = {'labels': labels}
        segmentations[i] = seg
        _t['misc'].toc()

        # read label image
        labels_gt = pad_im(cv2.imread(imdb.label_path_at(i), cv2.IMREAD_UNCHANGED), 16)
        if len(labels_gt.shape) == 2:
            im_label_gt = imdb.labels_to_image(im, labels_gt)
        else:
            im_label_gt = np.copy(labels_gt[:,:,:3])
            im_label_gt[:,:,0] = labels_gt[:,:,2]
            im_label_gt[:,:,2] = labels_gt[:,:,0]

        if cfg.TEST.VISUALIZE:
            vis_segmentations(im, im_depth, im_label, im_label_gt, imdb._class_colors)
        print 'im_segment: {:d}/{:d} {:.3f}s {:.3f}s' \
              .format(i + 1, num_images, _t['im_segment'].average_time, _t['misc'].average_time)

    seg_file = os.path.join(output_dir, 'segmentations.pkl')
    with open(seg_file, 'wb') as f:
        cPickle.dump(segmentations, f, cPickle.HIGHEST_PROTOCOL)

    # evaluation
    imdb.evaluate_segmentations(segmentations, output_dir)
Example #12
def _get_label_blob(roidb, voxelizer):
    """ build the label blob """

    num_images = len(roidb)
    num_classes = voxelizer.num_classes
    processed_depth = []
    processed_label = []
    processed_meta_data = []
    if cfg.TRAIN.VERTEX_REG:
        processed_vertex_targets = []
        processed_vertex_weights = []

    for i in xrange(num_images):
        # load meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        im_depth = pad_im(cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)

        # read label image
        im = pad_im(cv2.imread(roidb[i]['label'], cv2.IMREAD_UNCHANGED), 16)
        height = im.shape[0]
        width = im.shape[1]
        # mask the label image according to depth
        if cfg.INPUT == 'DEPTH':
            I = np.where(im_depth == 0)
            if len(im.shape) == 2:
                im[I[0], I[1]] = 0
            else:
                im[I[0], I[1], :] = 0
        if roidb[i]['flipped']:
            if len(im.shape) == 2:
                im = im[:, ::-1]
            else:
                im = im[:, ::-1, :]
        im_cls = _process_label_image(im, roidb[i]['class_colors'], roidb[i]['class_weights'])
        processed_label.append(im_cls)

        # vertex regression targets and weights
        if cfg.TRAIN.VERTEX_REG:
            vertmap = meta_data['vertmap']
            if roidb[i]['flipped']:
                vertmap = vertmap[:, ::-1, :]
            vertex_targets, vertex_weights = _get_vertex_regression_labels(im, vertmap, num_classes)
            processed_vertex_targets.append(vertex_targets)
            processed_vertex_weights.append(vertex_weights)

        # depth
        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]
        depth = im_depth.astype(np.float32, copy=True) / float(meta_data['factor_depth'])
        processed_depth.append(depth)

        # voxelization
        points = voxelizer.backproject_camera(im_depth, meta_data)
        voxelizer.voxelized = False
        voxelizer.voxelize(points)
        RT_world = meta_data['rotation_translation_matrix']

        # compute camera poses
        RT_live = meta_data['rotation_translation_matrix']
        pose_world2live = se3_mul(RT_live, se3_inverse(RT_world))
        pose_live2world = se3_inverse(pose_world2live)

        # construct the meta data
        """
        format of the meta_data
        intrinsic matrix: meta_data[0 ~ 8]
        inverse intrinsic matrix: meta_data[9 ~ 17]
        pose_world2live: meta_data[18 ~ 29]
        pose_live2world: meta_data[30 ~ 41]
        voxel step size: meta_data[42, 43, 44]
        voxel min value: meta_data[45, 46, 47]
        """
        K = np.matrix(meta_data['intrinsic_matrix'])
        Kinv = np.linalg.pinv(K)
        mdata = np.zeros(48, dtype=np.float32)
        mdata[0:9] = K.flatten()
        mdata[9:18] = Kinv.flatten()
        mdata[18:30] = pose_world2live.flatten()
        mdata[30:42] = pose_live2world.flatten()
        mdata[42] = voxelizer.step_x
        mdata[43] = voxelizer.step_y
        mdata[44] = voxelizer.step_z
        mdata[45] = voxelizer.min_x
        mdata[46] = voxelizer.min_y
        mdata[47] = voxelizer.min_z
        if cfg.FLIP_X:
            mdata[0] = -1 * mdata[0]
            mdata[9] = -1 * mdata[9]
            mdata[11] = -1 * mdata[11]
        processed_meta_data.append(mdata)

    # construct the blobs
    height = processed_depth[0].shape[0]
    width = processed_depth[0].shape[1]
    depth_blob = np.zeros((num_images, height, width, 1), dtype=np.float32)
    label_blob = np.zeros((num_images, height, width, 1), dtype=np.float32)
    meta_data_blob = np.zeros((num_images, 1, 1, 48), dtype=np.float32)
    if cfg.TRAIN.VERTEX_REG:
        vertex_target_blob = np.zeros((num_images, height, width, 3 * num_classes), dtype=np.float32)
        vertex_weight_blob = np.zeros((num_images, height, width, 3 * num_classes), dtype=np.float32)
    else:
        vertex_target_blob = []
        vertex_weight_blob = []

    for i in xrange(num_images):
        depth_blob[i,:,:,0] = processed_depth[i]
        label_blob[i,:,:,:] = processed_label[i]
        meta_data_blob[i,0,0,:] = processed_meta_data[i]
        if cfg.TRAIN.VERTEX_REG:
            vertex_target_blob[i,:,:,:] = processed_vertex_targets[i]
            vertex_weight_blob[i,:,:,:] = processed_vertex_weights[i]

    channel_swap = (0, 3, 1, 2)
    depth_blob = depth_blob.transpose(channel_swap)
    label_blob = label_blob.transpose(channel_swap)
    meta_data_blob = meta_data_blob.transpose(channel_swap)
    if cfg.TRAIN.VERTEX_REG:
        vertex_target_blob = vertex_target_blob.transpose(channel_swap)
        vertex_weight_blob = vertex_weight_blob.transpose(channel_swap)
    
    return depth_blob, label_blob, meta_data_blob, vertex_target_blob, vertex_weight_blob
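
The final transpose converts each blob from NHWC to NCHW, the channel ordering Caffe-style networks expect. Equivalently, for a single blob:

import numpy as np

blob_nhwc = np.zeros((2, 480, 640, 3), dtype=np.float32)  # (N, H, W, C)
blob_nchw = blob_nhwc.transpose(0, 3, 1, 2)               # -> (N, C, H, W)
assert blob_nchw.shape == (2, 3, 480, 640)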
Example #13
def _get_image_blob(roidb, scale_ind, num_classes, backgrounds,
                    intrinsic_matrix, db_inds_syn, is_syn):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    im_scales = []
    roidb_syn = []

    for i in xrange(num_images):

        if is_syn:
            # depth raw
            filename = cfg.TRAIN.SYNROOT + '{:06d}-depth.png'.format(
                db_inds_syn[i])
            im_depth_raw = pad_im(cv2.imread(filename, cv2.IMREAD_UNCHANGED),
                                  16)

            # rgba
            filename = cfg.TRAIN.SYNROOT + '{:06d}-color.png'.format(
                db_inds_syn[i])
            rgba = pad_im(cv2.imread(filename, cv2.IMREAD_UNCHANGED), 16)

            # sample a background image
            ind = np.random.randint(len(backgrounds), size=1)[0]
            filename = backgrounds[ind]
            background = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
            try:
                background = cv2.resize(background,
                                        (rgba.shape[1], rgba.shape[0]),
                                        interpolation=cv2.INTER_LINEAR)
            except Exception:
                if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'NORMAL':
                    background = np.zeros((rgba.shape[0], rgba.shape[1]),
                                          dtype=np.uint16)
                else:
                    background = np.zeros((rgba.shape[0], rgba.shape[1], 3),
                                          dtype=np.uint8)
                print 'bad background image'

            if cfg.INPUT != 'DEPTH' and cfg.INPUT != 'NORMAL' and len(
                    background.shape) != 3:
                background = np.zeros((rgba.shape[0], rgba.shape[1], 3),
                                      dtype=np.uint8)
                print 'bad background image'

            # add background
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'NORMAL':
                im_depth_raw[I[0], I[1]] = background[I[0], I[1]] / 10
            else:
                im[I[0], I[1], :] = background[I[0], I[1], :3]
        else:
            # depth raw
            im_depth_raw = pad_im(
                cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)

            # rgba
            rgba = pad_im(cv2.imread(roidb[i]['image'], cv2.IMREAD_UNCHANGED),
                          16)
            if rgba.shape[2] == 4:
                im = np.copy(rgba[:, :, :3])
                alpha = rgba[:, :, 3]
                I = np.where(alpha == 0)
                im[I[0], I[1], :] = 0
            else:
                im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            im = chromatic_transform(im)

        if cfg.TRAIN.ADD_NOISE:
            im = add_noise(im)

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        im = cv2.resize(im_orig,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(
            im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:, :, np.newaxis], (1, 1, 3))

        if cfg.TRAIN.ADD_NOISE:
            im_depth = add_noise(im_depth)

        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig,
                              None,
                              None,
                              fx=im_scale,
                              fy=im_scale,
                              interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals
        if cfg.INPUT == 'NORMAL':
            depth = im_depth_raw.astype(np.float32, copy=True) / 1000.0
            fx = intrinsic_matrix[0, 0] * im_scale
            fy = intrinsic_matrix[1, 1] * im_scale
            cx = intrinsic_matrix[0, 2] * im_scale
            cy = intrinsic_matrix[1, 2] * im_scale
            nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0,
                                           cfg.GPU_ID)
            im_normal = 127.5 * nmap + 127.5
            im_normal = im_normal.astype(np.uint8)
            im_normal = im_normal[:, :, (2, 1, 0)]
            im_normal = cv2.bilateralFilter(im_normal, 9, 75, 75)
            if roidb[i]['flipped']:
                im_normal = im_normal[:, ::-1, :]

            im_orig = im_normal.astype(np.float32, copy=True)
            im_orig -= cfg.PIXEL_MEANS
            im_normal = cv2.resize(im_orig,
                                   None,
                                   None,
                                   fx=im_scale,
                                   fy=im_scale,
                                   interpolation=cv2.INTER_LINEAR)
            processed_ims_normal.append(im_normal)

    # Create blobs to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)
    # build the normal blob once, after the loop, instead of on every iteration
    if cfg.INPUT == 'NORMAL':
        blob_normal = im_list_to_blob(processed_ims_normal, 3)
    else:
        blob_normal = []

    return blob, blob_depth, blob_normal, im_scales
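The helper im_list_to_blob used above is not shown in this listing. Judging from the call sites (a list of float images plus a channel count), it stacks the images into one batch array; a minimal sketch under that assumption, where the zero-padding to a common size is a guess rather than the library's confirmed behavior:

import numpy as np

def im_list_to_blob(ims, num_channels):
    # Assumed behavior: pad every H x W x C image in the list to the
    # largest height/width present, then stack into one float32 blob
    # of shape (N, H_max, W_max, num_channels).
    max_shape = np.array([im.shape[:2] for im in ims]).max(axis=0)
    blob = np.zeros((len(ims), max_shape[0], max_shape[1], num_channels),
                    dtype=np.float32)
    for i, im in enumerate(ims):
        blob[i, :im.shape[0], :im.shape[1], :] = im
    return blob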
Example No. 14
def test_net_single_frame(sess, net, imdb, weights_filename, rig_filename, is_kfusion):
    """Test a FCN on an image database."""

    output_dir = get_output_dir(imdb, weights_filename)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    seg_file = os.path.join(output_dir, 'segmentations.pkl')
    print imdb.name
    if os.path.exists(seg_file):
        with open(seg_file, 'rb') as fid:
            segmentations = cPickle.load(fid)
        imdb.evaluate_segmentations(segmentations, output_dir)
        return

    """Test a FCN on an image database."""
    num_images = len(imdb.image_index)
    segmentations = [[] for _ in xrange(num_images)]

    # timers
    _t = {'im_segment' : Timer(), 'misc' : Timer()}

    # kinect fusion
    if is_kfusion:
        KF = kfusion.PyKinectFusion(rig_filename)

    # pose estimation
    if cfg.TEST.VERTEX_REG and cfg.TEST.RANSAC:
        RANSAC = ransac.PyRansac3D()

    # construct colors
    colors = np.zeros((3 * imdb.num_classes), dtype=np.uint8)
    for i in range(imdb.num_classes):
        colors[i * 3 + 0] = imdb._class_colors[i][0]
        colors[i * 3 + 1] = imdb._class_colors[i][1]
        colors[i * 3 + 2] = imdb._class_colors[i][2]

    if cfg.TEST.VISUALIZE:
        # perm = np.random.permutation(np.arange(num_images))
        perm = xrange(0, num_images, 5)
    else:
        perm = xrange(num_images)

    video_index = ''
    have_prediction = False
    for i in perm:

        # parse image name
        image_index = imdb.image_index[i]
        pos = image_index.find('/')
        if video_index == '':
            video_index = image_index[:pos]
            have_prediction = False
        else:
            if video_index != image_index[:pos]:
                have_prediction = False
                video_index = image_index[:pos]
                print 'start video {}'.format(video_index)

        # read color image
        rgba = pad_im(cv2.imread(imdb.image_path_at(i), cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:,:,:3])
            alpha = rgba[:,:,3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        # read depth image
        im_depth = pad_im(cv2.imread(imdb.depth_path_at(i), cv2.IMREAD_UNCHANGED), 16)

        # load meta data
        meta_data = scipy.io.loadmat(imdb.metadata_path_at(i))

        # read label image
        labels_gt = pad_im(cv2.imread(imdb.label_path_at(i), cv2.IMREAD_UNCHANGED), 16)
        if len(labels_gt.shape) == 2:
            im_label_gt = imdb.labels_to_image(im, labels_gt)
        else:
            im_label_gt = np.copy(labels_gt[:,:,:3])
            im_label_gt[:,:,0] = labels_gt[:,:,2]
            im_label_gt[:,:,2] = labels_gt[:,:,0]

        _t['im_segment'].tic()
        labels, probs, vertex_pred = im_segment_single_frame(sess, net, im, im_depth, meta_data, imdb.num_classes)
        if cfg.TEST.VERTEX_REG:
            vertmap = _extract_vertmap(labels, vertex_pred, imdb._extents, imdb.num_classes)
            if cfg.TEST.RANSAC:
                # pose estimation using RANSAC
                fx = meta_data['intrinsic_matrix'][0, 0]
                fy = meta_data['intrinsic_matrix'][1, 1]
                px = meta_data['intrinsic_matrix'][0, 2]
                py = meta_data['intrinsic_matrix'][1, 2]
                depth_factor = meta_data['factor_depth'][0, 0]
                poses = RANSAC.estimate_pose(im_depth, probs, vertex_pred[0,:,:,:] / cfg.TRAIN.VERTEX_W, imdb._extents, fx, fy, px, py, depth_factor)

                # print gt poses
                # cls_indexes = meta_data['cls_indexes']
                # poses_gt = meta_data['poses']
                # for j in xrange(len(cls_indexes)):
                #    print 'object {}'.format(cls_indexes[j])
                #    print poses_gt[:,:,j]
            else:
                poses = []

        _t['im_segment'].toc()

        _t['misc'].tic()
        labels = unpad_im(labels, 16)
        # build the label image
        im_label = imdb.labels_to_image(im, labels)

        if not have_prediction and is_kfusion:
            KF.set_voxel_grid(-3, -3, -3, 6, 6, 7)

        # run kinect fusion
        if is_kfusion:
            height = im.shape[0]
            width = im.shape[1]
            labels_kfusion = np.zeros((height, width), dtype=np.int32)

            im_rgb = np.copy(im)
            im_rgb[:, :, 0] = im[:, :, 2]
            im_rgb[:, :, 2] = im[:, :, 0]
            KF.feed_data(im_depth, im_rgb, im.shape[1], im.shape[0], float(meta_data['factor_depth']))
            KF.back_project()
            if have_prediction:
                pose_world2live, pose_live2world = KF.solve_pose()

            KF.feed_label(im_label, probs, colors)
            KF.fuse_depth()
            labels_kfusion = KF.extract_surface(labels_kfusion)
            im_label_kfusion = imdb.labels_to_image(im, labels_kfusion)
            KF.render()
            filename = os.path.join(output_dir, 'images', '{:04d}'.format(i))
            KF.draw(filename, 0)
        have_prediction = True

        if is_kfusion:
            seg = {'labels': labels_kfusion}
        else:
            seg = {'labels': labels}
        segmentations[i] = seg

        _t['misc'].toc()

        print 'im_segment {}: {:d}/{:d} {:.3f}s {:.3f}s' \
              .format(video_index, i + 1, num_images, _t['im_segment'].diff, _t['misc'].diff)

        if cfg.TEST.VISUALIZE:
            if cfg.TEST.VERTEX_REG:
                # centers_gt = _vote_centers(labels_gt, meta_data['cls_indexes'], meta_data['center'], imdb.num_classes)
                vertmap_gt = pad_im(cv2.imread(imdb.vertmap_path_at(i), cv2.IMREAD_UNCHANGED), 16)
                vertmap_gt = vertmap_gt[:, :, (2, 1, 0)]
                vertmap_gt = vertmap_gt.astype(np.float32) / 255.0
                vertmap_gt = _unscale_vertmap(vertmap_gt, imdb._process_label_image(labels_gt), imdb._extents, imdb.num_classes)
                print 'visualization'
                vis_segmentations_vertmaps(im, im_depth, im_label, im_label_gt, imdb._class_colors, \
                    vertmap_gt, vertmap, labels, labels_gt, poses, meta_data['intrinsic_matrix'])
            else:
                vis_segmentations(im, im_depth, im_label, im_label_gt, imdb._class_colors)

    seg_file = os.path.join(output_dir, 'segmentations.pkl')
    with open(seg_file, 'wb') as f:
        cPickle.dump(segmentations, f, cPickle.HIGHEST_PROTOCOL)

    # evaluation
    imdb.evaluate_segmentations(segmentations, output_dir)
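The test loop above detects video boundaries by comparing the prefix of the image index before the first '/'. A tiny helper capturing that convention (a sketch; the '<video>/<frame>' index format is inferred from the parsing code, not documented here):

def video_of(image_index):
    # Image indices look like '<video>/<frame>'; the prefix names the
    # video, and a change of prefix triggers the per-video reset above.
    return image_index[:image_index.find('/')]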
Example No. 15
def test_net(sess, net, imdb, weights_filename, rig_filename, is_kfusion):
    """Test a FCN on an image database."""

    output_dir = get_output_dir(imdb, weights_filename)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    seg_file = os.path.join(output_dir, 'segmentations.pkl')
    print imdb.name
    if os.path.exists(seg_file):
        with open(seg_file, 'rb') as fid:
            segmentations = cPickle.load(fid)
        imdb.evaluate_segmentations(segmentations, output_dir)
        return

    """Test a FCN on an image database."""
    num_images = len(imdb.image_index)
    segmentations = [[] for _ in xrange(num_images)]

    # timers
    _t = {'im_segment' : Timer(), 'misc' : Timer()}

    # voxelizer
    voxelizer = Voxelizer(cfg.TEST.GRID_SIZE, imdb.num_classes)
    voxelizer.setup(-3, -3, -3, 3, 3, 4)
    # voxelizer.setup(-2, -2, -2, 2, 2, 2)

    # kinect fusion
    if is_kfusion:
        KF = kfusion.PyKinectFusion(rig_filename)

    # construct colors
    colors = np.zeros((3 * imdb.num_classes), dtype=np.uint8)
    for i in range(imdb.num_classes):
        colors[i * 3 + 0] = imdb._class_colors[i][0]
        colors[i * 3 + 1] = imdb._class_colors[i][1]
        colors[i * 3 + 2] = imdb._class_colors[i][2]

    if cfg.TEST.VISUALIZE:
        perm = np.random.permutation(np.arange(num_images))
    else:
        perm = xrange(num_images)

    video_index = ''
    have_prediction = False
    for i in perm:
        rgba = pad_im(cv2.imread(imdb.image_path_at(i), cv2.IMREAD_UNCHANGED), 16)
        height = rgba.shape[0]
        width = rgba.shape[1]

        # parse image name
        image_index = imdb.image_index[i]
        pos = image_index.find('/')
        if video_index == '':
            video_index = image_index[:pos]
            have_prediction = False
            state = np.zeros((1, height, width, cfg.TRAIN.NUM_UNITS), dtype=np.float32)
            weights = np.ones((1, height, width, cfg.TRAIN.NUM_UNITS), dtype=np.float32)
            points = np.zeros((1, height, width, 3), dtype=np.float32)
        else:
            if video_index != image_index[:pos]:
                have_prediction = False
                video_index = image_index[:pos]
                state = np.zeros((1, height, width, cfg.TRAIN.NUM_UNITS), dtype=np.float32)
                weights = np.ones((1, height, width, cfg.TRAIN.NUM_UNITS), dtype=np.float32)
                points = np.zeros((1, height, width, 3), dtype=np.float32)
                print 'start video {}'.format(video_index)

        # read color image
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:,:,:3])
            alpha = rgba[:,:,3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        # read depth image
        im_depth = pad_im(cv2.imread(imdb.depth_path_at(i), cv2.IMREAD_UNCHANGED), 16)

        # load meta data
        meta_data = scipy.io.loadmat(imdb.metadata_path_at(i))

        # backprojection for the first frame
        if not have_prediction:    
            if is_kfusion:
                # KF.set_voxel_grid(-3, -3, -3, 6, 6, 7)
                KF.set_voxel_grid(voxelizer.min_x, voxelizer.min_y, voxelizer.min_z, voxelizer.max_x-voxelizer.min_x, voxelizer.max_y-voxelizer.min_y, voxelizer.max_z-voxelizer.min_z)
                # identity transformation
                RT_world = np.zeros((3,4), dtype=np.float32)
                RT_world[0, 0] = 1
                RT_world[1, 1] = 1
                RT_world[2, 2] = 1
            else:
                # store the RT for the first frame
                RT_world = meta_data['rotation_translation_matrix']

        # run kinect fusion
        if is_kfusion:
            im_rgb = np.copy(im)
            im_rgb[:, :, 0] = im[:, :, 2]
            im_rgb[:, :, 2] = im[:, :, 0]
            KF.feed_data(im_depth, im_rgb, im.shape[1], im.shape[0], float(meta_data['factor_depth']))
            KF.back_project()
            if have_prediction:
                pose_world2live, pose_live2world = KF.solve_pose()
                RT_live = pose_world2live
            else:
                RT_live = RT_world
        else:
            # compute camera poses
            RT_live = meta_data['rotation_translation_matrix']

        pose_world2live = se3_mul(RT_live, se3_inverse(RT_world))
        pose_live2world = se3_inverse(pose_world2live)

        _t['im_segment'].tic()
        labels, probs, state, weights, points = im_segment(sess, net, im, im_depth, state, weights, points, meta_data, voxelizer, pose_world2live, pose_live2world)
        _t['im_segment'].toc()
        # time.sleep(3)

        _t['misc'].tic()
        labels = unpad_im(labels, 16)

        # build the label image
        im_label = imdb.labels_to_image(im, labels)

        if is_kfusion:
            labels_kfusion = np.zeros((height, width), dtype=np.int32)
            if probs.shape[2] < 10:
                probs_new = np.zeros((probs.shape[0], probs.shape[1], 10), dtype=np.float32)
                probs_new[:,:,:imdb.num_classes] = probs
                probs = probs_new
            KF.feed_label(im_label, probs, colors)
            KF.fuse_depth()
            labels_kfusion = KF.extract_surface(labels_kfusion)
            im_label_kfusion = imdb.labels_to_image(im, labels_kfusion)
            KF.render()
            filename = os.path.join(output_dir, 'images', '{:04d}'.format(i))
            KF.draw(filename, 0)
        have_prediction = True

        # compute the delta transformation between frames
        RT_world = RT_live

        if is_kfusion:
            seg = {'labels': labels_kfusion}
        else:
            seg = {'labels': labels}
        segmentations[i] = seg

        _t['misc'].toc()

        if cfg.TEST.VISUALIZE:
            # read label image
            labels_gt = pad_im(cv2.imread(imdb.label_path_at(i), cv2.IMREAD_UNCHANGED), 16)
            if len(labels_gt.shape) == 2:
                im_label_gt = imdb.labels_to_image(im, labels_gt)
            else:
                im_label_gt = np.copy(labels_gt[:,:,:3])
                im_label_gt[:,:,0] = labels_gt[:,:,2]
                im_label_gt[:,:,2] = labels_gt[:,:,0]
            vis_segmentations(im, im_depth, im_label, im_label_gt, imdb._class_colors)

        print 'im_segment: {:d}/{:d} {:.3f}s {:.3f}s' \
              .format(i + 1, num_images, _t['im_segment'].diff, _t['misc'].diff)

    if is_kfusion:
        KF.draw(filename, 1)

    seg_file = os.path.join(output_dir, 'segmentations.pkl')
    with open(seg_file, 'wb') as f:
        cPickle.dump(segmentations, f, cPickle.HIGHEST_PROTOCOL)

    # evaluation
    imdb.evaluate_segmentations(segmentations, output_dir)
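The relative pose above is computed as pose_world2live = se3_mul(RT_live, se3_inverse(RT_world)) on 3x4 [R|t] matrices. A minimal sketch of what those two helpers compute, assuming that representation (inverse is [R^T | -R^T t], composition is [R1 R2 | R1 t2 + t1]):

import numpy as np

def se3_inverse(RT):
    # RT is a 3x4 rigid transform [R|t]; its inverse is [R^T | -R^T t].
    R, t = RT[:, :3], RT[:, 3]
    RT_inv = np.zeros((3, 4), dtype=np.float32)
    RT_inv[:, :3] = R.T
    RT_inv[:, 3] = -R.T.dot(t)
    return RT_inv

def se3_mul(RT1, RT2):
    # Composition maps a point x to R1 (R2 x + t2) + t1.
    RT = np.zeros((3, 4), dtype=np.float32)
    RT[:, :3] = RT1[:, :3].dot(RT2[:, :3])
    RT[:, 3] = RT1[:, :3].dot(RT2[:, 3]) + RT1[:, 3]
    return RT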
Example No. 16
def _get_label_blob(roidb, voxelizer):
    """ build the label blob """

    num_images = len(roidb)
    processed_depth = []
    processed_label = []
    processed_meta_data = []

    for i in xrange(num_images):
        # load meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        im_depth = pad_im(cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)

        # read label image
        im = pad_im(cv2.imread(roidb[i]['label'], cv2.IMREAD_UNCHANGED), 16)
        # mask the label image according to depth
        if cfg.INPUT == 'DEPTH':
            I = np.where(im_depth == 0)
            if len(im.shape) == 2:
                im[I[0], I[1]] = 0
            else:
                im[I[0], I[1], :] = 0
        if roidb[i]['flipped']:
            im = im[:, ::-1, :]
        im_cls = _process_label_image(im, roidb[i]['class_colors'], roidb[i]['class_weights'])
        processed_label.append(im_cls)

        # depth
        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]
        depth = im_depth.astype(np.float32, copy=True) / float(meta_data['factor_depth'])
        processed_depth.append(depth)

        # voxelization
        if i % cfg.TRAIN.NUM_STEPS == 0:
            points = voxelizer.backproject_camera(im_depth, meta_data)
            voxelizer.voxelized = False
            voxelizer.voxelize(points)
            # store the RT for the first frame
            RT_world = meta_data['rotation_translation_matrix']

        # compute camera poses
        RT_live = meta_data['rotation_translation_matrix']
        pose_world2live = se3_mul(RT_live, se3_inverse(RT_world))
        pose_live2world = se3_inverse(pose_world2live)

        # construct the meta data
        """
        format of the meta_data
        intrinsic matrix: meta_data[0 ~ 8]
        inverse intrinsic matrix: meta_data[9 ~ 17]
        pose_world2live: meta_data[18 ~ 29]
        pose_live2world: meta_data[30 ~ 41]
        voxel step size: meta_data[42, 43, 44]
        voxel min value: meta_data[45, 46, 47]
        """
        K = np.matrix(meta_data['intrinsic_matrix'])
        Kinv = np.linalg.pinv(K)
        mdata = np.zeros(48, dtype=np.float32)
        mdata[0:9] = K.flatten()
        mdata[9:18] = Kinv.flatten()
        mdata[18:30] = pose_world2live.flatten()
        mdata[30:42] = pose_live2world.flatten()
        mdata[42] = voxelizer.step_x
        mdata[43] = voxelizer.step_y
        mdata[44] = voxelizer.step_z
        mdata[45] = voxelizer.min_x
        mdata[46] = voxelizer.min_y
        mdata[47] = voxelizer.min_z
        if cfg.FLIP_X:
            mdata[0] = -1 * mdata[0]
            mdata[9] = -1 * mdata[9]
            mdata[11] = -1 * mdata[11]
        processed_meta_data.append(mdata)

        # compute the delta transformation between frames
        RT_world = RT_live

    # construct the blobs
    height = processed_depth[0].shape[0]
    width = processed_depth[0].shape[1]
    num_classes = voxelizer.num_classes
    depth_blob = np.zeros((num_images, height, width, 1), dtype=np.float32)
    label_blob = np.zeros((num_images, height, width, num_classes), dtype=np.float32)
    meta_data_blob = np.zeros((num_images, 1, 1, 48), dtype=np.float32)
    for i in xrange(num_images):
        depth_blob[i,:,:,0] = processed_depth[i]
        label_blob[i,:,:,:] = processed_label[i]
        meta_data_blob[i,0,0,:] = processed_meta_data[i]

    state_blob = np.zeros((cfg.TRAIN.IMS_PER_BATCH, height, width, cfg.TRAIN.NUM_UNITS), dtype=np.float32)
    weights_blob = np.ones((cfg.TRAIN.IMS_PER_BATCH, height, width, cfg.TRAIN.NUM_UNITS), dtype=np.float32)
    points_blob = np.zeros((cfg.TRAIN.IMS_PER_BATCH, height, width, 3), dtype=np.float32)

    return depth_blob, label_blob, meta_data_blob, state_blob, weights_blob, points_blob
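The docstring inside _get_label_blob fixes the layout of the 48-float meta data vector. A consumer can unpack it symmetrically; a sketch following that documented layout (the function name and dict keys are illustrative):

import numpy as np

def decode_meta_data(mdata):
    # Indices follow the packing documented in _get_label_blob above.
    return {
        'intrinsic_matrix': mdata[0:9].reshape((3, 3)),
        'intrinsic_matrix_inv': mdata[9:18].reshape((3, 3)),
        'pose_world2live': mdata[18:30].reshape((3, 4)),
        'pose_live2world': mdata[30:42].reshape((3, 4)),
        'voxel_step': mdata[42:45],
        'voxel_min': mdata[45:48],
    }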
Example No. 17
def test_net(sess, net, imdb, weights_filename, rig_filename, is_kfusion):
    """Test a FCN on an image database."""
    output_dir = get_output_dir(imdb, weights_filename)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print 'output dir:', output_dir

    """Test a FCN on an image database."""
    print 'Test a FCN on an image database'
    num_images = len(imdb.image_index)

    # timers
    _t = {'im_segment': Timer(), 'misc': Timer()}

    # voxelizer
    voxelizer = Voxelizer(cfg.TEST.GRID_SIZE, imdb.num_classes)
    voxelizer.setup(-3, -3, -3, 3, 3, 4)
    # voxelizer.setup(-2, -2, -2, 2, 2, 2)

    # construct colors
    colors = np.zeros((3 * imdb.num_classes), dtype=np.uint8)
    for i in range(imdb.num_classes):
        colors[i * 3 + 0] = imdb._class_colors[i][0]
        colors[i * 3 + 1] = imdb._class_colors[i][1]
        colors[i * 3 + 2] = imdb._class_colors[i][2]

    if cfg.TEST.VISUALIZE:
        perm = np.random.permutation(np.arange(num_images))
    else:
        perm = xrange(num_images)

    video_index = ''
    have_prediction = False
    i = 0
    while True:
        print i

        data_chunk = rgbd_getter.data_getter()

        # print "--- %s seconds ---" % (time.time() - start_time)

        im = data_chunk['rgb_image']
        im_depth = data_chunk['depth_image']

        rgba = im[..., [2, 1, 0]]
        rgba = rgba.astype(np.uint8)
        rgba = pad_im(rgba, 16)

        height = rgba.shape[0]
        width = rgba.shape[1]

        if i == 0:
            have_prediction = False
            state = np.zeros((1, height, width, cfg.TRAIN.NUM_UNITS),
                             dtype=np.float32)
            weights = np.ones((1, height, width, cfg.TRAIN.NUM_UNITS),
                              dtype=np.float32)
            points = np.zeros((1, height, width, 3), dtype=np.float32)

        # read color image
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        # pad the depth image from the live stream
        im_depth = pad_im(im_depth, 16)

        # load meta data
        meta_data = data_chunk['meta_data']

        # backprojection for the first frame
        if not have_prediction:
            RT_world = meta_data['rotation_translation_matrix']

        RT_live = meta_data['rotation_translation_matrix']

        pose_world2live = se3_mul(RT_live, se3_inverse(RT_world))
        pose_live2world = se3_inverse(pose_world2live)

        # print "--- %s seconds ---" % (time.time() - start_time)

        _t['im_segment'].tic()
        labels, probs, state, weights, points = im_segment(
            sess, net, im, im_depth, state, weights, points, meta_data,
            voxelizer, pose_world2live, pose_live2world)
        _t['im_segment'].toc()

        # print "--- %s seconds ---" % (time.time() - start_time)
        # time.sleep(3)

        _t['misc'].tic()
        labels = unpad_im(labels, 16)

        # build the label image
        im_label = imdb.labels_to_image(im, labels)

        im_label_post, lbl_pcd_color = post_proc_da.post_proc(
            im, data_chunk['point_cloud_array'], im_label,
            data_chunk['camera_info'], data_chunk['rgb_image'])

        # print "--- %s seconds ---" % (time.time() - start_time)
        # kernel = np.ones((3,3),np.uint8)
        #
        # im_ero = cv2.erode(im_label,kernel,iterations=1)
        #
        # label_path = '/home/weizhang/DA-RNN/data/LabScene/data/0000/' + '{:04d}_ero_3by3.png'.format(i)
        #
        # cv2.imwrite(label_path,im_ero)

        # label_path = '/home/weizhang/DA-RNN/data/LabScene/data/0000/' + '{:04d}_label.png'.format(i)
        # cv2.imwrite(label_path,im_label)
        # Press Q on keyboard to  exit
        # if cv2.waitKey(25) & 0xFF == ord('q'):
        #     break

        have_prediction = True

        # compute the delta transformation between frames
        RT_world = RT_live

        _t['misc'].toc()

        print 'im_segment: {:d}/{:d} {:.3f}s {:.3f}s' \
            .format(i + 1, num_images, _t['im_segment'].diff, _t['misc'].diff)

        if cfg.TEST.VISUALIZE:
            # read label image
            labels_gt = pad_im(
                cv2.imread(imdb.label_path_at(i), cv2.IMREAD_UNCHANGED), 16)
            if len(labels_gt.shape) == 2:
                im_label_gt = imdb.labels_to_image(im, labels_gt)
            else:
                im_label_gt = np.copy(labels_gt[:, :, :3])
                im_label_gt[:, :, 0] = labels_gt[:, :, 2]
                im_label_gt[:, :, 2] = labels_gt[:, :, 0]
            vis_segmentations(im, im_depth, im_label, im_label_post,
                              imdb._class_colors)

        i += 1
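Examples No. 15 and No. 17 both re-create the three recurrent tensors inline whenever a new sequence starts. A small helper could consolidate that duplication; a sketch with the shape convention taken from the calls above (the function name is illustrative):

import numpy as np

def init_recurrent_state(height, width, num_units):
    # Fresh DA-RNN recurrent tensors for the start of a sequence:
    # hidden state and 3D points start at zero, fusion weights at one.
    state = np.zeros((1, height, width, num_units), dtype=np.float32)
    weights = np.ones((1, height, width, num_units), dtype=np.float32)
    points = np.zeros((1, height, width, 3), dtype=np.float32)
    return state, weights, points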
Example No. 18
def _get_image_blob(roidb, scale_ind):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    im_scales = []
    for i in xrange(num_images):
        # meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        K = meta_data['intrinsic_matrix'].astype(np.float32, copy=True)
        fx = K[0, 0]
        fy = K[1, 1]
        cx = K[0, 2]
        cy = K[1, 2]

        # depth raw
        im_depth_raw = pad_im(cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)
        height = im_depth_raw.shape[0]
        width = im_depth_raw.shape[1]

        # rgba
        rgba = pad_im(cv2.imread(roidb[i]['image'], cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:,:,:3])
            alpha = rgba[:,:,3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 255
        else:
            im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            im = chromatic_transform(im)

        # mask the color image according to depth
        if cfg.EXP_DIR == 'rgbd_scene':
            I = np.where(im_depth_raw == 0)
            im[I[0], I[1], :] = 0

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:,:,np.newaxis], (1,1,3))

        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals
        depth = im_depth_raw.astype(np.float32, copy=True) / float(meta_data['factor_depth'])
        nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0, cfg.GPU_ID)
        im_normal = 127.5 * nmap + 127.5
        im_normal = im_normal.astype(np.uint8)
        im_normal = im_normal[:, :, (2, 1, 0)]
        if roidb[i]['flipped']:
            im_normal = im_normal[:, ::-1, :]

        im_orig = im_normal.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_normal = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
        processed_ims_normal.append(im_normal)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)
    blob_normal = im_list_to_blob(processed_ims_normal, 3)

    return blob, blob_depth, blob_normal, im_scales
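The normal branch above encodes unit normals in [-1, 1] as an 8-bit image via 127.5 * nmap + 127.5 and swaps channels for OpenCV's BGR order. The same step isolated as a helper (a sketch; it assumes nmap holds RGB-ordered unit normals):

import numpy as np

def encode_normals(nmap):
    # Map each component from [-1, 1] into [0, 255] and flip RGB -> BGR.
    im_normal = (127.5 * nmap + 127.5).astype(np.uint8)
    return im_normal[:, :, (2, 1, 0)]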
Example No. 19
        print('loading 3D models')
        cfg.renderer = RendererAdapter(width=cfg.TRAIN.SYN_WIDTH,
                                       height=cfg.TRAIN.SYN_HEIGHT)
        cfg.renderer.load_object(int(obj))

        # initialize tensors for testing
        test_data = init_tensors()

        result_file = f'/cvlabdata2/cvlab/datasets_protopap/linemod/test/{int(obj):06d}/scene_gt.json'
        print(f'fetching poses from {result_file}')
        with open(result_file, 'r') as f:
            results = json.load(f)

        # for each image
        for i in index_images:
            im = pad_im(cv2.imread(images_color[i], cv2.IMREAD_COLOR), 16)
            print(images_color[i])
            if len(images_depth) > 0 and osp.exists(images_depth[i]):
                depth = pad_im(
                    cv2.imread(images_depth[i], cv2.IMREAD_UNCHANGED), 16)
                depth = depth.astype('float') / 1000.0
                print(images_depth[i])
            else:
                depth = None
                print('no depth image')

            # rescale image if necessary
            if cfg.TEST.SCALES_BASE[0] != 1:
                im_scale = cfg.TEST.SCALES_BASE[0]
                im = pad_im(
                    cv2.resize(im,
                               None,
                               None,
                               fx=im_scale,
                               fy=im_scale,
                               interpolation=cv2.INTER_LINEAR), 16)
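The scene_gt.json loaded above follows the BOP dataset convention: a dict keyed by image id, each value a list of records with fields such as cam_R_m2c, cam_t_m2c (in millimeters) and obj_id. A sketch of pulling one ground-truth pose out of it, assuming that convention:

import json
import numpy as np

def load_gt_pose(result_file, im_id, obj_id):
    # BOP scene_gt.json: {'<im_id>': [{'cam_R_m2c': [9 floats],
    #                                  'cam_t_m2c': [3 floats, mm],
    #                                  'obj_id': int}, ...], ...}
    with open(result_file, 'r') as f:
        results = json.load(f)
    for record in results[str(im_id)]:
        if record['obj_id'] == obj_id:
            R = np.array(record['cam_R_m2c'], dtype=np.float32).reshape((3, 3))
            t = np.array(record['cam_t_m2c'], dtype=np.float32) / 1000.0
            return R, t
    return None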
Example No. 20
    def _get_label_blob(self, roidb, num_classes, im_scale, height, width):
        """ build the label blob """

        meta_data = scipy.io.loadmat(roidb['meta_data'])
        meta_data['cls_indexes'] = meta_data['cls_indexes'].flatten()
        classes = np.array(cfg.TRAIN.CLASSES)

        # read label image
        im_label = pad_im(cv2.imread(roidb['label'], cv2.IMREAD_UNCHANGED), 16)
        if roidb['flipped']:
            if len(im_label.shape) == 2:
                im_label = im_label[:, ::-1]
            else:
                im_label = im_label[:, ::-1, :]
        if im_scale != 1.0:
            im_label = cv2.resize(im_label, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_NEAREST)

        label_blob = np.zeros((num_classes, height, width), dtype=np.float32)
        label_blob[0, :, :] = 1.0
        for i in range(1, num_classes):
            I = np.where(im_label == classes[i])
            if len(I[0]) > 0:
                label_blob[i, I[0], I[1]] = 1.0
                label_blob[0, I[0], I[1]] = 0.0

        # foreground mask
        seg = torch.from_numpy((im_label != 0).astype(np.float32))
        mask = seg.unsqueeze(0).repeat((3, 1, 1)).float()

        # poses
        poses = meta_data['poses']
        if len(poses.shape) == 2:
            poses = np.reshape(poses, (3, 4, 1))
        if roidb['flipped']:
            poses = _flip_poses(poses, meta_data['intrinsic_matrix'], width)

        num = poses.shape[2]
        pose_blob = np.zeros((num_classes, 9), dtype=np.float32)
        gt_boxes = np.zeros((num_classes, 5), dtype=np.float32)
        count = 0
        for i in range(num):
            cls = int(meta_data['cls_indexes'][i])
            ind = np.where(classes == cls)[0]
            if len(ind) > 0:
                R = poses[:, :3, i]
                T = poses[:, 3, i]
                pose_blob[count, 0] = 1
                pose_blob[count, 1] = ind
                qt = mat2quat(R)

                # egocentric to allocentric
                qt_allocentric = egocentric2allocentric(qt, T)
                if qt_allocentric[0] < 0:
                    qt_allocentric = -1 * qt_allocentric
                pose_blob[count, 2:6] = qt_allocentric
                pose_blob[count, 6:] = T

                # compute box
                x3d = np.ones((4, self._points_all.shape[1]), dtype=np.float32)
                x3d[0, :] = self._points_all[ind,:,0]
                x3d[1, :] = self._points_all[ind,:,1]
                x3d[2, :] = self._points_all[ind,:,2]
                RT = np.zeros((3, 4), dtype=np.float32)
                RT[:3, :3] = quat2mat(qt)
                RT[:, 3] = T
                x2d = np.matmul(meta_data['intrinsic_matrix'], np.matmul(RT, x3d))
                x2d[0, :] = np.divide(x2d[0, :], x2d[2, :])
                x2d[1, :] = np.divide(x2d[1, :], x2d[2, :])
        
                gt_boxes[count, 0] = np.min(x2d[0, :]) * im_scale
                gt_boxes[count, 1] = np.min(x2d[1, :]) * im_scale
                gt_boxes[count, 2] = np.max(x2d[0, :]) * im_scale
                gt_boxes[count, 3] = np.max(x2d[1, :]) * im_scale
                gt_boxes[count, 4] = ind
                count += 1

        # construct the meta data
        """
        format of the meta_data
        intrinsic matrix: meta_data[0 ~ 8]
        inverse intrinsic matrix: meta_data[9 ~ 17]
        """
        K = np.matrix(meta_data['intrinsic_matrix']) * im_scale
        K[2, 2] = 1
        Kinv = np.linalg.pinv(K)
        meta_data_blob = np.zeros(18, dtype=np.float32)
        meta_data_blob[0:9] = K.flatten()
        meta_data_blob[9:18] = Kinv.flatten()

        # vertex regression target
        if cfg.TRAIN.VERTEX_REG:
            center = meta_data['center']
            if roidb['flipped']:
                center[:, 0] = width - center[:, 0]
            vertex_targets, vertex_weights = self._generate_vertex_targets(im_label,
                meta_data['cls_indexes'], center, poses, classes, num_classes)
        else:
            vertex_targets = []
            vertex_weights = []

        return label_blob, mask, meta_data_blob, pose_blob, gt_boxes, vertex_targets, vertex_weights
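The gt_boxes computation above projects the 3D model points with x2d = K [R|t] x3d and takes the min/max of the resulting pixel coordinates. The same step as a standalone function (a sketch; project_box is an illustrative name):

import numpy as np

def project_box(K, R, T, points):
    # points: (3, N) model points in the object frame; returns the
    # projected 2D bounding box (x1, y1, x2, y2) in pixels.
    x3d = np.vstack((points, np.ones((1, points.shape[1]), dtype=np.float32)))
    RT = np.hstack((R, T.reshape(3, 1)))
    x2d = K.dot(RT.dot(x3d))
    x2d[:2, :] = x2d[:2, :] / x2d[2, :]
    return x2d[0].min(), x2d[1].min(), x2d[0].max(), x2d[1].max()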