Example #1
def get_stereo_point_cloud(sample_name, calib_dir, disp_dir):
    """
    Computes the point cloud for an image from its disparity map

    :param sample_name: sample name
    :param calib_dir: directory with calibration files
    :param disp_dir: directory with disparity images

    :return: (3, N) point_cloud in the form [[x,...][y,...][z,...]]
    """

    # Read calibration info
    frame_calib = calib_utils.get_frame_calib(calib_dir, sample_name)
    stereo_calibration_info = calib_utils.get_stereo_calibration(frame_calib.p2,
                                                                 frame_calib.p3)

    # Read disparity
    disp = cv2.imread(disp_dir + '/{}.png'.format(sample_name),
                      cv2.IMREAD_ANYDEPTH)
    # KITTI disparity maps are stored as uint16 scaled by 256
    disp = np.float32(disp)
    disp = np.divide(disp, 256)

    # Avoid division by zero for pixels with no disparity
    disp[disp == 0] = 0.1

    # Calculate the point cloud
    point_cloud = calib_utils.depth_from_disparity(disp, stereo_calibration_info)

    return point_cloud
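
A minimal usage sketch for the function above, assuming the KITTI object layout used in the other examples on this page (the disp_2 folder name is an assumption):

calib_dir = os.path.expanduser('~/Kitti/object/training/calib')
disp_dir = os.path.expanduser('~/Kitti/object/training/disp_2')  # assumed disparity folder

# (3, N) point cloud in the camera frame
point_cloud = get_stereo_point_cloud('000050', calib_dir, disp_dir)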
Example #2
def main():

    # Paths
    kitti_dir = os.path.expanduser('~/Kitti/object/')
    data_split_dir = 'training'

    image_dir = os.path.join(kitti_dir, data_split_dir) + '/image_2'
    label_dir = os.path.join(kitti_dir, data_split_dir) + '/label_2'
    calib_dir = os.path.join(kitti_dir, data_split_dir) + '/calib'

    sample_name = '000050'

    frame_calib = calib_utils.get_frame_calib(calib_dir, sample_name)
    cam_p = frame_calib.p2

    f, axes = vis_utils.plots_from_sample_name(image_dir, sample_name, 2, 1)

    # Load labels
    obj_labels = obj_utils.read_labels(label_dir, sample_name)
    for obj in obj_labels:

        # Draw 2D and 3D boxes
        vis_utils.draw_obj_as_box_2d(axes[0], obj)
        vis_utils.draw_obj_as_box_3d(axes[1], obj, cam_p)

    plt.show(block=True)
Example #3
    def test_tf_project_pc_to_image(self):
        """Check that tf_project_pc_to_image matches numpy version"""

        dataset = DatasetBuilder.build_kitti_dataset(
            DatasetBuilder.KITTI_TRAINVAL)

        np.random.seed(12345)
        point_cloud_batch = np.random.rand(32, 3, 2304)
        frame_calib = calib_utils.get_frame_calib(dataset.calib_dir, '000050')
        cam_p = frame_calib.p2

        exp_proj_uv = [
            calib_utils.project_pc_to_image(point_cloud, cam_p)
            for point_cloud in point_cloud_batch
        ]

        tf_proj_uv = calib_utils.tf_project_pc_to_image(
            point_cloud_batch, cam_p, 32)

        with self.test_session() as sess:
            proj_uv_out = sess.run(tf_proj_uv)

        np.testing.assert_allclose(exp_proj_uv, proj_uv_out)
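
For reference, the projection being checked is the standard pinhole model: append a row of ones to the (3, N) points, multiply by the (3, 4) camera matrix, and divide by depth. A NumPy sketch of that math (not the library's project_pc_to_image, just the operation the test expects it to perform):

import numpy as np

def project_points_sketch(point_cloud, cam_p):
    # point_cloud: (3, N) in the camera frame, cam_p: (3, 4) camera matrix
    points_hom = np.vstack([point_cloud, np.ones((1, point_cloud.shape[1]))])
    uvw = cam_p @ points_hom
    # Normalize by depth to get (2, N) pixel coordinates [[u, ...], [v, ...]]
    return uvw[0:2] / uvw[2]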
Example #4
def main():

    ##############################
    # Options
    ##############################

    point_cloud_source = 'depth_2_multiscale'

    samples_to_use = None  # all samples

    dataset = DatasetBuilder.build_kitti_dataset(DatasetBuilder.KITTI_TRAINVAL)

    out_instance_dir = 'outputs/instance_2_{}'.format(point_cloud_source)

    required_classes = [
        'Car',
        'Pedestrian',
        'Cyclist',
        'Van',
        'Truck',
        'Person_sitting',
        'Tram',
        'Misc',
    ]

    ##############################
    # End of Options
    ##############################

    # Create instance folder
    os.makedirs(out_instance_dir, exist_ok=True)

    # Get frame ids to process
    if samples_to_use is None:
        samples_to_use = dataset.get_sample_names()

    # Begin instance mask generation
    for sample_idx, sample_name in enumerate(samples_to_use):

        sys.stdout.write(
            '\r{} / {} Generating {} instances for sample {}'.format(
                sample_idx, dataset.num_samples - 1, point_cloud_source,
                sample_name))

        # Get image
        image = obj_utils.get_image(sample_name, dataset.image_2_dir)
        image_shape = image.shape[0:2]

        # Get calibration
        frame_calib = calib_utils.get_frame_calib(dataset.calib_dir,
                                                  sample_name)

        # Get point cloud
        if point_cloud_source.startswith('depth'):
            point_cloud = obj_utils.get_depth_map_point_cloud(
                sample_name, frame_calib, dataset.depth_dir)

        elif point_cloud_source == 'velo':
            point_cloud = obj_utils.get_lidar_point_cloud_for_cam(
                sample_name, frame_calib, dataset.velo_dir, image_shape)
        else:
            raise ValueError('Invalid point cloud source', point_cloud_source)

        # Filter according to classes
        obj_labels = obj_utils.read_labels(dataset.kitti_label_dir,
                                           sample_name)
        obj_labels, _ = obj_utils.filter_labels_by_class(
            obj_labels, required_classes)

        # Get 2D and 3D bounding boxes from labels
        gt_boxes_2d = [
            box_3d_encoder.object_label_to_box_2d(obj_label)
            for obj_label in obj_labels
        ]
        gt_boxes_3d = [
            box_3d_encoder.object_label_to_box_3d(obj_label)
            for obj_label in obj_labels
        ]

        instance_image = np.full(image_shape, 255, dtype=np.uint8)

        # Start instance index at 0 and generate instance masks for all boxes
        inst_idx = 0
        for obj_label, box_2d, box_3d in zip(obj_labels, gt_boxes_2d,
                                             gt_boxes_3d):

            # Apply inflation and offset to box_3d
            modified_box_3d = modify_box_3d(box_3d, obj_label)

            # Get points in 3D box
            box_points, mask = obj_utils.points_in_box_3d(
                modified_box_3d, point_cloud.T)

            # Get points in 2D box
            points_in_im = calib_utils.project_pc_to_image(
                box_points.T, cam_p=frame_calib.p2)
            mask_2d = \
                (points_in_im[0] >= box_2d[1]) & \
                (points_in_im[0] <= box_2d[3]) & \
                (points_in_im[1] >= box_2d[0]) & \
                (points_in_im[1] <= box_2d[2])

            if point_cloud_source.startswith('depth'):
                mask_points_in_im = np.where(mask.reshape(image_shape))
                mask_points_in_im = [
                    mask_points_in_im[0][mask_2d],
                    mask_points_in_im[1][mask_2d]
                ]
                instance_pixels = np.asarray(
                    [mask_points_in_im[1], mask_points_in_im[0]])
            elif point_cloud_source == 'velo':
                # Use the projected point positions directly as pixel indices
                # (mirrors the depth branch; the original left this branch
                # unimplemented, which would leave instance_pixels undefined)
                instance_pixels = np.int32(np.round(points_in_im[:, mask_2d]))

            # Guarantees that indices don't exceed image dimensions
            instance_pixels[0, :] = np.clip(instance_pixels[0, :], 0,
                                            image_shape[1] - 1)
            instance_pixels[1, :] = np.clip(instance_pixels[1, :], 0,
                                            image_shape[0] - 1)

            instance_image[instance_pixels[1, :],
                           instance_pixels[0, :]] = np.uint8(inst_idx)

            inst_idx += 1

        # Write image to directory
        cv2.imwrite(out_instance_dir + '/{}.png'.format(sample_name),
                    instance_image, [cv2.IMWRITE_PNG_COMPRESSION, 1])
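
The saved PNG uses 255 for background and stores each object's pixels as its instance index (0, 1, 2, ...), so individual masks can be recovered with a simple comparison. A sketch of reading one back, assuming the output directory above:

instance_image = cv2.imread(
    'outputs/instance_2_depth_2_multiscale/000050.png', cv2.IMREAD_GRAYSCALE)
mask_inst_0 = instance_image == 0    # pixels of the first labeled object
background = instance_image == 255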
Example #5
def save_predictions_box_3d_in_kitti_format(score_threshold,
                                            dataset,
                                            predictions_base_dir,
                                            predictions_box_3d_dir,
                                            predictions_box_2d_dir,
                                            global_step,
                                            project_3d_box=False):
    """Converts and saves predictions (box_3d) into text files required for KITTI evaluation

    Args:
        score_threshold: score threshold used to filter predictions
        dataset: Dataset object
        predictions_base_dir: predictions base folder
        predictions_box_3d_dir: predictions (box_3d) folder
        predictions_box_2d_dir: predictions (box_2d) folder
        global_step: global step
        project_3d_box: bool, whether to project the 3D box into image space
            to get the 2D box
    """

    score_threshold = round(score_threshold, 3)
    data_split = dataset.data_split

    # Output folder
    kitti_predictions_3d_dir = predictions_base_dir + \
        '/kitti_predictions_3d/{}/{}/{}/data'.format(data_split, score_threshold, global_step)
    if not os.path.exists(kitti_predictions_3d_dir):
        os.makedirs(kitti_predictions_3d_dir)

    # Do conversion
    num_samples = dataset.num_samples
    num_valid_samples = 0

    print('\nGlobal step:', global_step)
    print('Converting detections from:', predictions_box_3d_dir)
    print('3D Detections being saved to:', kitti_predictions_3d_dir)

    for sample_idx in range(num_samples):

        # Print progress
        sys.stdout.write('\rConverting {} / {}'.format(sample_idx + 1,
                                                       num_samples))
        sys.stdout.flush()

        sample_name = dataset.sample_list[sample_idx].name

        prediction_file = sample_name + '.txt'
        kitti_predictions_3d_file_path = kitti_predictions_3d_dir + '/' + prediction_file
        predictions_3d_file_path = predictions_box_3d_dir + '/' + prediction_file
        predictions_2d_file_path = predictions_box_2d_dir + '/' + prediction_file

        # If no predictions, skip to next file
        if not os.path.exists(predictions_3d_file_path):
            np.savetxt(kitti_predictions_3d_file_path, [])
            continue

        all_predictions_3d = np.loadtxt(predictions_3d_file_path)
        if len(all_predictions_3d) == 0:
            np.savetxt(kitti_predictions_3d_file_path, [])
            continue

        all_predictions_3d = all_predictions_3d.reshape(-1, 9)
        all_predictions_2d = np.loadtxt(predictions_2d_file_path).reshape(
            -1, 7)


        score_filter = all_predictions_3d[:, 7] >= score_threshold
        all_predictions_3d = all_predictions_3d[score_filter]
        all_predictions_2d = all_predictions_2d[score_filter]

        # If no predictions, skip to next file
        if len(all_predictions_3d) == 0:
            np.savetxt(kitti_predictions_3d_file_path, [])
            continue

        # Project to image space
        sample_name = prediction_file.split('.')[0]

        # Load image for truncation
        image = Image.open(dataset.get_rgb_image_path(sample_name))
        # TODO: Check which camera
        cam_p = calib_utils.get_frame_calib(dataset.calib_dir, sample_name).p2

        if project_3d_box:
            boxes = []
            image_filter = []
            for i in range(len(all_predictions_3d)):
                box_3d = all_predictions_3d[i, 0:7]
                img_box = box_3d_projector.project_to_image_space(
                    box_3d, cam_p, truncate=True, image_size=image.size)

                # Skip invalid boxes (outside image space)
                if img_box is None:
                    image_filter.append(False)
                    continue

                image_filter.append(True)
                boxes.append(img_box)

            boxes_2d = np.asarray(boxes)
            all_predictions_3d = all_predictions_3d[image_filter]
            all_predictions_2d = all_predictions_2d[image_filter]

        else:
            # Get 2D boxes from 2D predictions
            boxes_2d = all_predictions_2d[:, [1, 0, 3, 2]]

        # If no predictions, skip to next file
        if len(all_predictions_3d) == 0:
            np.savetxt(kitti_predictions_3d_file_path, [])
            continue

        num_valid_samples += 1

        # To keep each value in its appropriate position, an array of zeros
        # (N, 16) is allocated but only values [4:16] are used
        kitti_predictions = np.zeros([len(all_predictions_3d), 16])

        # Get object types
        all_pred_classes = all_predictions_3d[:, 8].astype(np.int32)
        obj_types = [
            dataset.classes[class_idx] for class_idx in all_pred_classes
        ]

        # Truncation and Occlusion are always empty (see below)

        # Alpha
        kitti_predictions[:, 3] = all_predictions_2d[:, 4]

        # 2D predictions
        kitti_predictions[:, 4:8] = boxes_2d

        # 3D predictions
        # Dimensions in KITTI order (h, w, l)
        kitti_predictions[:, 8] = all_predictions_3d[:, 5]
        kitti_predictions[:, 9] = all_predictions_3d[:, 4]
        kitti_predictions[:, 10] = all_predictions_3d[:, 3]
        # (x, y, z)
        kitti_predictions[:, 11:14] = all_predictions_3d[:, 0:3]
        # (ry, score)
        kitti_predictions[:, 14:16] = all_predictions_3d[:, 6:8]

        # Round detections to 3 decimal places
        kitti_predictions = np.round(kitti_predictions, 3)

        # Empty Truncation, Occlusion
        kitti_empty_1 = -1 * np.ones(
            (len(kitti_predictions), 2), dtype=np.int32)

        # Stack 3D predictions text
        kitti_text_3d = np.column_stack(
            [obj_types, kitti_empty_1, kitti_predictions[:, 3:16]])

        # Save to text files
        np.savetxt(kitti_predictions_3d_file_path,
                   kitti_text_3d,
                   newline='\r\n',
                   fmt='%s')

    print('\nNum valid:', num_valid_samples)
    print('Num samples:', num_samples)
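
Each line written above follows the 16-value KITTI detection format: type, truncation, occlusion, alpha, 2D box (x1, y1, x2, y2), dimensions (h, w, l), location (x, y, z), rotation_y, and score. With truncation and occlusion fixed to -1 as above, an output line looks like this (values illustrative):

Car -1 -1 1.55 614.24 181.78 727.31 284.77 1.57 1.73 4.15 1.00 1.75 13.22 1.62 0.89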
Example #6
def main():
    """Interpolates the lidar point cloud to and saves a dense depth map of the scene.
    """

    ##############################
    # Options
    ##############################

    dataset = DatasetBuilder.build_kitti_dataset(DatasetBuilder.KITTI_TRAINVAL)
    data_split = dataset.data_split

    # Fill algorithm
    fill_type = 'multiscale'

    save_depth_maps = True

    out_depth_map_dir = 'outputs/{}/depth_2_{}'.format(data_split, fill_type)

    samples_to_use = None
    ##############################
    # End of Options
    ##############################
    os.makedirs(out_depth_map_dir, exist_ok=True)

    # Rolling average array of times for time estimation
    avg_time_arr_length = 5
    last_fill_times = np.repeat([1.0], avg_time_arr_length)
    last_total_times = np.repeat([1.0], avg_time_arr_length)

    if samples_to_use is None:
        samples_to_use = [sample.name for sample in dataset.sample_list]

    for sample_idx, sample_name in enumerate(samples_to_use):

        # Calculate average time with last n fill times
        avg_fill_time = np.mean(last_fill_times)
        avg_total_time = np.mean(last_total_times)

        # Print progress
        sys.stdout.write('\rProcessing {} / {}, Idx {}, Avg Fill Time: {:.5f}s, '
                         'Avg Time: {:.5f}s, Est Time: {:.3f}s'.format(
                             sample_idx, dataset.num_samples - 1, sample_name,
                             avg_fill_time, avg_total_time,
                             avg_total_time * (dataset.num_samples - sample_idx)))
        sys.stdout.flush()

        # Start timing
        start_total_time = time.time()

        # Load sample info
        image = obj_utils.get_image(sample_name, dataset.image_2_dir)
        image_shape = image.shape[0:2]
        frame_calib = calib_utils.get_frame_calib(dataset.calib_dir, sample_name)
        cam_p = frame_calib.p2

        # Load point cloud
        point_cloud = obj_utils.get_lidar_point_cloud(sample_name, frame_calib, dataset.velo_dir)

        # Fill depth map
        if fill_type == 'multiscale':
            # Project point cloud to depth map
            projected_depths = depth_map_utils.project_depths(point_cloud, cam_p, image_shape)

            start_fill_time = time.time()
            final_depth_map, _ = ip_basic.fill_in_multiscale(projected_depths)
            end_fill_time = time.time()
        else:
            raise ValueError('Invalid fill algorithm')

        # Save depth maps
        if save_depth_maps:
            out_depth_map_path = out_depth_map_dir + '/{}.png'.format(sample_name)
            depth_map_utils.save_depth_map(out_depth_map_path, final_depth_map)

        # Stop timing
        end_total_time = time.time()

        # Update fill times
        last_fill_times = np.roll(last_fill_times, -1)
        last_fill_times[-1] = end_fill_time - start_fill_time

        # Update total times
        last_total_times = np.roll(last_total_times, -1)
        last_total_times[-1] = end_total_time - start_total_time
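
The project_depths step scatters each point's depth into an image-sized array before filling. Reusing the projection math sketched after Example #3, a minimal version of the idea (the real depth_map_utils.project_depths may additionally resolve multiple points landing on the same pixel):

import numpy as np

def project_depths_sketch(point_cloud, cam_p, image_shape):
    # point_cloud: (3, N) in the camera frame, cam_p: (3, 4), image_shape: (H, W)
    points_hom = np.vstack([point_cloud, np.ones((1, point_cloud.shape[1]))])
    uvw = cam_p @ points_hom
    uv = np.int32(np.round(uvw[0:2] / uvw[2]))
    depths = uvw[2]

    # Keep points that land inside the image with positive depth
    valid = (uv[0] >= 0) & (uv[0] < image_shape[1]) & \
            (uv[1] >= 0) & (uv[1] < image_shape[0]) & (depths > 0)

    depth_map = np.zeros(image_shape, dtype=np.float32)
    depth_map[uv[1, valid], uv[0, valid]] = depths[valid]
    return depth_map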
Example #7
def score_boxes(dataset,
                sample_name,
                img_shape,
                boxes_2d,
                boxes_3d,
                valid_scores,
                max_depth=45.0):
    """Score 3D boxes based on 2D classification, depth, and fit between
    projected 3D box and the 2D detection

    Args:
        dataset: Dataset object
        sample_name: Sample name, e.g. '000050'
        img_shape: Image shape [h, w]
        boxes_2d: List of 2D boxes
        boxes_3d: List of 3D boxes
        valid_scores: List of box scores
        max_depth: Maximum depth, default 45m (95% of KITTI objects)
    """

    all_new_scores = np.zeros_like(valid_scores)
    for pred_idx, (box_2d, box_3d) in enumerate(zip(boxes_2d, boxes_3d)):

        # Project 3D box to 2D [x1, y1, x2, y2]
        cam_p = calib_utils.get_frame_calib(dataset.calib_dir, sample_name).p2

        projected_box_3d = box_3d_projector.project_to_image_space(
            box_3d,
            cam_p,
            truncate=True,
            image_size=(img_shape[1], img_shape[0]))

        # Change box_2d to iou format
        box_2d_iou_fmt = np.squeeze(
            box_3d_encoder.boxes_2d_to_iou_fmt([box_2d]))

        if projected_box_3d is None:
            # Truncated box
            new_score_box_fit = 0.1

        else:
            # Calculate corner error
            height = box_2d_iou_fmt[3] - box_2d_iou_fmt[1]
            width = box_2d_iou_fmt[2] - box_2d_iou_fmt[0]

            x1_err = np.abs((box_2d_iou_fmt[0] - projected_box_3d[0]) / width)
            x2_err = np.abs((box_2d_iou_fmt[2] - projected_box_3d[2]) / width)

            y1_err = np.abs((box_2d_iou_fmt[1] - projected_box_3d[1]) / height)
            y2_err = np.abs((box_2d_iou_fmt[3] - projected_box_3d[3]) / height)

            corner_err = x1_err + x2_err + y1_err + y2_err

            new_score_box_fit = 1.0 - corner_err

        depth = box_3d[2]
        new_score_depth = np.clip(1.0 - (depth / max_depth), 0.1, 1.0)

        new_score_depth_box_fit = (new_score_depth + new_score_box_fit) / 2.0

        mscnn_score = valid_scores[pred_idx]
        new_score = 0.95 * mscnn_score + 0.05 * new_score_depth_box_fit
        all_new_scores[pred_idx] = new_score

    return all_new_scores
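
As a worked example of the final blend: a box at 30 m depth whose projected 3D box fits its 2D detection perfectly gets new_score_depth = 1 - 30/45 ≈ 0.33 and new_score_box_fit = 1.0, so new_score_depth_box_fit ≈ 0.67. With an mscnn score of 0.90, the final score is 0.95 * 0.90 + 0.05 * 0.67 ≈ 0.89: the 2D classifier score dominates, and the geometric terms act as a small correction.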
Example #8
    def get_sample_dict(self, indices):
        """ Loads input-output data for a set of samples. Should only be
            called when a particular sample dict is required. Otherwise,
            samples should be provided by the next_batch function

        Args:
            indices: A list of sample indices from the dataset.sample_list
                to be loaded

        Return:
            samples: a list of data sample dicts
        """
        sample_dicts = []
        for sample_idx in indices:

            sample = self.sample_list[sample_idx]
            sample_name = sample.name

            # Load image (BGR -> RGB)
            bgr_image = cv2.imread(self.get_rgb_image_path(sample_name))
            rgb_image = bgr_image[..., ::-1]
            image_shape = rgb_image.shape[0:2]
            image_input = rgb_image

            # Get calibration
            frame_calib = calib_utils.get_frame_calib(self.calib_dir,
                                                      sample_name)
            cam_p = frame_calib.p2

            # Only read labels if they exist
            if self.train_val_test in ['train', 'val']:

                # Read KITTI object labels
                kitti_obj_labels = obj_utils.read_labels(
                    self.kitti_label_dir, sample_name)

                if self.use_mscnn_detections and self.train_val_test == 'val':
                    # Read mscnn obj labels and replace the KITTI obj label box coords and scores
                    mscnn_obj_labels = obj_utils.read_labels(
                        self.mscnn_label_dir, sample_name)

                    obj_labels = obj_utils.merge_kitti_and_mscnn_obj_labels(
                        kitti_obj_labels,
                        mscnn_obj_labels,
                        min_iou=self.mscnn_merge_min_iou,
                        default_score_type='distance')
                else:
                    obj_labels = kitti_obj_labels

                num_all_objs = len(obj_labels)

                # Filter labels
                obj_labels, obj_mask = obj_utils.apply_obj_filter(
                    obj_labels, self.obj_filter)
                num_objs = len(obj_labels)
                if num_objs < 1:
                    sample_dicts.append(None)
                    continue

                if self.use_mscnn_detections:
                    # Get filtered original kitti_obj_labels
                    kitti_obj_labels, kitti_obj_mask = obj_utils.apply_obj_filter(
                        kitti_obj_labels, self.obj_filter)
                    num_kitti_objs = len(kitti_obj_labels)
                    if num_kitti_objs < 1:
                        sample_dicts.append(None)
                        continue

                # Load instance masks
                instance_image = instance_utils.get_instance_image(
                    sample_name, self.instance_dir)
                instance_masks = instance_utils.get_instance_mask_list(
                    instance_image, num_all_objs)
                instance_masks = instance_masks[obj_mask]

                if self.oversample:
                    # Oversample to required number of boxes
                    num_to_oversample = self.num_boxes - num_objs

                    oversample_indices = np.random.choice(num_objs,
                                                          num_to_oversample,
                                                          replace=True)
                    oversample_indices = np.hstack(
                        [np.arange(0, num_objs), oversample_indices])
                    obj_labels = obj_labels[oversample_indices]
                    instance_masks = instance_masks[oversample_indices]

                # Augmentation if in train mode
                if self.train_val_test == 'train':

                    # Image augmentation
                    use_image_aug = self.aug_config.use_image_aug
                    if use_image_aug:
                        image_input = kitti_aug.apply_image_noise(rgb_image)

                    # Box jittering
                    box_jitter_type = self.aug_config.box_jitter_type
                    if box_jitter_type is None:
                        pass
                    elif box_jitter_type == 'oversample':
                        # Replace oversampled boxes with jittered boxes
                        if not self.oversample:
                            raise ValueError(
                                'Must oversample object labels to use {} '
                                'box jitter type'.format(box_jitter_type))
                        aug_labels = kitti_aug.jitter_obj_boxes_2d(
                            obj_labels[num_objs:], 0.7, image_shape)
                        obj_labels[num_objs:] = aug_labels
                    elif box_jitter_type == 'oversample_gt':
                        # Replace oversampled boxes with jittered gt boxes
                        if not self.oversample:
                            raise ValueError(
                                'Must oversample object labels to use {} '
                                'box jitter type'.format(box_jitter_type))

                        # Get enough gt boxes to jitter
                        gt_num_to_oversample = self.num_boxes - num_objs
                        gt_oversample_indices = np.random.choice(
                            num_kitti_objs, gt_num_to_oversample, replace=True)
                        kitti_obj_labels = kitti_obj_labels[
                            gt_oversample_indices]

                        aug_labels = kitti_aug.jitter_obj_boxes_2d(
                            kitti_obj_labels, 0.7, image_shape)
                        obj_labels[num_objs:] = aug_labels
                    elif box_jitter_type == 'all':
                        # Apply data augmentation on all labels
                        obj_labels = kitti_aug.jitter_obj_boxes_2d(
                            obj_labels, 0.7, image_shape)
                    else:
                        raise ValueError('Invalid box_jitter_type',
                                         box_jitter_type)

                # TODO: Do this some other way
                # Get 2D and 3D boxes
                label_boxes_2d = obj_utils.boxes_2d_from_obj_labels(obj_labels)
                label_boxes_3d = obj_utils.boxes_3d_from_obj_labels(obj_labels)
                label_alphas = np.asarray(
                    [obj_label.alpha for obj_label in obj_labels],
                    dtype=np.float32)

                label_alpha_bins, label_alpha_regs, label_valid_alpha_bins = \
                    zip(*[orientation_encoder.np_orientation_to_angle_bin(
                        obj_label.alpha, self.num_alpha_bins, self.alpha_bin_overlap)
                        for obj_label in obj_labels])

                # Get viewing angles
                label_viewing_angles_2d = np.asarray(
                    [obj_utils.get_viewing_angle_box_2d(box_2d, cam_p)
                     for box_2d in label_boxes_2d], dtype=np.float32)
                label_viewing_angles_3d = np.asarray(
                    [obj_utils.get_viewing_angle_box_3d(box_3d, cam_p)
                     for box_3d in label_boxes_3d], dtype=np.float32)

                # Parse class indices
                label_class_indices = [
                    obj_utils.class_str_to_index(obj_label.type, self.classes)
                    for obj_label in obj_labels
                ]
                label_class_indices = np.expand_dims(
                    np.asarray(label_class_indices, dtype=np.int32), axis=1)
                label_class_strs = [obj_label.type for obj_label in obj_labels]

                # Get proposal z centroid offset
                prop_cen_z_offset_list = np.asarray([
                    instance_utils.get_prop_cen_z_offset(class_str)
                    for class_str in label_class_strs
                ])

                # Get xyz map in cam_N frame
                depth_map = obj_utils.get_depth_map(sample_name,
                                                    self.depth_dir)

                # Get scores
                label_scores = np.asarray(
                    [obj_label.score for obj_label in obj_labels], np.float32)

                # Get lwh average
                lwh_means = np.asarray([
                    obj_utils.get_mean_lwh_and_std_dev(class_str)[0]
                    for class_str in label_class_strs
                ])

            elif self.train_val_test == 'test':
                # Read object test labels
                obj_labels = obj_utils.read_labels(self.mscnn_label_dir,
                                                   sample_name)
                num_objs = len(obj_labels)
                if num_objs < 1:
                    sample_dicts.append(None)
                    continue

                # Just filter classes
                obj_labels, obj_mask = obj_utils.apply_obj_filter(
                    obj_labels, self.obj_filter)
                num_objs = len(obj_labels)
                if num_objs < 1:
                    sample_dicts.append(None)
                    continue

                # Oversample to required number of boxes
                num_to_oversample = self.num_boxes - num_objs
                oversample_indices = np.random.choice(num_objs,
                                                      num_to_oversample,
                                                      replace=True)
                oversample_indices = np.hstack(
                    [np.arange(0, num_objs), oversample_indices])
                obj_labels = obj_labels[oversample_indices]

                # Get 2D boxes
                label_boxes_2d = obj_utils.boxes_2d_from_obj_labels(obj_labels)

                # Get score
                label_scores = np.asarray(
                    [obj_label.score for obj_label in obj_labels], np.float32)

                # Calculate viewing angles
                label_viewing_angles_2d = np.asarray(
                    [obj_utils.get_viewing_angle_box_2d(box_2d, cam_p)
                     for box_2d in label_boxes_2d], dtype=np.float32)

                label_class_indices = [
                    obj_utils.class_str_to_index(obj_label.type, self.classes)
                    for obj_label in obj_labels
                ]
                label_class_indices = np.expand_dims(
                    np.asarray(label_class_indices, dtype=np.int32), axis=1)
                label_class_strs = [obj_label.type for obj_label in obj_labels]

                # Get lwh average
                lwh_means = np.asarray([
                    obj_utils.get_mean_lwh_and_std_dev(class_str)[0]
                    for class_str in label_class_strs
                ])

                # Get proposal z centroid offset
                prop_cen_z_offset_list = np.asarray([
                    instance_utils.get_prop_cen_z_offset(class_str)
                    for class_str in label_class_strs
                ])

            else:
                raise ValueError('Invalid run mode', self.train_val_test)

            # Common inputs for all train_val_test modes
            # Normalize 2D boxes
            label_boxes_2d_norm = label_boxes_2d / np.tile(image_shape, 2)

            sample_dict = {
                constants.SAMPLE_NUM_OBJS: num_objs,
                constants.SAMPLE_IMAGE_INPUT: image_input,
                constants.SAMPLE_CAM_P: cam_p,
                constants.SAMPLE_NAME: sample_name,
                constants.SAMPLE_LABEL_BOXES_2D_NORM: label_boxes_2d_norm,
                constants.SAMPLE_LABEL_BOXES_2D: label_boxes_2d,
                constants.SAMPLE_LABEL_SCORES: label_scores,
                constants.SAMPLE_LABEL_CLASS_STRS: np.expand_dims(label_class_strs, 1),
                constants.SAMPLE_LABEL_CLASS_INDICES: label_class_indices,
                constants.SAMPLE_MEAN_LWH: lwh_means,
                constants.SAMPLE_PROP_CEN_Z_OFFSET: prop_cen_z_offset_list,
                constants.SAMPLE_VIEWING_ANGLES_2D: label_viewing_angles_2d,
            }

            if self.train_val_test in ['train', 'val']:

                sample_dict.update({
                    constants.SAMPLE_LABEL_BOXES_3D: label_boxes_3d,
                    constants.SAMPLE_ALPHAS: label_alphas,
                    constants.SAMPLE_ALPHA_BINS: np.asarray(label_alpha_bins),
                    constants.SAMPLE_ALPHA_REGS: np.asarray(label_alpha_regs),
                    constants.SAMPLE_ALPHA_VALID_BINS: np.asarray(label_valid_alpha_bins),
                    constants.SAMPLE_VIEWING_ANGLES_3D: label_viewing_angles_3d,
                    constants.SAMPLE_INSTANCE_MASKS: instance_masks,
                    constants.SAMPLE_DEPTH_MAP: depth_map,
                })

            elif self.train_val_test == 'test':
                # No additional labels for test mode
                pass

            sample_dicts.append(sample_dict)

        return sample_dicts
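
A minimal usage sketch, assuming this class is the dataset built by the DatasetBuilder calls in the earlier examples (the index choice is arbitrary):

dataset = DatasetBuilder.build_kitti_dataset(DatasetBuilder.KITTI_TRAINVAL)

# Entries are None for samples with no valid objects after filtering
sample_dicts = dataset.get_sample_dict([0, 1, 2])
valid_dicts = [s for s in sample_dicts if s is not None]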