Example 1
    def preprocess_data(self, images):
        # T x H x W x C
        # Reverse the channel order of each frame (swap between RGB and BGR).
        images = [image[:, :, ::-1] for image in images]
        images = np.concatenate([image[np.newaxis] for image in images])
        images = torch.from_numpy(images).float()
        images = images / 255.
        images -= self.data_mean
        images /= self.data_std
        images = images.permute(3, 0, 1, 2)  # -> C x T x H x W
        images, _ = transform.random_short_side_scale_jitter(
            images, self.min_scale, self.max_scale)
        images, _ = transform.uniform_crop(images, self.crop_size, 0)
        images = images.unsqueeze(0)
        # Fast pathway: sample num_frames frames evenly across the clip.
        index = torch.linspace(0, images.shape[2] - 1, self.num_frames).long()
        fast_pathway = torch.index_select(images, 2, index)
        # Slow pathway: subsample the fast pathway by a factor of alpha.
        index = torch.linspace(0, fast_pathway.shape[2] - 1,
                               fast_pathway.shape[2] // self.alpha).long()
        slow_pathway = torch.index_select(fast_pathway, 2, index)
        inputs = [slow_pathway, fast_pathway]

        for i in range(2):
            inputs[i] = inputs[i].to(self.device)
        return inputs
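The two-pathway construction at the end of this example can be tried in isolation. Below is a minimal, self-contained sketch (plain PyTorch, with assumed values for num_frames and alpha, and a random tensor standing in for a real clip) showing how torch.linspace plus torch.index_select subsamples the temporal axis into the fast and slow pathways:

import torch

# Dummy clip: batch x channels x T x H x W (values are placeholders).
clip = torch.rand(1, 3, 64, 224, 224)
num_frames = 32  # assumed fast-pathway length
alpha = 4        # assumed slow/fast frame-rate ratio

# Fast pathway: num_frames frames sampled evenly over the whole clip.
index = torch.linspace(0, clip.shape[2] - 1, num_frames).long()
fast_pathway = torch.index_select(clip, 2, index)

# Slow pathway: the fast pathway subsampled by a factor of alpha.
index = torch.linspace(0, fast_pathway.shape[2] - 1,
                       fast_pathway.shape[2] // alpha).long()
slow_pathway = torch.index_select(fast_pathway, 2, index)

print(fast_pathway.shape)  # torch.Size([1, 3, 32, 224, 224])
print(slow_pathway.shape)  # torch.Size([1, 3, 8, 224, 224])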
Example 2
def spatial_sampling(
    frames,
    spatial_idx=-1,
    min_scale=256,
    max_scale=320,
    crop_size=224,
    random_horizontal_flip=True,
    inverse_uniform_sampling=False,
):
    """
    Perform spatial sampling on the given video frames. If spatial_idx is
    -1, perform random scale, random crop, and random flip on the given
    frames. If spatial_idx is 0, 1, or 2, perform spatial uniform sampling
    with the given spatial_idx.
    Args:
        frames (tensor): frames of images sampled from the video. The
            dimension is `num frames` x `height` x `width` x `channel`.
        spatial_idx (int): if -1, perform random spatial sampling. If 0, 1,
            or 2, perform left, center, right crop if width is larger than
            height, and perform top, center, bottom crop if height is larger
            than width.
        min_scale (int): the minimal size of scaling.
        max_scale (int): the maximal size of scaling.
        crop_size (int): the size of height and width used to crop the
            frames.
        random_horizontal_flip (bool): whether to apply a random horizontal
            flip (with probability 0.5) when spatial_idx is -1.
        inverse_uniform_sampling (bool): if True, sample uniformly in
            [1 / max_scale, 1 / min_scale] and take a reciprocal to get the
            scale. If False, take a uniform sample from [min_scale,
            max_scale].
    Returns:
        frames (tensor): spatially sampled frames.
    """
    assert spatial_idx in [-1, 0, 1, 2]
    if spatial_idx == -1:
        frames, _ = transform.random_short_side_scale_jitter(
            images=frames,
            min_size=min_scale,
            max_size=max_scale,
            inverse_uniform_sampling=inverse_uniform_sampling,
        )
        frames, _ = transform.random_crop(frames, crop_size)
        if random_horizontal_flip:
            frames, _ = transform.horizontal_flip(0.5, frames)
    else:
        # The testing is deterministic and no jitter should be performed.
        # min_scale, max_scale, and crop_size are expected to be the same.
        assert len({min_scale, max_scale, crop_size}) == 1
        frames, _ = transform.random_short_side_scale_jitter(
            frames, min_scale, max_scale)
        frames, _ = transform.uniform_crop(frames, crop_size, spatial_idx)
    return frames
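In the deterministic branch, spatial_idx selects the crop position exactly as the docstring describes: 0/1/2 give the left/center/right crop of the longer side (or top/center/bottom when the frames are taller than wide). The snippet below is not the library's transform.uniform_crop, only a self-contained sketch of that offset logic, assuming the frames are laid out as T x C x H x W and already scaled so the short side equals the crop size:

import torch

def uniform_crop_sketch(frames, size, spatial_idx):
    """frames: T x C x H x W tensor; returns a size x size spatial crop."""
    assert spatial_idx in [0, 1, 2]
    height, width = frames.shape[2], frames.shape[3]
    y_offset = (height - size) // 2
    x_offset = (width - size) // 2
    if height > width:
        # Tall frames: 0 = top, 1 = center, 2 = bottom.
        y_offset = [0, (height - size) // 2, height - size][spatial_idx]
    else:
        # Wide frames: 0 = left, 1 = center, 2 = right.
        x_offset = [0, (width - size) // 2, width - size][spatial_idx]
    return frames[:, :, y_offset:y_offset + size, x_offset:x_offset + size]

clip = torch.rand(8, 3, 224, 400)
print(uniform_crop_sketch(clip, 224, 2).shape)  # torch.Size([8, 3, 224, 224])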
Example 3
    def _prepare_im_res(self, im_path):
        # Prepare ResNet-style augmentation.
        im = self.load_image(im_path)
        # Train and test setups differ.
        train_size, test_size = (
            self.cfg.DATA.TRAIN_CROP_SIZE,
            self.cfg.DATA.TEST_CROP_SIZE,
        )
        if self.mode == "train":
            # For training use random_sized_crop, horizontal_flip, augment, lighting.
            im = transform.random_sized_crop_img(
                im,
                train_size,
                jitter_scale=self.cfg.DATA.TRAIN_JITTER_SCALES_RELATIVE,
                jitter_aspect=self.cfg.DATA.TRAIN_JITTER_ASPECT_RELATIVE,
            )
            im, _ = transform.horizontal_flip(prob=0.5, images=im)
            # im = transforms.augment(im, cfg.TRAIN.AUGMENT)
            im = transform.lighting_jitter(
                im,
                0.1,
                self.cfg.DATA.TRAIN_PCA_EIGVAL,
                self.cfg.DATA.TRAIN_PCA_EIGVEC,
            )
        else:
            # For testing use scale and center crop.
            im, _ = transform.uniform_crop(im,
                                           test_size,
                                           spatial_idx=1,
                                           scale_size=train_size)
        # For both training and testing, apply color normalization.
        im = transform.color_normalization(im, self.cfg.DATA.MEAN,
                                           self.cfg.DATA.STD)
        # Convert HWC/RGB/float to CHW/BGR/float format.
        # im = np.ascontiguousarray(im[:, :, ::-1].transpose([2, 0, 1]))
        return im
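The lighting_jitter step above is the AlexNet-style PCA lighting augmentation: one random weight is drawn per principal component of the RGB color distribution, and the resulting RGB offset is added to every pixel. Here is a rough, self-contained NumPy sketch of that idea; the eigenvalue/eigenvector constants are the commonly cited ImageNet statistics, used as placeholders rather than values read from any cfg:

import numpy as np

def pca_lighting_sketch(img, alphastd=0.1):
    """img: H x W x 3 float array in [0, 1]; returns the jittered image."""
    eigval = np.array([0.2175, 0.0188, 0.0045], dtype=np.float32)
    eigvec = np.array(
        [[-0.5675, 0.7192, 0.4009],
         [-0.5808, -0.0045, -0.8140],
         [-0.5836, -0.6948, 0.4203]],
        dtype=np.float32,
    )
    # One Gaussian weight per eigenvector.
    alpha = np.random.normal(0, alphastd, size=3).astype(np.float32)
    # Per-channel additive offset in RGB space.
    rgb_offset = eigvec @ (alpha * eigval)
    return img + rgb_offset.reshape(1, 1, 3)

img = np.random.rand(224, 224, 3).astype(np.float32)
print(pca_lighting_sketch(img).shape)  # (224, 224, 3)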
Example 4
    def _images_and_boxes_preprocessing(self, imgs, boxes):
        """
        This function performs preprocessing for the input images and
        corresponding boxes for one clip.

        Args:
            imgs (tensor): the images.
            boxes (ndarray): the boxes for the current clip.

        Returns:
            imgs (tensor): the preprocessed images.
            boxes (ndarray): preprocessed boxes.
        """
        # Image [0, 255] -> [0, 1].
        imgs = imgs.float()
        imgs = imgs / 255.0

        height, width = imgs.shape[2], imgs.shape[3]
        # The format of boxes is [x1, y1, x2, y2]. The input boxes are in the
        # range of [0, 1].
        boxes[:, [0, 2]] *= width
        boxes[:, [1, 3]] *= height
        boxes = transform.clip_boxes_to_image(boxes, height, width)

        if self._split == "train":
            # Train split
            imgs, boxes = transform.random_short_side_scale_jitter(
                imgs,
                min_size=self._jitter_min_scale,
                max_size=self._jitter_max_scale,
                boxes=boxes,
            )
            imgs, boxes = transform.random_crop(imgs,
                                                self._crop_size,
                                                boxes=boxes)

            # Random flip.
            imgs, boxes = transform.horizontal_flip(0.5, imgs, boxes=boxes)

        elif self._split == "val":
            # Val split
            # Resize short side to crop_size. Non-local and STRG use 256.
            imgs, boxes = transform.random_short_side_scale_jitter(
                imgs,
                min_size=self._crop_size,
                max_size=self._crop_size,
                boxes=boxes,
            )

            # Apply center crop for val split
            imgs, boxes = transform.uniform_crop(imgs,
                                                 size=self._crop_size,
                                                 spatial_idx=1,
                                                 boxes=boxes)

            if self._test_force_flip:
                imgs, boxes = transform.horizontal_flip(1, imgs, boxes=boxes)
        elif self._split == "test":
            # Test split
            # Resize short side to crop_size. Non-local and STRG use 256.
            imgs, boxes = transform.random_short_side_scale_jitter(
                imgs,
                min_size=self._crop_size,
                max_size=self._crop_size,
                boxes=boxes,
            )

            if self._test_force_flip:
                imgs, boxes = transform.horizontal_flip(1, imgs, boxes=boxes)
        else:
            raise NotImplementedError("{} split not supported yet!".format(
                self._split))

        if self.cfg.AVA.MANUAL_ROUND:
            # Quantize values to 8-bit precision and back (uint8 round-trip).
            imgs = (imgs * 255).byte().float() / 255

        # Do color augmentation (after dividing by 255.0).
        if self._split == "train" and self._use_color_augmentation:
            if not self._pca_jitter_only:
                imgs = transform.color_jitter(
                    imgs,
                    img_brightness=0.4,
                    img_contrast=0.4,
                    img_saturation=0.4,
                )

            imgs = transform.lighting(
                imgs,
                alphastd=0.1,
                eigval=np.array(self._pca_eigval).astype(np.float32),
                eigvec=np.array(self._pca_eigvec).astype(np.float32),
            )

        # Normalize images by mean and std.
        imgs = transform.color_normalization(
            imgs,
            np.array(self._data_mean, dtype=np.float32),
            np.array(self._data_std, dtype=np.float32),
        )

        if not self._use_bgr:
            # Convert image format from BGR to RGB.
            # Note that Kinetics pre-training uses RGB!
            imgs = imgs[:, [2, 1, 0], ...]

        boxes = transform.clip_boxes_to_image(boxes, self._crop_size,
                                              self._crop_size)

        return imgs, boxes
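The box handling at the start of this function (scale normalized [x1, y1, x2, y2] coordinates to pixels, then clip them to the image) is simple enough to reproduce on its own. Below is a minimal NumPy sketch of just that step; the clipping follows the usual convention of clamping to [0, size - 1], written out inline instead of calling transform.clip_boxes_to_image:

import numpy as np

def scale_and_clip_boxes(boxes, height, width):
    """boxes: N x 4 array of [x1, y1, x2, y2] in [0, 1]; returns pixel boxes."""
    boxes = boxes.astype(np.float32).copy()
    boxes[:, [0, 2]] *= width   # x coordinates to pixels
    boxes[:, [1, 3]] *= height  # y coordinates to pixels
    # Clamp to the valid pixel range.
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0.0, width - 1)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0.0, height - 1)
    return boxes

boxes = np.array([[0.1, 0.2, 0.9, 1.2]])
print(scale_and_clip_boxes(boxes, height=240, width=320))
# [[ 32.  48. 288. 239.]]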