Beispiel #1
0
 def __dataloader(self, train: bool):
     """Build the DataLoader for the training or the validation split.

     Args:
         train: truthy selects ``train_faces.csv`` with the "random" clip
             sampler and ``self.train_transform``; falsy selects
             ``valid_faces.csv`` with the "uniform" clip sampler and
             ``self.valid_transform``.

     Returns:
         A ``DataLoader`` over the dataset produced by
         ``self.create_dataset``, configured with ``self.batch_size`` and
         ``self.num_workers``.
     """
     # Pick the three split-specific ingredients first; only the transform
     # attribute belonging to the selected split is read.
     if train:
         csv_suffix, sampler_kind, split_transform = (
             "/train_faces.csv", "random", self.train_transform)
     else:
         csv_suffix, sampler_kind, split_transform = (
             "/valid_faces.csv", "uniform", self.valid_transform)

     split_dataset = self.create_dataset(
         data_path=self.data_path + csv_suffix,
         clip_sampler=make_clip_sampler(sampler_kind, self.clip_duration),
         transform=split_transform,
     )
     return DataLoader(
         dataset=split_dataset,
         batch_size=self.batch_size,
         num_workers=self.num_workers,
     )
 def val_dataloader(self):
     """Create the validation dataset and wrap it in a DataLoader.

     As a side effect the dataset is stored on ``self.val_dataset``: a
     ``LabeledVideoDataset`` over ``self.val_paths`` that uses the "uniform"
     clip sampler with ``self.clip_duration``, has audio decoding disabled
     and applies ``self.val_transforms``, wrapped in ``LimitDataset``.

     Returns:
         A ``DataLoader`` over ``self.val_dataset`` configured with
         ``self.batch_size`` and ``self.num_workers``.
     """
     uniform_sampler = make_clip_sampler('uniform', self.clip_duration)
     labeled_videos = LabeledVideoDataset(
         self.val_paths,
         clip_sampler=uniform_sampler,
         decode_audio=False,
         transform=self.val_transforms,
     )
     self.val_dataset = LimitDataset(labeled_videos)

     return DataLoader(
         self.val_dataset,
         batch_size=self.batch_size,
         num_workers=self.num_workers,
     )
Beispiel #3
0
def Ptvssv2(cfg, mode):
    """
    Construct PyTorchVideo Something-Something v2 (SSv2) video loader.
    Load SSv2 data (frame paths, labels, etc.) into an SSv2 Dataset object.
    The dataset could be downloaded from the Something-Something website
    (https://20bn.com/datasets/something-something).
    Please see datasets/DATASET.md for more information about the data format.
    For `train` and `val` mode, a single clip is taken from every video and
    transformed with random scaling and cropping (plus random horizontal
    flipping when `cfg.DATA.RANDOM_FLIP` is set). For `test` mode, clips are
    short-side scaled and uniformly cropped. For uniform cropping,
    we take the left, center, and right crop if the width is larger than height,
    or take top, center, and bottom crop if the height is larger than the width.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options includes `train`, `val`, or `test` mode.
            `train` reads `train.csv` and the `train` label json; both `val`
            and `test` read `val.csv` and the `validation` label json.
    Returns:
        PTVDatasetWrapper: wrapper around the SSv2 dataset carrying the
            number of videos, clips per video and crops per clip.
    Raises:
        AssertionError: if `mode` is not supported, or if `mode` is `test`
            and `cfg.TEST.NUM_ENSEMBLE_VIEWS` is not 1.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)

    # Name this loader in the log (the message previously said "Ptvcharades").
    logger.info("Constructing Ptvssv2 {}...".format(mode))

    if mode in ["train", "val"]:
        num_clips = 1
        num_crops = 1

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    RandomShortSideScale(
                        min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                        max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
                    ),
                    RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
                    Lambda(rgb2bgr),
                ] + ([RandomHorizontalFlipVideo(
                    p=0.5)] if cfg.DATA.RANDOM_FLIP else []) +
                                  [PackPathway(cfg)]),
            ),
            DictToTuple(num_clips, num_crops),
        ])
        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        else:
            # Only training shuffles the videos on a single device.
            video_sampler = (RandomSampler
                             if mode == "train" else SequentialSampler)
    else:
        # Testing is restricted to exactly one temporal view per video.
        assert cfg.TEST.NUM_ENSEMBLE_VIEWS == 1
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    ShortSideScale(size=cfg.DATA.TEST_CROP_SIZE),
                ]),
            ),
            UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
            ApplyTransformToKey(
                key="video",
                transform=Compose(
                    [Lambda(rgb2bgr), PackPathway(cfg)], ),
            ),
            DictToTuple(num_clips, num_crops),
        ])
        video_sampler = (DistributedSampler
                         if cfg.NUM_GPUS > 1 else SequentialSampler)

    # Both branches use the same sampler type; only the clip/crop counts
    # chosen above differ, so it is built once here.
    clip_sampler = make_clip_sampler(
        "constant_clips_per_video",
        1,  # Put arbitrary duration as ssv2 always needs full video clip.
        num_clips,
        num_crops,
    )

    label_name_file = os.path.join(cfg.DATA.PATH_TO_DATA_DIR,
                                   "something-something-v2-labels.json")
    # Every mode other than `train` reads the validation annotations.
    video_label_file = os.path.join(
        cfg.DATA.PATH_TO_DATA_DIR, "something-something-v2-{}.json".format(
            "train" if mode == "train" else "validation"))
    data_path = os.path.join(
        cfg.DATA.PATH_TO_DATA_DIR,
        "{}.csv".format("train" if mode == "train" else "val"),
    )
    dataset = SSv2(
        label_name_file=label_name_file,
        video_label_file=video_label_file,
        video_path_label_file=data_path,
        clip_sampler=clip_sampler,
        video_sampler=video_sampler,
        transform=transform,
        video_path_prefix=cfg.DATA.PATH_PREFIX,
        frames_per_clip=cfg.DATA.NUM_FRAMES,
        rand_sample_frames=mode == "train",
    )

    num_videos = len(dataset._path_to_videos)
    logger.info("Constructing ssv2 dataloader (size: {}) from {}".format(
        num_videos, data_path))

    return PTVDatasetWrapper(
        num_videos=num_videos,
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=dataset,
    )
Beispiel #4
0
def Ptvcharades(cfg, mode):
    """
    Construct the PyTorchVideo Charades video loader for one split.
    Charades data is read from `<cfg.DATA.PATH_TO_DATA_DIR>/<mode>.csv` into a
    Charades Dataset object. The dataset could be downloaded from the Charades
    official website (https://allenai.org/plato/charades/).
    Please see datasets/DATASET.md for more information about the data format.
    For `train` and `val` mode, one clip per video is drawn with the "random"
    clip sampler and transformed with random scaling and cropping (plus random
    horizontal flipping when `cfg.DATA.RANDOM_FLIP` is set). For `test` mode,
    `cfg.TEST.NUM_ENSEMBLE_VIEWS` clips per video are requested from the
    "constant_clips_per_video" sampler, short-side scaled and passed through
    `UniformCropVideo`.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options includes `train`, `val`, or `test` mode.
    Returns:
        PTVDatasetWrapper: wrapper around the Charades dataset carrying the
            number of videos, clips per video and crops per clip.
    """
    # Only support train, val, and test mode.
    supported_modes = ["train", "val", "test"]
    assert mode in supported_modes, "Split '{}' not supported".format(mode)

    logger.info("Constructing Ptvcharades {}...".format(mode))

    # Frame span covered by one clip, divided by the target FPS.
    frame_span = (cfg.DATA.NUM_FRAMES - 1) * cfg.DATA.SAMPLING_RATE + 1
    clip_duration = frame_span / cfg.DATA.TARGET_FPS

    # Label post-processing step used by both the train/val and test pipelines.
    label_step = Lambda(
        functools.partial(
            process_charades_label,
            mode=mode,
            num_classes=cfg.MODEL.NUM_CLASSES,
        ))

    if mode not in ["train", "val"]:
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        scale_steps = Compose([
            Lambda(div255),
            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
            ShortSideScale(size=cfg.DATA.TEST_CROP_SIZE),
        ])
        pack_steps = Compose([Lambda(rgb2bgr), PackPathway(cfg)])
        pipeline = Compose([
            ApplyTransformToKey(key="video", transform=scale_steps),
            UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
            label_step,
            ApplyTransformToKey(key="video", transform=pack_steps),
            DictToTuple(num_clips, num_crops),
        ])
        clips = make_clip_sampler(
            "constant_clips_per_video",
            clip_duration,
            num_clips,
            num_crops,
        )
        if cfg.NUM_GPUS > 1:
            videos = DistributedSampler
        else:
            videos = SequentialSampler
    else:
        num_clips = num_crops = 1

        video_steps = [
            Lambda(div255),
            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
            RandomShortSideScale(
                min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
            ),
            RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
            Lambda(rgb2bgr),
        ]
        # The flip is optional and sits right before the pathway packing.
        if cfg.DATA.RANDOM_FLIP:
            video_steps.append(RandomHorizontalFlipVideo(p=0.5))
        video_steps.append(PackPathway(cfg))

        pipeline = Compose([
            ApplyTransformToKey(key="video", transform=Compose(video_steps)),
            label_step,
            DictToTuple(num_clips, num_crops),
        ])
        clips = make_clip_sampler("random", clip_duration)
        if cfg.NUM_GPUS > 1:
            videos = DistributedSampler
        elif mode == "train":
            videos = RandomSampler
        else:
            videos = SequentialSampler

    data_path = os.path.join(cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(mode))
    charades = Charades(
        data_path=data_path,
        clip_sampler=clips,
        video_sampler=videos,
        transform=pipeline,
        video_path_prefix=cfg.DATA.PATH_PREFIX,
        frames_per_clip=cfg.DATA.NUM_FRAMES,
    )

    video_count = len(charades._path_to_videos)
    logger.info("Constructing charades dataloader (size: {}) from {}".format(
        video_count, data_path))

    return PTVDatasetWrapper(
        num_videos=video_count,
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=charades,
    )
Beispiel #5
0
def Ptvkinetics(cfg, mode):
    """
    Construct the Kinetics video loader with a given csv file. The csv file is
    `<cfg.DATA.PATH_TO_DATA_DIR>/<mode>.csv` and its format is:
    ```
    path_to_video_1 label_1
    path_to_video_2 label_2
    ...
    path_to_video_N label_N
    ```
    For `train` and `val` mode, one clip per video is drawn with the "random"
    clip sampler and transformed with random scaling and cropping (plus random
    horizontal flipping when `cfg.DATA.RANDOM_FLIP` is set). For `test` mode,
    `cfg.TEST.NUM_ENSEMBLE_VIEWS` clips per video are requested from the
    "constant_clips_per_video" sampler, short-side scaled to
    `cfg.DATA.TRAIN_JITTER_SCALES[0]` and passed through `UniformCropVideo`
    with `cfg.DATA.TEST_CROP_SIZE`.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options includes `train`, `val`, or `test` mode.
    Returns:
        PTVDatasetWrapper: wrapper around a LabeledVideoDataset (audio
            decoding disabled) carrying the number of videos, clips per video
            and crops per clip.
    """
    # Only support train, val, and test mode.
    supported_modes = ["train", "val", "test"]
    assert mode in supported_modes, "Split '{}' not supported".format(mode)

    logger.info("Constructing Ptvkinetics {}...".format(mode))

    sampled_frames = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    clip_duration = sampled_frames / cfg.DATA.TARGET_FPS

    csv_path = os.path.join(cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(mode))
    video_paths = LabeledVideoPaths.from_path(csv_path)
    num_videos = len(video_paths)
    video_paths.path_prefix = cfg.DATA.PATH_PREFIX
    logger.info("Constructing kinetics dataloader (size: {}) from {}".format(
        num_videos, csv_path))

    def leading_steps():
        # Subsample / rescale / normalize prefix shared by every mode.
        return [
            UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
            Lambda(div255),
            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
        ]

    if mode not in ["train", "val"]:
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        video_steps = leading_steps()
        video_steps.append(
            ShortSideScale(size=cfg.DATA.TRAIN_JITTER_SCALES[0]))
        pipeline = Compose([
            ApplyTransformToKey(key="video", transform=Compose(video_steps)),
            UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
            ApplyTransformToKey(key="video", transform=PackPathway(cfg)),
            DictToTuple(num_clips, num_crops),
        ])
        clips = make_clip_sampler(
            "constant_clips_per_video",
            clip_duration,
            num_clips,
            num_crops,
        )
        if cfg.NUM_GPUS > 1:
            videos = DistributedSampler
        else:
            videos = SequentialSampler
    else:
        num_clips = num_crops = 1

        video_steps = leading_steps()
        video_steps.append(
            RandomShortSideScale(
                min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
            ))
        video_steps.append(RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE))
        # The flip is optional and sits right before the pathway packing.
        if cfg.DATA.RANDOM_FLIP:
            video_steps.append(RandomHorizontalFlipVideo(p=0.5))
        video_steps.append(PackPathway(cfg))

        pipeline = Compose([
            ApplyTransformToKey(key="video", transform=Compose(video_steps)),
            DictToTuple(num_clips, num_crops),
        ])
        clips = make_clip_sampler("random", clip_duration)
        if cfg.NUM_GPUS > 1:
            videos = DistributedSampler
        elif mode == "train":
            videos = RandomSampler
        else:
            videos = SequentialSampler

    kinetics = LabeledVideoDataset(
        labeled_video_paths=video_paths,
        clip_sampler=clips,
        video_sampler=videos,
        transform=pipeline,
        decode_audio=False,
    )
    return PTVDatasetWrapper(
        num_videos=num_videos,
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=kinetics,
    )