def run_distributed(rank, size, decoder, clip_duration, data_name,
                    return_dict):
    """
    This function is run by each distributed process. It samples videos
    based on the distributed split (determined by the
    DistributedSampler) and returns the dataset clips in the return_dict.
    """
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=size)
    clip_sampler = make_clip_sampler("uniform", clip_duration)
    labeled_video_paths = LabeledVideoPaths.from_path(data_name)
    dataset = LabeledVideoDataset(
        labeled_video_paths,
        clip_sampler=clip_sampler,
        video_sampler=DistributedSampler,
        decode_audio=False,
        decoder=decoder,
    )
    test_dataloader = DataLoader(dataset, batch_size=None, num_workers=1)

    # Run two epochs, simulating use in a training loop
    dataset.video_sampler.set_epoch(0)
    epoch_1 = [(sample["label"], sample["video"])
               for sample in test_dataloader]
    dataset.video_sampler.set_epoch(1)
    epoch_2 = [(sample["label"], sample["video"])
               for sample in test_dataloader]
    return_dict[rank] = {"epoch_1": epoch_1, "epoch_2": epoch_2}
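A minimal launcher sketch for the function above, assuming a two-process run on a single machine (`decoder`, `clip_duration`, and `data_name` stand in for values supplied by the surrounding test):

import torch.multiprocessing as mp

def launch(world_size, decoder, clip_duration, data_name):
    # Shared dict so each worker can report its sampled clips to the parent.
    manager = mp.Manager()
    return_dict = manager.dict()
    processes = []
    for rank in range(world_size):
        p = mp.Process(
            target=run_distributed,
            args=(rank, world_size, decoder, clip_duration, data_name,
                  return_dict),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    return dict(return_dict)

For example, launch(2, "pyav", 1.0, "videos.csv") runs two ranks, each decoding its DistributedSampler split of the videos for two epochs.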
Example #2
 def __init__(
     self,
     running_stage: RunningStage,
     data: Any,
     *args,
     clip_sampler: str = "random",
     clip_duration: float = 2,
     video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
     decode_audio: bool = False,
     decoder: str = "pyav",
     clip_sampler_kwargs: Optional[Dict] = None,
     data_folder: str = "",
     **kwargs,
 ):
     if not _PYTORCHVIDEO_AVAILABLE:
         raise ModuleNotFoundError(
             "Please, run `pip install pytorchvideo`.")
     self.video_sampler = video_sampler or torch.utils.data.RandomSampler
     clip_sampler_kwargs = clip_sampler_kwargs or {}
     self.clip_sampler = make_clip_sampler(clip_sampler, clip_duration,
                                           **clip_sampler_kwargs)
     self.decode_audio = decode_audio
     self.decoder = decoder
     self.clip_duration = clip_duration
     self._data_folder = data_folder
     super().__init__(running_stage, data, *args, **kwargs)
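A hedged construction sketch for the class above (the concrete subclass name VideoClassificationInput is hypothetical and the RunningStage import path is an assumption; only the keyword arguments come from the signature above):

from pytorch_lightning.trainer.states import RunningStage  # assumed import path
from torch.utils.data import SequentialSampler

ds = VideoClassificationInput(  # hypothetical subclass with the __init__ above
    RunningStage.TRAINING,
    "path/to/videos",
    clip_sampler="uniform",
    clip_duration=1.0,
    video_sampler=SequentialSampler,
    decode_audio=False,
    decoder="pyav",
)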
Example #3
    def test_multiple_labels_per_frame(self):
        frame_names = [f"{str(i)}.png" for i in range(3)]

        # Create a csv containing test frame videos.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as f:
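            # Note: "original_vido_id" (sic) is the exact header string the
            # pytorchvideo frame-video CSV reader expects.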
            f.write(
                "original_vido_id video_id frame_id path labels\n".encode())
            with temp_frame_video(frame_names) as (frame_1_video_dir, data_1):
                for i, frame_name in enumerate(frame_names):
                    original_video_id = str(frame_1_video_dir)
                    video_id = "1"
                    frame_id = str(i)
                    path = pathlib.Path(frame_1_video_dir) / frame_name
                    label = "0,100"
                    f.write(
                        f"{original_video_id} {video_id} {frame_id} {path} {label}\n"
                        .encode())

                f.close()

                clip_sampler = make_clip_sampler(
                    "random",
                    0.1,  # Total duration of 3 frames at 30fps is 0.1 seconds.
                )
                dataset = Charades(f.name,
                                   clip_sampler=clip_sampler,
                                   video_sampler=SequentialSampler)

                sample = next(dataset)
                self.assertEqual(sample["label"],
                                 [[0, 100], [0, 100], [0, 100]])
                self.assertTrue(sample["video"].equal(data_1))
Example #4
    def test_single_clip_per_video_works(self):
        with temp_ssv2_dataset() as (
                label_name_file,
                video_label_file,
                video_path_file,
                video_1,
                video_2,
        ):

            # Use an arbitrary duration; SSv2 always needs the full video clip.
            clip_sampler = make_clip_sampler("constant_clips_per_video", 1.0,
                                             1)
            # Expect 2 frames to be taken (indices 1 and 4 of the 7 frames).
            dataset = SSv2(
                label_name_file,
                video_label_file,
                video_path_file,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                frames_per_clip=2,
            )
            expected = [(0, video_1), (1, video_2)]
            for sample, expected_sample in zip(dataset, expected):
                self.assertEqual(sample["label"], expected_sample[0])
                self.assertTrue(sample["video"].equal(expected_sample[1][:,
                                                                         (1,
                                                                          4)]))
Example #5
    def test_video_name_with_whitespace_works(self, decoder):
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=num_frames,
                                fps=fps,
                                prefix="pre fix") as (
                                    video_file_name,
                                    data,
                                ):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
                f.write(f"{video_file_name} 0\n".encode())
                f.write(f"{video_file_name} 1\n".encode())

            total_duration = num_frames / fps
            clip_sampler = make_clip_sampler("uniform", total_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(f.name)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            expected = [(0, data), (1, data)]
            for i, sample in enumerate(dataset):
                self.assertTrue(sample["video"].equal(expected[i][1]))
                self.assertEqual(sample["label"], expected[i][0])
Example #6
    def test_sampling_with_more_processes_than_videos(self, decoder):
        with mock_encoded_video_dataset_file() as (
                mock_csv,
                label_videos,
                total_duration,
        ):
            half_duration = total_duration / 2 - self._EPS
            clip_sampler = make_clip_sampler("uniform", half_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            # Split each full video into two clips.
            expected = []
            for label, data in label_videos:
                num_frames = data.shape[0]
                half_frames = num_frames // 2
                first_half_data = data[:, :half_frames]
                second_half_data = data[:, half_frames:]
                expected.append((label, first_half_data))
                expected.append((label, second_half_data))

            test_dataloader = DataLoader(dataset,
                                         batch_size=None,
                                         num_workers=16)
            actual = [(sample["label"], sample["video"])
                      for sample in test_dataloader]
            assert_unordered_list_compare_true(self, expected, actual)
Example #7
def _make_clip_sampler(
    clip_sampler: Union[str, "ClipSampler"] = "random",
    clip_duration: float = 2,
    clip_sampler_kwargs: Optional[Dict[str, Any]] = None,
) -> "ClipSampler":
    if clip_sampler_kwargs is None:
        clip_sampler_kwargs = {}
    return make_clip_sampler(clip_sampler, clip_duration, **clip_sampler_kwargs)
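A brief usage sketch (the durations are illustrative):

# Defaults resolve to a random 2-second clip sampler.
sampler = _make_clip_sampler()

# A named sampler type with an explicit duration: uniformly tiled 3-second clips.
sampler = _make_clip_sampler("uniform", clip_duration=3.0)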
Example #8
    def __init__(
        self,
        train_transform: Optional[Dict[str, Callable]] = None,
        val_transform: Optional[Dict[str, Callable]] = None,
        test_transform: Optional[Dict[str, Callable]] = None,
        predict_transform: Optional[Dict[str, Callable]] = None,
        clip_sampler: Union[str, 'ClipSampler'] = "random",
        clip_duration: float = 2,
        clip_sampler_kwargs: Optional[Dict[str, Any]] = None,
        video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
        decode_audio: bool = True,
        decoder: str = "pyav",
    ):
        self.clip_sampler = clip_sampler
        self.clip_duration = clip_duration
        self.clip_sampler_kwargs = clip_sampler_kwargs
        self.video_sampler = video_sampler
        self.decode_audio = decode_audio
        self.decoder = decoder

        if not _PYTORCHVIDEO_AVAILABLE:
            raise ModuleNotFoundError(
                "Please, run `pip install pytorchvideo`.")

        if not clip_sampler_kwargs:
            clip_sampler_kwargs = {}

        if not clip_sampler:
            raise MisconfigurationException(
                "clip_sampler should be provided as a string or ``pytorchvideo.data.clip_sampling.ClipSampler``"
            )

        clip_sampler = make_clip_sampler(clip_sampler, clip_duration,
                                         **clip_sampler_kwargs)

        super().__init__(
            train_transform=train_transform,
            val_transform=val_transform,
            test_transform=test_transform,
            predict_transform=predict_transform,
            data_sources={
                DefaultDataSources.FILES:
                VideoClassificationPathsDataSource(
                    clip_sampler,
                    video_sampler=video_sampler,
                    decode_audio=decode_audio,
                    decoder=decoder,
                ),
                DefaultDataSources.FOLDERS:
                VideoClassificationPathsDataSource(
                    clip_sampler,
                    video_sampler=video_sampler,
                    decode_audio=decode_audio,
                    decoder=decoder,
                ),
            },
            default_data_source=DefaultDataSources.FILES,
        )
Example #9
    def test_reading_from_directory_structure(self, decoder):
        # For an unknown reason this import has to be here for `buck test` to work.
        import torchvision.io as io

        with tempfile.TemporaryDirectory() as root_dir:

            # Create test directory structure with two classes and a video in each.
            root_dir_name = pathlib.Path(root_dir)
            test_class_1 = root_dir_name / "running"
            test_class_1.mkdir()
            data_1 = create_dummy_video_frames(15, 10, 10)
            test_class_2 = root_dir_name / "cleaning windows"
            test_class_2.mkdir()
            data_2 = create_dummy_video_frames(20, 15, 15)
            with tempfile.NamedTemporaryFile(
                    suffix=".mp4",
                    dir=test_class_1) as f_1, tempfile.NamedTemporaryFile(
                        suffix=".mp4", dir=test_class_2) as f_2:
                f_1.close()
                f_2.close()

                # Write lossless video for each class.
                io.write_video(
                    f_1.name,
                    data_1,
                    fps=30,
                    video_codec="libx264rgb",
                    options={"crf": "0"},
                )
                io.write_video(
                    f_2.name,
                    data_2,
                    fps=30,
                    video_codec="libx264rgb",
                    options={"crf": "0"},
                )

                clip_sampler = make_clip_sampler("uniform", 3)
                labeled_video_paths = LabeledVideoPaths.from_path(root_dir)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                # Videos are sorted alphabetically so "cleaning windows" (i.e. data_2)
                # will be first.
                sample_1 = next(dataset)
                self.assertEqual(sample_1["label"], 0)
                self.assertTrue(sample_1["video"].equal(
                    thwc_to_cthw(data_2).to(torch.float32)))

                sample_2 = next(dataset)
                self.assertEqual(sample_2["label"], 1)
                self.assertTrue(sample_2["video"].equal(
                    thwc_to_cthw(data_1).to(torch.float32)))
Example #10
    def test_video_only_frame_video_dataset(self):
        total_duration = 2.0
        with mock_json_annotations() as (annotation_json, labels, duration):
            clip_sampler = make_clip_sampler("random", total_duration)
            dataset = json_dataset.video_only_dataset(
                data_path=annotation_json,
                clip_sampler=clip_sampler,
                decode_audio=False,
            )

            self.assertEqual(dataset.num_videos, 2)
            self.assertEqual(len(list(iter(dataset))), 2)
Example #11
    def test_recognition_uniform_clip_sampler(self):
        total_duration = 0.05
        with mock_json_annotations() as (annotation_json, labels, duration):
            clip_sampler = make_clip_sampler("uniform", total_duration)
            dataset = json_dataset.clip_recognition_dataset(
                data_path=annotation_json,
                clip_sampler=clip_sampler,
                decode_audio=False,
            )

            self.assertEqual(dataset.num_videos, 4)
            self.assertEqual(len(list(iter(dataset))), 4)
Example #12
 def test_single_clip_per_video_works(self):
     with temp_charades_dataset() as (filename, video_1, video_2):
         clip_sampler = make_clip_sampler(
             "uniform",
             0.1  # Total duration of 3 frames at 30fps is 0.1 seconds.
         )
         dataset = Charades(filename,
                            clip_sampler=clip_sampler,
                            video_sampler=SequentialSampler)
         expected = [([[0], [0], [0]], video_1), ([[1], [1], [1]], video_2)]
         for sample, expected_sample in zip(dataset, expected):
             self.assertEqual(sample["label"], expected_sample[0])
             self.assertTrue(sample["video"].equal(expected_sample[1]))
Example #13
    def test_random_video_sampler(self, decoder):
        with mock_encoded_video_dataset_file() as (mock_csv, expected,
                                                   total_duration):
            clip_sampler = make_clip_sampler("uniform", total_duration)
            dataset = labeled_video_dataset(
                data_path=mock_csv,
                clip_sampler=clip_sampler,
                video_sampler=RandomSampler,
                decode_audio=False,
                decoder=decoder,
            )

            for _ in range(2):
                actual = [(sample["label"], sample["video"])
                          for sample in dataset]
                assert_unordered_list_compare_true(self, expected, actual)
Example #14
    def test_sampling_with_non_divisible_processes_by_clips(self, decoder):

        # Make one video with 15 frames and one with 10 frames, producing 3 clips and 2
        # clips respectively.
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
                video_file_name_1,
                data_1,
        ):
            with temp_encoded_video(num_frames=num_frames, fps=fps) as (
                    video_file_name_2,
                    data_2,
            ):
                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix=".txt") as f:
                    f.write(f"{video_file_name_1} 0\n".encode())
                    f.write(f"{video_file_name_2} 1\n".encode())

                total_duration = num_frames / fps
                half_duration = total_duration / 2 - self._EPS
                clip_sampler = make_clip_sampler("uniform", half_duration)
                labeled_video_paths = LabeledVideoPaths.from_path(f.name)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                half_frames = num_frames // 2
                expected = {
                    (0, data_1[:, half_frames * 2:]),  # 3/3 clip
                    (0, data_1[:, half_frames:half_frames * 2]),  # 2/3 clip
                    (0, data_1[:, :half_frames]),  # 1/3 clip
                    (1, data_2[:, :half_frames]),  # First half
                    (1, data_2[:, half_frames:]),  # Second half
                }

                test_dataloader = DataLoader(dataset,
                                             batch_size=None,
                                             num_workers=2)
                actual = [(sample["label"], sample["video"])
                          for sample in test_dataloader]
                assert_unordered_list_compare_true(self, expected, actual)
Example #15
    def test_constant_clips_per_video_sampling_works(self, decoder):
        # Make one video with 15 frames and one with 10 frames, producing 3 clips and 2
        # clips respectively.
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
                video_file_name_1,
                data_1,
        ):
            with temp_encoded_video(num_frames=num_frames, fps=fps) as (
                    video_file_name_2,
                    data_2,
            ):
                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix=".txt") as f:
                    f.write(f"{video_file_name_1} 0\n".encode())
                    f.write(f"{video_file_name_2} 1\n".encode())

                clip_frames = 2
                duration_for_frames = clip_frames / fps - self._EPS
                clip_sampler = make_clip_sampler("constant_clips_per_video",
                                                 duration_for_frames, 2)
                labeled_video_paths = LabeledVideoPaths.from_path(f.name)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                # Dataset has 2 videos. Two evenly spaced clips of clip_frames
                # frames are sampled from each video. The first clip of each
                # video always starts at second 0. The second clip starts at the
                # next frame after time (total_duration - clip_duration) / 2.
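                # Concretely, with this test's numbers: data_1 has 15 frames at
                # 5 fps, so its duration is 3.0s and clip_duration is just under
                # 0.4s; the second clip then starts near (3.0 - 0.4) / 2 = 1.3s,
                # i.e. at frame ceil((15 - 2) / 2) = 7 (half_frames_1 below).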
                half_frames_1 = math.ceil((data_1.shape[1] - clip_frames) / 2)
                half_frames_2 = math.ceil((data_2.shape[1] - clip_frames) / 2)
                expected = [
                    (0, data_1[:, :clip_frames]),
                    (0, data_1[:, half_frames_1:half_frames_1 + clip_frames]),
                    (1, data_2[:, :clip_frames]),
                    (1, data_2[:, half_frames_2:half_frames_2 + clip_frames]),
                ]
                for i, sample in enumerate(dataset):
                    self.assertTrue(sample["video"].equal(expected[i][1]))
                    self.assertEqual(sample["label"], expected[i][0])
Example #16
    def test_single_clip_per_video_works(self, decoder):
        with mock_encoded_video_dataset_file() as (mock_csv, expected,
                                                   total_duration):
            clip_sampler = make_clip_sampler("uniform", total_duration)
            dataset = labeled_video_dataset(
                data_path=mock_csv,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )
            test_dataloader = DataLoader(dataset,
                                         batch_size=None,
                                         num_workers=2)

            for _ in range(2):
                actual = [(sample["label"], sample["video"])
                          for sample in test_dataloader]
                assert_unordered_list_compare_true(self, expected, actual)
Example #17
    def test_multiple_clips_per_video_works(self):
        with temp_charades_dataset() as (filename, video_1, video_2):
            clip_sampler = make_clip_sampler(
                "uniform",
                0.033  # Expects each clip to have 1 frame each.
            )
            dataset = Charades(filename,
                               clip_sampler=clip_sampler,
                               video_sampler=SequentialSampler)

            expected = [
                ([[0]], video_1[:, 0:1]),
                ([[0]], video_1[:, 1:2]),
                ([[0]], video_1[:, 2:3]),
                ([[1]], video_2[:, 0:1]),
                ([[1]], video_2[:, 1:2]),
                ([[1]], video_2[:, 2:3]),
            ]
            for sample, expected_sample in zip(dataset, expected):
                self.assertEqual(sample["label"], expected_sample[0])
                self.assertTrue(sample["video"].equal(expected_sample[1]))
Example #18
    def test_random_clip_sampling_works(self, decoder):
        with mock_encoded_video_dataset_file() as (
                mock_csv,
                label_videos,
                total_duration,
        ):
            half_duration = total_duration / 2 - self._EPS
            clip_sampler = make_clip_sampler("random", half_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            expected_labels = [label for label, _ in label_videos]
            for i, sample in enumerate(dataset):
                expected_t_shape = 5
                self.assertEqual(sample["video"].shape[1], expected_t_shape)
                self.assertEqual(sample["label"], expected_labels[i])
Example #19
    def test_multiple_videos(self):
        with tempfile.NamedTemporaryFile(delete=False,
                                         suffix=".csv") as data_file:
            with temp_ava_dataset_2_videos() as (
                    frame_paths_file,
                    video_1,
                    video_2,
                    video_1_name,
                    video_2_name,
            ):
                # add bounding boxes
                # video 1
                bb_1_a, bb_1_a_string = get_random_bbox()
                action_1_a, iou_1_a = 1, 0.85
                bb_1_b, bb_1_b_string = get_random_bbox()
                action_1_b, iou_1_b = 2, 0.4

                data_file.write(
                    (f"{video_1_name},902,{bb_1_a_string}," +
                     f"{str(action_1_a)},{str(iou_1_a)}\n").encode())
                data_file.write(
                    (f"{video_1_name},902,{bb_1_b_string}," +
                     f"{str(action_1_b)},{str(iou_1_b)}\n").encode())
                # video 2
                bb_2_a, bb_2_a_string = get_random_bbox()
                action_2_a, iou_2_a = 3, 0.95
                bb_2_b, bb_2_b_string = get_random_bbox()
                action_2_b, iou_2_b = 4, 0.9

                data_file.write(
                    (f"{video_2_name},902,{bb_2_a_string}," +
                     f"{str(action_2_a)},{str(iou_2_a)}\n").encode())
                data_file.write(
                    (f"{video_2_name},902,{bb_2_b_string}," +
                     f"{str(action_2_b)},{str(iou_2_b)}\n").encode())

                data_file.close()

                dataset = Ava(
                    frame_paths_file=frame_paths_file,
                    frame_labels_file=data_file.name,
                    clip_sampler=make_clip_sampler("random", 1.0),
                )

                # All videos are of the form cthw and fps is 30.
                # The clip is sampled at time step = 2 secs into the video.
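                # (A 1.0s clip at 30 fps spans 30 frames; centered on the 2s
                # keyframe, i.e. frame 60, it covers frames [45, 75).)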
                sample_1 = next(dataset)
                self.assertTrue(sample_1["video"].equal(video_1[:,
                                                                45:75, :, :]))
                self.assertTrue(
                    torch.tensor(sample_1["boxes"]).equal(
                        torch.tensor([bb_1_a, bb_1_b])))
                self.assertTrue(
                    torch.tensor(sample_1["labels"]).equal(
                        torch.tensor([[action_1_a], [action_1_b]])))
                sample_2 = next(dataset)
                self.assertTrue(sample_2["video"].equal(video_2[:,
                                                                45:75, :, :]))
                self.assertTrue(
                    torch.tensor(sample_2["boxes"]).equal(
                        torch.tensor([bb_2_a, bb_2_b])))
                self.assertTrue(
                    torch.tensor(sample_2["labels"]).equal(
                        torch.tensor([[action_2_a], [action_2_b]])))
Example #20
    @classmethod
    def from_paths(
        cls,
        train_data_path: Optional[Union[str, pathlib.Path]] = None,
        val_data_path: Optional[Union[str, pathlib.Path]] = None,
        test_data_path: Optional[Union[str, pathlib.Path]] = None,
        predict_data_path: Optional[Union[str, pathlib.Path]] = None,
        clip_sampler: Union[str, 'ClipSampler'] = "random",
        clip_duration: float = 2,
        clip_sampler_kwargs: Optional[Dict[str, Any]] = None,
        video_sampler: Type[Sampler] = RandomSampler,
        decode_audio: bool = True,
        decoder: str = "pyav",
        train_transform: Optional[Dict[str, Callable]] = None,
        val_transform: Optional[Dict[str, Callable]] = None,
        test_transform: Optional[Dict[str, Callable]] = None,
        predict_transform: Optional[Dict[str, Callable]] = None,
        batch_size: int = 4,
        num_workers: Optional[int] = None,
        preprocess: Optional[Preprocess] = None,
        **kwargs,
    ) -> 'DataModule':
        """

        Creates a VideoClassificationData object from folders of videos arranged in this way: ::

            train/class_x/xxx.ext
            train/class_x/xxy.ext
            train/class_x/xxz.ext
            train/class_y/123.ext
            train/class_y/nsdf3.ext
            train/class_y/asd932_.ext

        Args:
            train_data_path: Path to training folder. Default: None.
            val_data_path: Path to validation folder. Default: None.
            test_data_path: Path to test folder. Default: None.
            predict_data_path: Path to predict folder. Default: None.
            clip_sampler: ClipSampler to be used on videos.
            clip_duration: Clip duration for the clip sampler.
            clip_sampler_kwargs: Extra ClipSampler keyword arguments.
            video_sampler: Sampler for the internal video container.
                This defines the order videos are decoded and, if necessary, the distributed split.
            decode_audio: Whether to decode the audio with the video clip.
            decoder: Defines what type of decoder is used to decode a video.
            train_transform: Video clip dictionary transform to use for the training set.
            val_transform: Video clip dictionary transform to use for the validation set.
            test_transform: Video clip dictionary transform to use for the test set.
            predict_transform: Video clip dictionary transform to use for the predict set.
            batch_size: Batch size for data loading.
            num_workers: The number of workers to use for parallelized loading.
                Defaults to ``None`` which equals the number of available CPU threads.
            preprocess: VideoClassifierPreprocess to handle the data processing.

        Returns:
            VideoClassificationData: the constructed data module

        Examples:
            >>> videos = VideoClassificationData.from_paths("train/") # doctest: +SKIP

        """
        if not _PYTORCHVIDEO_AVAILABLE:
            raise ModuleNotFoundError("Please, run `pip install pytorchvideo`.")

        if not clip_sampler_kwargs:
            clip_sampler_kwargs = {}

        if not clip_sampler:
            raise MisconfigurationException(
                "clip_sampler should be provided as a string or ``pytorchvideo.data.clip_sampling.ClipSampler``"
            )

        clip_sampler = make_clip_sampler(clip_sampler, clip_duration, **clip_sampler_kwargs)

        preprocess: Preprocess = preprocess or cls.preprocess_cls(
            clip_sampler, video_sampler, decode_audio, decoder, train_transform, val_transform, test_transform,
            predict_transform
        )

        return cls.from_load_data_inputs(
            train_load_data_input=train_data_path,
            val_load_data_input=val_data_path,
            test_load_data_input=test_data_path,
            predict_load_data_input=predict_data_path,
            batch_size=batch_size,
            num_workers=num_workers,
            preprocess=preprocess,
            use_iterable_auto_dataset=True,
            **kwargs,
        )
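A hedged usage sketch expanding the doctest above (paths and values are placeholders):

datamodule = VideoClassificationData.from_paths(
    train_data_path="data/train/",
    val_data_path="data/val/",
    clip_sampler="uniform",
    clip_duration=2,
    decode_audio=False,
    batch_size=8,
    num_workers=4,
)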
Example #21
    def test_reading_from_directory_structure_hmdb51(self, decoder):
        # For an unknown reason this import has to be here for `buck test` to work.
        import torchvision.io as io

        with tempfile.TemporaryDirectory() as root_dir:

            # Create test directory structure with two classes and a video in each.
            root_dir_name = pathlib.Path(root_dir)
            action_1 = "running"
            action_2 = "cleaning_windows"

            videos_root_dir = root_dir_name / "videos"
            videos_root_dir.mkdir()

            test_class_1 = videos_root_dir / action_1
            test_class_1.mkdir()
            data_1 = create_dummy_video_frames(15, 10, 10)
            test_class_2 = videos_root_dir / action_2
            test_class_2.mkdir()
            data_2 = create_dummy_video_frames(20, 15, 15)

            test_splits = root_dir_name / "folds"
            test_splits.mkdir()

            with tempfile.NamedTemporaryFile(
                    suffix="_u_nm_np1_ba_goo_19.avi",
                    dir=test_class_1) as f_1, tempfile.NamedTemporaryFile(
                        suffix="_u_nm_np1_fr_med_1.avi",
                        dir=test_class_2) as f_2:
                f_1.close()
                f_2.close()

                # Write lossless video for each class.
                io.write_video(
                    f_1.name,
                    data_1,
                    fps=30,
                    video_codec="libx264rgb",
                    options={"crf": "0"},
                )
                io.write_video(
                    f_2.name,
                    data_2,
                    fps=30,
                    video_codec="libx264rgb",
                    options={"crf": "0"},
                )

                _, video_name_1 = os.path.split(f_1.name)
                _, video_name_2 = os.path.split(f_2.name)

                with open(
                        os.path.join(test_splits,
                                     action_1 + "_test_split1.txt"), "w") as f:
                    f.write(f"{video_name_1} 1\n")

                with open(
                        os.path.join(test_splits,
                                     action_2 + "_test_split1.txt"), "w") as f:
                    f.write(f"{video_name_2} 1\n")

                clip_sampler = make_clip_sampler("uniform", 3)
                dataset = Hmdb51(
                    data_path=test_splits,
                    video_path_prefix=root_dir_name / "videos",
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    split_id=1,
                    split_type="train",
                    decode_audio=False,
                    decoder=decoder,
                )

                # Videos are sorted alphabetically so "cleaning windows" (i.e. data_2)
                # will be first.
                sample_1 = next(dataset)
                sample_2 = next(dataset)

                self.assertTrue(sample_1["label"] in [action_1, action_2])
                if sample_1["label"] == action_2:
                    sample_1, sample_2 = sample_2, sample_1

                self.assertEqual(sample_1["label"], action_1)
                self.assertEqual(5, len(sample_1["meta_tags"]))
                self.assertTrue(sample_1["video"].equal(
                    thwc_to_cthw(data_1).to(torch.float32)))

                self.assertEqual(sample_2["label"], action_2)
                self.assertEqual(5, len(sample_2["meta_tags"]))
                self.assertTrue(sample_2["video"].equal(
                    thwc_to_cthw(data_2).to(torch.float32)))