def make_ucf11_datamodule(root='./', **kwargs):
    data_path = Path(root) / 'action_youtube_naudio'
    if not data_path.exists():
        download_and_unzip(_ucf11_url, root, False)

    # Collect all class names, scene folders, and label2id mapping
    classes = sorted(x.name for x in data_path.glob("*") if x.is_dir())
    label2id = {}
    scene_folders = []
    for class_id, class_name in enumerate(classes):
        label2id[class_name] = class_id
        class_folder = data_path / class_name
        scene_folders.extend(list(filter(Path.is_dir, class_folder.glob('v_*'))))

    # Split at the scene level (80/20) so that videos from the same scene
    # never end up in both the train and val sets.
    shuffle(scene_folders)
    num_train_scenes = int(0.8 * len(scene_folders))
    train_paths, val_paths = [], []
    for i, scene in enumerate(scene_folders):
        class_id = label2id[scene.parent.name]
        labeled_paths = [(video, class_id) for video in scene.glob('*.avi')]
        if i < num_train_scenes:
            train_paths.extend(labeled_paths)
        else:
            val_paths.extend(labeled_paths)

    return LabeledVideoDataModule(
        LabeledVideoPaths(train_paths),
        LabeledVideoPaths(val_paths),
        label2id=label2id,
        classes=classes,
        **kwargs,
    )
def make_mini_kinetics_datamodule(root='./', **kwargs):
    kinetics_path = Path(root) / 'kinetics'
    if not kinetics_path.exists():
        download_and_unzip(_mini_kinetics_url, root)
    return LabeledVideoDataModule(
        LabeledVideoPaths.from_path(kinetics_path / 'train'),
        LabeledVideoPaths.from_path(kinetics_path / 'val'),
        **kwargs,
    )
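# A minimal usage sketch for the two factory functions above. Whether
# LabeledVideoDataModule accepts loader kwargs such as `batch_size` depends on
# its definition (not shown here), so treat those arguments as assumptions
# made only for illustration.
ucf11_dm = make_ucf11_datamodule(root='./data', batch_size=8)
mini_kinetics_dm = make_mini_kinetics_datamodule(root='./data', batch_size=8)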
def run_distributed(rank, size, decoder, clip_duration, data_name, return_dict):
    """
    This function is run by each distributed process. It samples videos
    based on the distributed split (determined by the DistributedSampler)
    and returns the dataset clips in the return_dict.
    """
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=size)
    clip_sampler = make_clip_sampler("uniform", clip_duration)
    labeled_video_paths = LabeledVideoPaths.from_path(data_name)
    dataset = LabeledVideoDataset(
        labeled_video_paths,
        clip_sampler=clip_sampler,
        video_sampler=DistributedSampler,
        decode_audio=False,
        decoder=decoder,
    )
    test_dataloader = DataLoader(dataset, batch_size=None, num_workers=1)

    # Run two epochs, simulating use in a training loop
    dataset.video_sampler.set_epoch(0)
    epoch_1 = [(sample["label"], sample["video"]) for sample in test_dataloader]
    dataset.video_sampler.set_epoch(1)
    epoch_2 = [(sample["label"], sample["video"]) for sample in test_dataloader]
    return_dict[rank] = {"epoch_1": epoch_1, "epoch_2": epoch_2}
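# A minimal sketch of how `run_distributed` might be launched, matching its
# (rank, size, ..., return_dict) signature. The world size, decoder name, and
# clip duration are assumptions chosen only for illustration.
import torch.multiprocessing as mp

def launch_run_distributed(data_path, world_size=2, decoder="pyav", clip_duration=1.0):
    manager = mp.Manager()
    return_dict = manager.dict()  # shared dict collected from all ranks
    processes = []
    for rank in range(world_size):
        p = mp.Process(
            target=run_distributed,
            args=(rank, world_size, decoder, clip_duration, data_path, return_dict),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    # Each rank should have seen a disjoint shard of the videos per epoch.
    return dict(return_dict)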
def test_video_name_with_whitespace_works(self, decoder):
    num_frames = 10
    fps = 5
    with temp_encoded_video(num_frames=num_frames, fps=fps, prefix="pre fix") as (
        video_file_name,
        data,
    ):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
            f.write(f"{video_file_name} 0\n".encode())
            f.write(f"{video_file_name} 1\n".encode())

        total_duration = num_frames / fps
        clip_sampler = make_clip_sampler("uniform", total_duration)
        labeled_video_paths = LabeledVideoPaths.from_path(f.name)
        dataset = LabeledVideoDataset(
            labeled_video_paths,
            clip_sampler=clip_sampler,
            video_sampler=SequentialSampler,
            decode_audio=False,
            decoder=decoder,
        )

        expected = [(0, data), (1, data)]
        for i, sample in enumerate(dataset):
            self.assertTrue(sample["video"].equal(expected[i][1]))
            self.assertEqual(sample["label"], expected[i][0])
def test_sampling_with_more_processes_than_videos(self, decoder):
    with mock_encoded_video_dataset_file() as (
        mock_csv,
        label_videos,
        total_duration,
    ):
        half_duration = total_duration / 2 - self._EPS
        clip_sampler = make_clip_sampler("uniform", half_duration)
        labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
        dataset = LabeledVideoDataset(
            labeled_video_paths,
            clip_sampler=clip_sampler,
            video_sampler=SequentialSampler,
            decode_audio=False,
            decoder=decoder,
        )

        # Split each full video into two clips. The data is CTHW, so the
        # temporal dimension is dim 1.
        expected = []
        for label, data in label_videos:
            num_frames = data.shape[1]
            half_frames = num_frames // 2
            first_half_data = data[:, :half_frames]
            second_half_data = data[:, half_frames:]
            expected.append((label, first_half_data))
            expected.append((label, second_half_data))

        test_dataloader = DataLoader(dataset, batch_size=None, num_workers=16)
        actual = [(sample["label"], sample["video"]) for sample in test_dataloader]
        assert_unordered_list_compare_true(self, expected, actual)
def test_reading_from_directory_structure(self, decoder):
    # For an unknown reason this import has to be here for `buck test` to work.
    import torchvision.io as io

    with tempfile.TemporaryDirectory() as root_dir:
        # Create test directory structure with two classes and a video in each.
        root_dir_name = pathlib.Path(root_dir)
        test_class_1 = root_dir_name / "running"
        test_class_1.mkdir()
        data_1 = create_dummy_video_frames(15, 10, 10)
        test_class_2 = root_dir_name / "cleaning windows"
        test_class_2.mkdir()
        data_2 = create_dummy_video_frames(20, 15, 15)
        with tempfile.NamedTemporaryFile(
            suffix=".mp4", dir=test_class_1
        ) as f_1, tempfile.NamedTemporaryFile(suffix=".mp4", dir=test_class_2) as f_2:
            f_1.close()
            f_2.close()

            # Write lossless video for each class.
            io.write_video(
                f_1.name,
                data_1,
                fps=30,
                video_codec="libx264rgb",
                options={"crf": "0"},
            )
            io.write_video(
                f_2.name,
                data_2,
                fps=30,
                video_codec="libx264rgb",
                options={"crf": "0"},
            )

            clip_sampler = make_clip_sampler("uniform", 3)
            labeled_video_paths = LabeledVideoPaths.from_path(root_dir)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            # Videos are sorted alphabetically so "cleaning windows"
            # (i.e. data_2) will be first.
            sample_1 = next(dataset)
            self.assertEqual(sample_1["label"], 0)
            self.assertTrue(
                sample_1["video"].equal(thwc_to_cthw(data_2).to(torch.float32))
            )

            sample_2 = next(dataset)
            self.assertEqual(sample_2["label"], 1)
            self.assertTrue(
                sample_2["video"].equal(thwc_to_cthw(data_1).to(torch.float32))
            )
def test_sampling_with_non_divisible_processes_by_clips(self, decoder):
    # Make one video with 15 frames and one with 10 frames, producing 3 clips
    # and 2 clips respectively.
    num_frames = 10
    fps = 5
    with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
        video_file_name_1,
        data_1,
    ):
        with temp_encoded_video(num_frames=num_frames, fps=fps) as (
            video_file_name_2,
            data_2,
        ):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
                f.write(f"{video_file_name_1} 0\n".encode())
                f.write(f"{video_file_name_2} 1\n".encode())

            total_duration = num_frames / fps
            half_duration = total_duration / 2 - self._EPS
            clip_sampler = make_clip_sampler("uniform", half_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(f.name)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            half_frames = num_frames // 2
            expected = {
                (0, data_1[:, half_frames * 2:]),  # 1/3 clip
                (0, data_1[:, half_frames:half_frames * 2]),  # 2/3 clip
                (0, data_1[:, :half_frames]),  # 3/3 clip
                (1, data_2[:, :half_frames]),  # First half
                (1, data_2[:, half_frames:]),  # Second half
            }

            test_dataloader = DataLoader(dataset, batch_size=None, num_workers=2)
            actual = [(sample["label"], sample["video"]) for sample in test_dataloader]
            assert_unordered_list_compare_true(self, expected, actual)
def test_constant_clips_per_video_sampling_works(self, decoder):
    # Make one video with 15 frames and one with 10 frames; with the
    # "constant_clips_per_video" sampler set to 2, each video yields two clips.
    num_frames = 10
    fps = 5
    with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
        video_file_name_1,
        data_1,
    ):
        with temp_encoded_video(num_frames=num_frames, fps=fps) as (
            video_file_name_2,
            data_2,
        ):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
                f.write(f"{video_file_name_1} 0\n".encode())
                f.write(f"{video_file_name_2} 1\n".encode())

            clip_frames = 2
            duration_for_frames = clip_frames / fps - self._EPS
            clip_sampler = make_clip_sampler(
                "constant_clips_per_video", duration_for_frames, 2
            )
            labeled_video_paths = LabeledVideoPaths.from_path(f.name)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            # Dataset has 2 videos. Each video has two evenly spaced clips of
            # size clip_frames sampled. The first clip of each video will
            # always be sampled at second 0. The second clip of the video is
            # the next frame from time: (total_duration - clip_duration) / 2
            half_frames_1 = math.ceil((data_1.shape[1] - clip_frames) / 2)
            half_frames_2 = math.ceil((data_2.shape[1] - clip_frames) / 2)
            expected = [
                (0, data_1[:, :clip_frames]),
                (0, data_1[:, half_frames_1:half_frames_1 + clip_frames]),
                (1, data_2[:, :clip_frames]),
                (1, data_2[:, half_frames_2:half_frames_2 + clip_frames]),
            ]
            for i, sample in enumerate(dataset):
                self.assertTrue(sample["video"].equal(expected[i][1]))
                self.assertEqual(sample["label"], expected[i][0])
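# A small standalone sketch of the clip spacing the comment above describes:
# with "constant_clips_per_video", clip i starts at
# i * (duration - clip_duration) / clips_per_video. The helper name is
# hypothetical, written only to illustrate the frame arithmetic in the test.
import math

def constant_clip_start_frames(num_frames, clip_frames, clips_per_video):
    # Spread clip start frames evenly over the sampleable span of the video.
    span = num_frames - clip_frames
    return [math.ceil(i * span / clips_per_video) for i in range(clips_per_video)]

# For the test above: the 15-frame video gives starts [0, 7]
# (ceil((15 - 2) / 2) = 7, matching half_frames_1), and the 10-frame video
# gives starts [0, 4] (matching half_frames_2).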
def _make_encoded_video_dataset(self, data: SampleCollection) -> 'EncodedVideoDataset':
    classes = self._get_classes(data)
    label_to_class_mapping = dict(enumerate(classes))
    class_to_label_mapping = {c: lab for lab, c in label_to_class_mapping.items()}

    filepaths = data.values("filepath")
    labels = data.values(self.label_field + ".label")
    targets = [class_to_label_mapping[lab] for lab in labels]
    labeled_video_paths = LabeledVideoPaths(list(zip(filepaths, targets)))

    ds: EncodedVideoDataset = EncodedVideoDataset(
        labeled_video_paths,
        self.clip_sampler,
        video_sampler=self.video_sampler,
        decode_audio=self.decode_audio,
        decoder=self.decoder,
    )
    return ds
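# Worked illustration of the two mappings built above (the class names are
# invented for the example):
#   classes = ["archery", "bowling"]
#   label_to_class_mapping -> {0: "archery", 1: "bowling"}
#   class_to_label_mapping -> {"archery": 0, "bowling": 1}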
def test_random_clip_sampling_works(self, decoder):
    with mock_encoded_video_dataset_file() as (
        mock_csv,
        label_videos,
        total_duration,
    ):
        half_duration = total_duration / 2 - self._EPS
        clip_sampler = make_clip_sampler("random", half_duration)
        labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
        dataset = LabeledVideoDataset(
            labeled_video_paths,
            clip_sampler=clip_sampler,
            video_sampler=SequentialSampler,
            decode_audio=False,
            decoder=decoder,
        )

        expected_labels = [label for label, _ in label_videos]
        for i, sample in enumerate(dataset):
            expected_t_shape = 5
            self.assertEqual(sample["video"].shape[1], expected_t_shape)
            self.assertEqual(sample["label"], expected_labels[i])
def load_data(
    self,
    files: List[PATH_TYPE],
    targets: List[Any],
    clip_sampler: Union[str, "ClipSampler"] = "random",
    clip_duration: float = 2,
    clip_sampler_kwargs: Optional[Dict[str, Any]] = None,
    video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
    decode_audio: bool = False,
    decoder: str = "pyav",
    target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoDataset":
    dataset = LabeledVideoDataset(
        LabeledVideoPaths(list(zip(files, targets))),
        _make_clip_sampler(clip_sampler, clip_duration, clip_sampler_kwargs),
        video_sampler=video_sampler,
        decode_audio=decode_audio,
        decoder=decoder,
    )
    if not self.predicting:
        self.load_target_metadata(
            [sample[1] for sample in dataset._labeled_videos._paths_and_labels],
            target_formatter=target_formatter,
        )
    return dataset
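# A minimal usage sketch for `load_data`, assuming an instance of the input
# class it belongs to (`video_input` is a hypothetical name); the file paths
# and labels are likewise invented for illustration.
files = ["videos/archery_001.mp4", "videos/bowling_001.mp4"]
targets = ["archery", "bowling"]
dataset = video_input.load_data(
    files,
    targets,
    clip_sampler="uniform",
    clip_duration=1,
    decode_audio=False,
)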
def Ptvkinetics(cfg, mode):
    """
    Construct the Kinetics video loader with a given csv file. The format of
    the csv file is:
    ```
    path_to_video_1 label_1
    path_to_video_2 label_2
    ...
    path_to_video_N label_N
    ```
    For `train` and `val` mode, a single clip is randomly sampled from every
    video with random cropping, scaling, and flipping. For `test` mode,
    multiple clips are uniformly sampled from every video with center
    cropping.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options include `train`, `val`, or `test` mode. For the
            train and val mode, the data loader will take data from the train
            or val set, and sample one clip per video. For the test mode, the
            data loader will take data from the test set, and sample multiple
            clips per video.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)

    logger.info("Constructing Ptvkinetics {}...".format(mode))

    clip_duration = (
        cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE / cfg.DATA.TARGET_FPS
    )
    path_to_file = os.path.join(cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(mode))
    labeled_video_paths = LabeledVideoPaths.from_path(path_to_file)
    num_videos = len(labeled_video_paths)
    labeled_video_paths.path_prefix = cfg.DATA.PATH_PREFIX
    logger.info(
        "Constructing kinetics dataloader (size: {}) from {}".format(
            num_videos, path_to_file
        )
    )

    if mode in ["train", "val"]:
        num_clips = 1
        num_crops = 1

        transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                            Lambda(div255),
                            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                            RandomShortSideScale(
                                min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                                max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
                            ),
                            RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
                        ]
                        + (
                            [RandomHorizontalFlipVideo(p=0.5)]
                            if cfg.DATA.RANDOM_FLIP
                            else []
                        )
                        + [PackPathway(cfg)]
                    ),
                ),
                DictToTuple(num_clips, num_crops),
            ]
        )

        clip_sampler = make_clip_sampler("random", clip_duration)
        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        else:
            video_sampler = RandomSampler if mode == "train" else SequentialSampler
    else:
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                            Lambda(div255),
                            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                            ShortSideScale(size=cfg.DATA.TRAIN_JITTER_SCALES[0]),
                        ]
                    ),
                ),
                UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
                ApplyTransformToKey(key="video", transform=PackPathway(cfg)),
                DictToTuple(num_clips, num_crops),
            ]
        )

        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            clip_duration,
            num_clips,
            num_crops,
        )
        video_sampler = DistributedSampler if cfg.NUM_GPUS > 1 else SequentialSampler

    return PTVDatasetWrapper(
        num_videos=num_videos,
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=LabeledVideoDataset(
            labeled_video_paths=labeled_video_paths,
            clip_sampler=clip_sampler,
            video_sampler=video_sampler,
            transform=transform,
            decode_audio=False,
        ),
    )
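# A minimal sketch of wiring Ptvkinetics into a training loop. The wrapped
# LabeledVideoDataset is an iterable-style dataset, so a DataLoader can batch
# it directly; the cfg object and worker count are assumptions made only for
# illustration.
dataset = Ptvkinetics(cfg, mode="train")
loader = DataLoader(dataset, batch_size=cfg.TRAIN.BATCH_SIZE, num_workers=4)
for inputs, labels, index, time, meta in loader:
    ...  # forward/backward pass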