def test_define_clip_structure_generator(self):
    seconds_per_clip = 5
    define_clip_structure_fn = (
        EpicKitchenRecognition._define_clip_structure_generator(
            seconds_per_clip=seconds_per_clip,
            clip_sampling=ClipSampling.RandomOffsetUniform,
        )
    )
    frame_videos = {
        # 100 frames at 10 fps -> 10 s of video.
        "P01_003": FrameVideo.from_frame_paths(
            [f"root/P01_003/frame_{i}" for i in range(100)], 10
        ),
        # 300 frames at 10 fps -> 30 s of video.
        "P02_004": FrameVideo.from_frame_paths(
            [f"root/P02_004/frame_{i}" for i in range(300)], 10
        ),
        # 600 frames at 30 fps -> 20 s of video.
        "P11_010": FrameVideo.from_frame_paths(
            [f"root/P11_010/frame_{i}" for i in range(600)], 30
        ),
    }
    actions = {video_id: [] for video_id in frame_videos}
    random_value = 0.5
    with unittest.mock.patch("random.random", return_value=random_value):
        clips = define_clip_structure_fn(frame_videos, actions)
        sorted_clips = sorted(clips, key=lambda c: c.start_time)  # For stability

        for clip in sorted_clips:
            self.assertEqual(clip.stop_time - clip.start_time, seconds_per_clip)

        # With the mocked random offset, clips start at
        # seconds_per_clip * (i + random_value) = 2.5, 7.5, 12.5, ... seconds,
        # and only clips that end within the video duration are kept:
        # 10 s -> 1 clip, 30 s -> 5 clips, 20 s -> 3 clips.
        clips_P01_003 = [c for c in sorted_clips if c.video_id == "P01_003"]
        self.assertEqual(len(clips_P01_003), 1)
        for i in range(len(clips_P01_003)):
            self.assertEqual(
                clips_P01_003[i].start_time, seconds_per_clip * (i + random_value)
            )

        clips_P02_004 = [c for c in sorted_clips if c.video_id == "P02_004"]
        self.assertEqual(len(clips_P02_004), 5)
        for i in range(len(clips_P02_004)):
            self.assertEqual(
                clips_P02_004[i].start_time, seconds_per_clip * (i + random_value)
            )

        clips_P11_010 = [c for c in sorted_clips if c.video_id == "P11_010"]
        self.assertEqual(len(clips_P11_010), 3)
        for i in range(len(clips_P11_010)):
            self.assertEqual(
                clips_P11_010[i].start_time, seconds_per_clip * (i + random_value)
            )
def video_from_path(self, filepath, decode_audio=False, decoder="pyav", fps=30):
    try:
        is_file = g_pathmgr.isfile(filepath)
        is_dir = g_pathmgr.isdir(filepath)
    except NotImplementedError:
        # Not all PathManager handlers support is{file,dir} functions; when this
        # is the case, we default to assuming the path is a file.
        is_file = True
        is_dir = False

    if is_file:
        from pytorchvideo.data.encoded_video import EncodedVideo

        return EncodedVideo.from_path(filepath, decode_audio, decoder)
    elif is_dir:
        from pytorchvideo.data.frame_video import FrameVideo

        assert not decode_audio, "decode_audio must be False when using FrameVideo"
        return FrameVideo.from_directory(
            filepath, fps, path_order_cache=self.path_order_cache
        )
    else:
        raise FileNotFoundError(f"{filepath} not found.")
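# A minimal usage sketch of the dispatch above, assuming an owning object that
# exposes `video_from_path` and a `path_order_cache` as in the method. The
# `dataset` argument and both paths are hypothetical; the point is that a
# regular file decodes through EncodedVideo while a directory of pre-extracted
# frames is wrapped in a FrameVideo at a fixed fps.
def _example_video_from_path(dataset):
    # Regular file -> EncodedVideo (PyAV-backed decoding by default).
    encoded = dataset.video_from_path("videos/clip_0001.mp4")
    # Directory of frames -> FrameVideo indexed at the given frame rate.
    frames = dataset.video_from_path("videos/clip_0001_frames", fps=30)
    return encoded, frames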
def test_frame_video_works(self):
    frame_names = [f"{i}.png" for i in range(3)]
    with temp_frame_video(frame_names) as (f_name, data):
        frame_paths = [f_name / x for x in frame_names]
        test_video = FrameVideo.from_frame_paths(frame_paths)
        expected_duration = 0.1  # Total duration of 3 frames at 30 fps is 0.1 seconds.
        self.assertEqual(test_video.duration, expected_duration)

        # All frames (0 - 0.1 seconds)
        clip = test_video.get_clip(0, 0.1)
        frames, indices = clip["video"], clip["frame_indices"]
        self.assertTrue(frames.equal(data))
        self.assertEqual(indices, [0, 1, 2])

        # All frames (0 - 0.1 seconds), filtered to the middle frame
        clip = test_video.get_clip(0, 0.1, lambda lst: lst[1:2])
        frames, indices = clip["video"], clip["frame_indices"]
        self.assertTrue(frames.equal(data[:, 1:2]))
        self.assertEqual(indices, [1])

        # 2 frames (0 - 0.066 seconds)
        clip = test_video.get_clip(0, 0.066)
        frames, indices = clip["video"], clip["frame_indices"]
        self.assertTrue(frames.equal(data[:, :2]))
        self.assertEqual(indices, [0, 1])

        # No frames (3 - 5 seconds)
        result = test_video.get_clip(3, 5)
        self.assertEqual(result, None)
def __next__(self) -> dict:
    """
    Retrieves the next clip based on the clip sampling strategy and video sampler.

    Returns:
        A dictionary with the following format.

        .. code-block:: text

            {
                'video': <video_tensor>,
                'label': <index_label>,
                'video_name': <video_name>,
                'video_index': <video_index>,
                'clip_index': <clip_index>,
                'aug_index': <aug_index>,
            }
    """
    if not self._video_sampler_iter:
        # Set up MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
        self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))

    if self._loaded_video:
        video, video_index = self._loaded_video
    else:
        video_index = next(self._video_sampler_iter)
        path_to_video_frames = self._path_to_videos[video_index]
        video = FrameVideo.from_frame_paths(path_to_video_frames)
        self._loaded_video = (video, video_index)

    clip_start, clip_end, clip_index, aug_index, is_last_clip = self._clip_sampler(
        self._next_clip_start_time, video.duration, {}
    )
    # Only load the clip once and reuse the previously stored clip if there are
    # multiple views for augmentations to perform on the same clip.
    if aug_index == 0:
        self._loaded_clip = video.get_clip(clip_start, clip_end, self._frame_filter)

    self._next_clip_start_time = clip_end
    if is_last_clip:
        self._loaded_video = None
        self._next_clip_start_time = 0.0

    sample_dict = {
        "video": self._loaded_clip["video"],
        "label": self._labels[video_index],
        "video_name": str(video_index),
        "video_index": video_index,
        "clip_index": clip_index,
        "aug_index": aug_index,
    }
    if self._transform is not None:
        sample_dict = self._transform(sample_dict)
    return sample_dict
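# Hedged usage sketch: clip datasets implementing `__next__` as above are
# typically consumed through a DataLoader, which batches the sample dicts.
# The `dataset` argument is a hypothetical instance of the class owning
# `__next__`; that it subclasses torch.utils.data.IterableDataset (so the
# loader can iterate it without a sampler) is an assumption, not confirmed
# by the code above.
def _example_consume_clips(dataset):
    import torch

    loader = torch.utils.data.DataLoader(dataset, batch_size=8)
    batch = next(iter(loader))
    frames = batch["video"]  # batched clip tensor
    labels = batch["label"]  # per-clip labels
    return frames, labels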
def _load_frame_videos(
    frame_manifest_file_path: str,
    video_infos: Dict[str, VideoInfo],
    multithreaded_io: bool,
):
    video_frames: Dict[str, VideoFrameInfo] = load_dataclass_dict_from_csv(
        frame_manifest_file_path, VideoFrameInfo, "video_id"
    )
    VideoDataset._remove_video_info_missing_or_incomplete_videos(
        video_frames, video_infos
    )
    return {
        video_id: FrameVideo(
            video_frame_paths=VideoDataset._frame_number_to_filepaths(
                video_id, video_frames, video_infos
            ),
            duration=video_infos[video_id].duration,
            fps=video_infos[video_id].fps,
            multithreaded_io=multithreaded_io,
        )
        for video_id in video_infos
    }
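# A minimal sketch of constructing a FrameVideo directly with the same keyword
# arguments used above; the frame paths, duration, and fps here are made-up
# values for illustration only.
def _example_frame_video():
    from pytorchvideo.data.frame_video import FrameVideo

    frame_paths = [f"root/P01_003/frame_{i}" for i in range(100)]
    return FrameVideo(
        video_frame_paths=frame_paths,  # ordered per-frame file paths
        duration=10.0,                  # seconds
        fps=10.0,                       # frames per second
        multithreaded_io=False,
    )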
def __next__(self) -> dict:
    """
    Retrieves the next clip based on the clip sampling strategy and video sampler.

    Returns:
        A video clip with the following format if transform is None:

        {
            'video': <video_tensor>,
            'label': <index_label> for clip-level label,
            'video_label': <index_label> for video-level label,
            'video_name': <video_name>,
            'video_index': <video_index>,
            'clip_index': <clip_index>,
            'aug_index': <aug_index>, augmentation index as augmentations
                might generate multiple views for one clip.
        }

        Otherwise, the transform defines the clip output.
    """
    if not self._video_sampler_iter:
        # Set up MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
        self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))

    if self._loaded_video:
        video, video_index = self._loaded_video
    else:
        video_index = next(self._video_sampler_iter)
        path_to_video_frames = self._path_to_videos[video_index]
        video = FrameVideo.from_frame_paths(path_to_video_frames)
        self._loaded_video = (video, video_index)

    clip_start, clip_end, clip_index, aug_index, is_last_clip = self._clip_sampler(
        self._next_clip_start_time, video.duration
    )
    # Only load the clip once and reuse the previously stored clip if there are
    # multiple views for augmentations to perform on the same clip.
    if aug_index == 0:
        self._loaded_clip = video.get_clip(clip_start, clip_end, self._frame_filter)
    frames, frame_indices = (
        self._loaded_clip["video"],
        self._loaded_clip["frame_indices"],
    )

    self._next_clip_start_time = clip_end
    if is_last_clip:
        self._loaded_video = None
        self._next_clip_start_time = 0.0

    # Gather the per-frame labels across the clip's frame range into the
    # clip-level label.
    labels_by_frame = [
        self._labels[video_index][i]
        for i in range(min(frame_indices), max(frame_indices) + 1)
    ]
    sample_dict = {
        "video": frames,
        "label": labels_by_frame,
        "video_label": self._video_labels[video_index],
        "video_name": str(video_index),
        "video_index": video_index,
        "clip_index": clip_index,
        "aug_index": aug_index,
    }
    if self._transform is not None:
        sample_dict = self._transform(sample_dict)
    return sample_dict
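# A standalone sketch of the per-frame label merge performed above: the
# clip-level label is the list of per-frame labels for every frame index
# spanned by the sampled clip. `frame_labels` is a hypothetical per-video
# list indexed by frame number, standing in for self._labels[video_index].
def merge_frame_labels(frame_labels, frame_indices):
    # Collect labels over the contiguous frame range covered by the clip.
    return [
        frame_labels[i]
        for i in range(min(frame_indices), max(frame_indices) + 1)
    ]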
def test_define_clip_structure_generator(self):
    frame_videos = {
        "P01_003": FrameVideo.from_frame_paths(
            [f"root/P01_003/frame_{i}" for i in range(200)], 10
        ),
        "P02_004": FrameVideo.from_frame_paths(
            [f"root/P02_004/frame_{i}" for i in range(300)], 10
        ),
        "P11_010": FrameVideo.from_frame_paths(
            [f"root/P11_010/frame_{i}" for i in range(600)], 30
        ),
    }
    actions = {
        "P01_003": [
            ActionData("P01", "P01_003", "turn off light", "00:00:01.00",
                       "00:00:02.00", 262, 370, "turn-off", 12, "light", 113,
                       "['light']", "[113]"),
            ActionData("P01", "P01_003", "turn on light", "00:00:04.00",
                       "00:00:05.00", 262, 370, "turn-on", 12, "light", 113,
                       "['light']", "[113]"),
            ActionData("P01", "P01_003", "close door", "00:00:06.00",
                       "00:00:07.00", 418, 569, "close", 3, "door", 8,
                       "['door']", "[8]"),
            ActionData("P01", "P01_003", "slam door", "00:00:10.00",
                       "00:00:11.00", 408, 509, "slam", 3, "door", 8,
                       "['door']", "[8]"),
        ],
        "P02_004": [
            ActionData("P02", "P02_004", "turn off light", "00:00:04.00",
                       "00:00:05.00", 262, 370, "turn-off", 12, "light", 113,
                       "['light']", "[113]"),
            ActionData("P02", "P02_004", "turn on light", "00:00:05.00",
                       "00:00:06.00", 262, 370, "turn-on", 12, "light", 113,
                       "['light']", "[113]"),
            ActionData("P02", "P02_004", "close door", "00:00:08.00",
                       "00:00:09.00", 418, 569, "close", 3, "door", 8,
                       "['door']", "[8]"),
            ActionData("P02", "P02_004", "slam door", "00:00:10.00",
                       "00:00:11.00", 408, 509, "slam", 3, "door", 8,
                       "['door']", "[8]"),
        ],
        "P11_010": [
            ActionData("P11", "P11_010", "turn off light", "00:00:01.00",
                       "00:00:02.00", 262, 370, "turn-off", 12, "light", 113,
                       "['light']", "[113]"),
            ActionData("P11", "P11_010", "turn on light", "00:00:04.00",
                       "00:00:05.50", 262, 370, "turn-on", 12, "light", 113,
                       "['light']", "[113]"),
            ActionData("P11", "P11_010", "turn on light", "00:00:04.00",
                       "00:00:06.00", 262, 370, "turn-on", 12, "light", 113,
                       "['light']", "[113]"),
            ActionData("P11", "P11_010", "close door", "00:00:06.00",
                       "00:00:07.00", 418, 569, "close", 3, "door", 8,
                       "['door']", "[8]"),
            ActionData("P11", "P11_010", "slam door", "00:00:10.00",
                       "00:00:11.00", 408, 509, "slam", 3, "door", 8,
                       "['door']", "[8]"),
        ],
    }
    random_value = 0.5
    with unittest.mock.patch("random.random", return_value=random_value):
        define_clip_structure_fn = (
            EpicKitchenForecasting._define_clip_structure_generator(
                seconds_per_clip=1,
                clip_time_stride=3,
                num_input_clips=2,
                num_forecast_actions=2,
                clip_sampling=ClipSampling.Random,
            )
        )
        clips = define_clip_structure_fn(frame_videos, actions)
        sorted_clips = sorted(clips, key=lambda c: c.start_time)  # For stability

        # Each clip spans (num_input_clips - 1) * clip_time_stride
        # + seconds_per_clip = (2 - 1) * 3 + 1 = 4 seconds.
        for clip in sorted_clips:
            self.assertEqual(clip.stop_time - clip.start_time, 4.0)

        clips_P01_003 = [c for c in sorted_clips if c.video_id == "P01_003"]
        self.assertEqual(len(clips_P01_003), 1)
        self.assertEqual(
            clips_P01_003[0].start_time, actions["P01_003"][1].stop_time
        )

        clips_P02_004 = [c for c in sorted_clips if c.video_id == "P02_004"]
        self.assertEqual(len(clips_P02_004), 2)
        self.assertEqual(
            clips_P02_004[0].start_time, actions["P02_004"][0].stop_time
        )
        self.assertEqual(
            clips_P02_004[1].start_time, actions["P02_004"][1].stop_time
        )

        clips_P11_010 = [c for c in sorted_clips if c.video_id == "P11_010"]
        self.assertEqual(len(clips_P11_010), 1)
        self.assertEqual(
            clips_P11_010[0].start_time, actions["P11_010"][1].stop_time
        )
def test_empty_frames_failure(self):
    with pytest.raises(AssertionError):
        FrameVideo.from_frame_paths([])
def test_open_video_failure(self):
    # The video duration is 1 / 30 s because there is a single frame at the
    # default 30 fps, so the request below falls within the video but the
    # frame file itself cannot be opened.
    test_video = FrameVideo.from_frame_paths(["non_existent_file.txt"])
    with pytest.raises(Exception):
        test_video.get_clip(0, 0.01)