def predict_load_sample(self, sample: str) -> Dict[str, Any]:
    video = EncodedVideo.from_path(sample, decode_audio=self._decode_audio, decoder=self._decoder)
    (
        clip_start,
        clip_end,
        clip_index,
        aug_index,
        is_last_clip,
    ) = self._clip_sampler(0.0, video.duration, None)

    loaded_clip = video.get_clip(clip_start, clip_end)

    clip_is_null = (
        loaded_clip is None
        or loaded_clip["video"] is None
        or (loaded_clip["audio"] is None and self._decode_audio)
    )
    if clip_is_null:
        raise MisconfigurationException(
            f"The provided video is too short ({video.duration}s) to be clipped "
            f"with a clip duration of {self._clip_sampler._clip_duration}s."
        )

    frames = loaded_clip["video"]
    audio_samples = loaded_clip["audio"]
    return {
        "video": frames,
        "video_name": video.name,
        "video_index": 0,
        "clip_index": clip_index,
        "aug_index": aug_index,
        **({"audio": audio_samples} if audio_samples is not None else {}),
        DataKeys.METADATA: {"filepath": sample},
    }
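# Hedged usage sketch (not part of the original code): exercising the same
# sampler contract that predict_load_sample relies on. "archery.mp4" is a
# placeholder path and the 2-second clip duration is an arbitrary choice.
from pytorchvideo.data.clip_sampling import make_clip_sampler
from pytorchvideo.data.encoded_video import EncodedVideo

video = EncodedVideo.from_path("archery.mp4", decode_audio=False, decoder="pyav")

# A uniform sampler returns the (clip_start, clip_end, clip_index, aug_index,
# is_last_clip) tuple unpacked above.
clip_sampler = make_clip_sampler("uniform", 2.0)
clip_start, clip_end, clip_index, aug_index, is_last_clip = clip_sampler(0.0, video.duration, None)

clip = video.get_clip(clip_start, clip_end)
print(clip["video"].shape)  # (C, T, H, W) tensor of decoded frames
video.close()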
def video_from_path(self, filepath, decode_audio=False, decoder="pyav", fps=30):
    try:
        is_file = g_pathmgr.isfile(filepath)
        is_dir = g_pathmgr.isdir(filepath)
    except NotImplementedError:
        # Not all PathManager handlers support is{file,dir} functions; when this is
        # the case, we default to assuming the path is a file.
        is_file = True
        is_dir = False

    if is_file:
        from pytorchvideo.data.encoded_video import EncodedVideo

        return EncodedVideo.from_path(filepath, decode_audio, decoder)
    elif is_dir:
        from pytorchvideo.data.frame_video import FrameVideo

        assert not decode_audio, "decode_audio must be False when using FrameVideo"
        return FrameVideo.from_directory(filepath, fps, path_order_cache=self.path_order_cache)
    else:
        raise FileNotFoundError(f"{filepath} not found.")
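# Hedged usage sketch (assumption: in pytorchvideo this helper lives on
# VideoPathHandler, which owns path_order_cache; verify against your version).
# Both paths below are placeholders.
from pytorchvideo.data.video import VideoPathHandler

handler = VideoPathHandler()

# An encoded file dispatches to EncodedVideo...
video = handler.video_from_path("archery.mp4", decode_audio=False, decoder="pyav")

# ...while a directory of extracted frames dispatches to FrameVideo.
frame_video = handler.video_from_path("frames/archery/", fps=30)

print(type(video).__name__, type(frame_video).__name__)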
def test_video_with_longer_audio_works(self):
    audio_rate = 10000
    fps = 5
    num_frames = 5
    num_audio_samples = 40000
    with temp_encoded_video_with_audio(
        num_frames=num_frames,
        fps=fps,
        num_audio_samples=num_audio_samples,
        audio_rate=audio_rate,
    ) as (file_name, video_data, audio_data):
        test_video = EncodedVideo.from_path(file_name)

        # All frames and all audio
        clip = test_video.get_clip(0, test_video.duration)
        frames, audio_samples = clip["video"], clip["audio"]
        self.assertTrue(frames.equal(video_data))
        self.assertTrue(audio_samples.equal(audio_data))

        # Beyond the video duration: no frames and no audio
        clip = test_video.get_clip(test_video.duration + 1, test_video.duration + 2)
        frames, audio_samples = clip["video"], clip["audio"]
        self.assertEqual(frames, None)
        self.assertEqual(audio_samples, None)

        test_video.close()
def x3dpred(video_path):
    # Select the duration of the clip to load by specifying the start and end seconds.
    # start_sec should correspond to where the action occurs in the video.
    start_sec = 0
    end_sec = start_sec + clip_duration

    # Initialize an EncodedVideo helper class and load the video
    video = EncodedVideo.from_path(video_path)

    # Load the desired clip
    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

    # Apply a transform to normalize the video input
    video_data = transform(video_data)

    # Move the inputs to the desired device
    inputs = video_data["video"]
    inputs = inputs.to(device)

    # Pass the input clip through the model
    preds = model(inputs[None, ...])

    # Get the predicted classes
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)
    pred_classes = preds.topk(k=5).indices[0]

    # Map the predicted classes to the label names
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
    return ", ".join(pred_class_names)
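# Minimal usage sketch, assuming the surrounding globals (model, transform,
# device, clip_duration, kinetics_id_to_classname) are already defined as in
# the snippets above; "archery.mp4" is a placeholder path.
top5 = x3dpred("archery.mp4")
print(f"Top-5 predicted labels: {top5}")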
def test_video_with_shorter_audio_works(self):
    num_audio_samples = 8000
    num_frames = 5
    fps = 5
    audio_rate = 8000
    with temp_encoded_video_with_audio(
        num_frames=num_frames,
        fps=fps,
        num_audio_samples=num_audio_samples,
        audio_rate=audio_rate,
    ) as (file_name, video_data, audio_data):
        test_video = EncodedVideo.from_path(file_name)

        # Duration is the max of both streams, so here it equals the video duration.
        self.assertEqual(test_video.duration, num_frames / fps)

        # All frames and all audio (0 - 1 second)
        clip = test_video.get_clip(0, test_video.duration)
        frames, audio_samples = clip["video"], clip["audio"]
        self.assertTrue(frames.equal(video_data))
        self.assertTrue(audio_samples.equal(audio_data))

        # Half of the frames; the full audio is still returned.
        clip = test_video.get_clip(0, test_video.duration / 2)
        frames, audio_samples = clip["video"], clip["audio"]
        self.assertTrue(frames.equal(video_data[:, :num_frames // 2]))
        self.assertTrue(audio_samples.equal(audio_data))

        test_video.close()
def test_video_works(self):
    num_frames = 11
    fps = 5
    with temp_encoded_video(num_frames=num_frames, fps=fps) as (file_name, data):
        test_video = EncodedVideo.from_path(file_name)
        self.assertAlmostEqual(test_video.duration, num_frames / fps)

        # All frames (0 - test_video.duration seconds)
        clip = test_video.get_clip(0, test_video.duration)
        frames, audio_samples = clip["video"], clip["audio"]
        self.assertTrue(frames.equal(data))
        self.assertEqual(audio_samples, None)

        # Half frames
        clip = test_video.get_clip(0, test_video.duration / 2)
        frames, audio_samples = clip["video"], clip["audio"]
        self.assertTrue(frames.equal(data[:, :round(num_frames / 2)]))
        self.assertEqual(audio_samples, None)

        # No frames
        clip = test_video.get_clip(test_video.duration + 1, test_video.duration + 3)
        frames, audio_samples = clip["video"], clip["audio"]
        self.assertEqual(frames, None)
        self.assertEqual(audio_samples, None)

        test_video.close()
def _load_encoded_videos(
    encoded_video_manifest_file_path: str,
    video_infos: Dict[str, VideoInfo],
):
    encoded_video_infos: Dict[str, EncodedVideoInfo] = load_dataclass_dict_from_csv(
        encoded_video_manifest_file_path, EncodedVideoInfo, "video_id"
    )
    VideoDataset._remove_video_info_missing_or_incomplete_videos(
        encoded_video_infos, video_infos
    )
    return {
        video_id: EncodedVideo.from_path(encoded_video_info.file_path)
        for video_id, encoded_video_info in encoded_video_infos.items()
    }
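# Sketch of the manifest consumed above (an assumption: EncodedVideoInfo is
# taken here to carry video_id and file_path columns; verify the dataclass
# fields against your pytorchvideo version).
manifest = """video_id,file_path
video_1,/data/videos/video_1.mp4
video_2,/data/videos/video_2.mp4
"""

with open("encoded_video_manifest.csv", "w") as f:
    f.write(manifest)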
def test_decode_audio_is_false(self):
    audio_rate = 10000
    fps = 5
    num_frames = 5
    num_audio_samples = 40000
    with temp_encoded_video_with_audio(
        num_frames=num_frames,
        fps=fps,
        num_audio_samples=num_audio_samples,
        audio_rate=audio_rate,
    ) as (file_name, video_data, audio_data):
        test_video = EncodedVideo.from_path(file_name, decode_audio=False)

        # Full clip: frames are decoded, but audio is None since decode_audio=False.
        clip = test_video.get_clip(0, test_video.duration)
        frames, audio_samples = clip["video"], clip["audio"]
        self.assertTrue(frames.equal(video_data))
        self.assertEqual(audio_samples, None)

        test_video.close()
def predict_load_sample(self, video_path: str) -> Dict[str, Any]:
    return self._encoded_video_to_dict(EncodedVideo.from_path(video_path))
def predict_load_sample(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    video_path = sample[DefaultDataKeys.INPUT]
    sample.update(self._encoded_video_to_dict(EncodedVideo.from_path(video_path)))
    sample[DefaultDataKeys.METADATA] = {"filepath": video_path}
    return sample
def predict_load_sample(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    return self._encoded_video_to_dict(EncodedVideo.from_path(sample[DefaultDataKeys.INPUT]))
def __next__(self) -> dict:
    """
    Retrieves the next clip based on the clip sampling strategy and video sampler.

    Returns:
        A dictionary with the following format.

        .. code-block:: text

            {
                'video': <video_tensor>,
                'label': <index_label>,
                'video_label': <index_label>,
                'video_index': <video_index>,
                'clip_index': <clip_index>,
                'aug_index': <aug_index>,
            }
    """
    if not self._video_sampler_iter:
        # Set up MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
        self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))

    for i_try in range(self._MAX_CONSECUTIVE_FAILURES):
        # Reuse the previously stored video if there are still clips to be sampled from
        # the last loaded video.
        if self._loaded_video_label:
            video, info_dict, video_index = self._loaded_video_label
        else:
            video_index = next(self._video_sampler_iter)
            try:
                video_path, info_dict = self._labeled_videos[video_index]
                video = EncodedVideo.from_path(
                    video_path,
                    decode_audio=self._decode_audio,
                    decoder=self._decoder,
                )
                self._loaded_video_label = (video, info_dict, video_index)
            except Exception as e:
                logger.debug(
                    "Failed to load video with error: {}; trial {}".format(e, i_try)
                )
                continue

        (
            clip_start,
            clip_end,
            clip_index,
            aug_index,
            is_last_clip,
        ) = self._clip_sampler(self._next_clip_start_time, video.duration)

        # Only load the clip once, and reuse the previously stored clip if there are
        # multiple views (augmentations) to perform on the same clip.
        if aug_index == 0:
            self._loaded_clip = video.get_clip(clip_start, clip_end)
            self._next_clip_start_time = clip_end

        clip_is_null = (
            self._loaded_clip is None
            or self._loaded_clip["video"] is None
            or (self._loaded_clip["audio"] is None and self._decode_audio)
        )
        if is_last_clip or clip_is_null:
            # Close the loaded encoded video and reset the last sampled clip time,
            # ready to sample a new video on the next iteration.
            self._loaded_video_label[0].close()
            self._loaded_video_label = None
            self._next_clip_start_time = 0.0

            if clip_is_null:
                logger.debug("Failed to load clip {}; trial {}".format(video.name, i_try))
                continue

        frames = self._loaded_clip["video"]
        audio_samples = self._loaded_clip["audio"]
        sample_dict = {
            "video": frames,
            "video_name": video.name,
            "video_index": video_index,
            "clip_index": clip_index,
            "aug_index": aug_index,
            **info_dict,
            **({"audio": audio_samples} if audio_samples is not None else {}),
        }
        if self._transform is not None:
            sample_dict = self._transform(sample_dict)

            # The user can force the dataset to continue by returning None from the transform.
            if sample_dict is None:
                continue

        return sample_dict
    else:
        raise RuntimeError(
            f"Failed to load video after {self._MAX_CONSECUTIVE_FAILURES} retries."
        )
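# Hedged sketch of how this iterator is typically driven. The directory layout
# (<label>/<video>.mp4) and the 2-second clip duration are placeholders.
import torch
from pytorchvideo.data.clip_sampling import make_clip_sampler
from pytorchvideo.data.labeled_video_dataset import labeled_video_dataset

dataset = labeled_video_dataset(
    data_path="kinetics/train",
    clip_sampler=make_clip_sampler("random", 2.0),
    decode_audio=False,
)

# LabeledVideoDataset is an IterableDataset: the DataLoader repeatedly calls
# the __next__ defined above to assemble each batch.
loader = torch.utils.data.DataLoader(dataset, batch_size=4)
batch = next(iter(loader))
print(batch["video"].shape, batch["label"])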
        UniformTemporalSubsample(num_frames),
        Lambda(lambda x: x / 255.0),
        NormalizeVideo(mean, std),
        ShortSideScale(size=side_size),
        CenterCropVideo(crop_size=(crop_size, crop_size)),
    ])
)

clip_duration = (num_frames * sampling_rate) / frames_per_second

# Load Video
video_path = 'archery.mp4'
start_sec = 0
end_sec = start_sec + clip_duration
video = EncodedVideo.from_path(video_path)
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
video_data = transform(video_data)
inputs = video_data['video']
inputs = inputs.to(device)

# Predict
preds = model(inputs[None, ...])
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices
pred_class_names = [
    kinetics_id_to_classname[int(i)] for i in pred_classes[0]
]
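# A minimal continuation (not in the original fragment): display the top-5
# label names recovered above.
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))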
def test_decode_video_failure(self):
    with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
        f.write(b"This is not an mp4 file")
        with pytest.raises(RuntimeError):
            test_video = EncodedVideo.from_path(f.name)
            test_video.close()
def test_open_video_failure(self):
    with pytest.raises(FileNotFoundError):
        test_video = EncodedVideo.from_path("non_existent_file.txt")
        test_video.close()