Example #1
    def predict_load_sample(self, sample: str) -> Dict[str, Any]:
        video = EncodedVideo.from_path(sample, decode_audio=self._decode_audio, decoder=self._decoder)
        (
            clip_start,
            clip_end,
            clip_index,
            aug_index,
            is_last_clip,
        ) = self._clip_sampler(0.0, video.duration, None)

        loaded_clip = video.get_clip(clip_start, clip_end)

        clip_is_null = (
            loaded_clip is None or loaded_clip["video"] is None or (loaded_clip["audio"] is None and self._decode_audio)
        )

        if clip_is_null:
            raise MisconfigurationException(
                f"The provided video (duration: {video.duration}s) is too short to be "
                f"clipped at {self._clip_sampler._clip_duration}s."
            )

        frames = loaded_clip["video"]
        audio_samples = loaded_clip["audio"]
        return {
            "video": frames,
            "video_name": video.name,
            "video_index": 0,
            "clip_index": clip_index,
            "aug_index": aug_index,
            **({"audio": audio_samples} if audio_samples is not None else {}),
            DataKeys.METADATA: {"filepath": sample},
        }
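A minimal standalone sketch of the same sampling flow; the file name and the 2-second uniform clip sampler are assumptions, not values from the source:

from pytorchvideo.data.clip_sampling import make_clip_sampler
from pytorchvideo.data.encoded_video import EncodedVideo

clip_sampler = make_clip_sampler("uniform", 2.0)  # assumed clip duration
video = EncodedVideo.from_path("video.mp4", decode_audio=False)  # hypothetical path
clip_start, clip_end, clip_index, aug_index, is_last_clip = clip_sampler(
    0.0, video.duration, None
)
clip = video.get_clip(clip_start, clip_end)  # dict with "video" (and "audio") tensors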
Example #2
    def video_from_path(self,
                        filepath,
                        decode_audio=False,
                        decoder="pyav",
                        fps=30):
        try:
            is_file = g_pathmgr.isfile(filepath)
            is_dir = g_pathmgr.isdir(filepath)
        except NotImplementedError:
            # Not all PathManager handlers implement is{file,dir}; when that is
            # the case, default to treating the path as a file.
            is_file = True
            is_dir = False

        if is_file:
            from pytorchvideo.data.encoded_video import EncodedVideo

            return EncodedVideo.from_path(filepath, decode_audio, decoder)
        elif is_dir:
            from pytorchvideo.data.frame_video import FrameVideo

            assert not decode_audio, "decode_audio must be False when using FrameVideo"
            return FrameVideo.from_directory(
                filepath, fps, path_order_cache=self.path_order_cache)
        else:
            raise FileNotFoundError(f"{filepath} not found.")
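Hedged usage of the dispatch above; dataset stands in for whatever class defines video_from_path, and both paths are hypothetical:

encoded = dataset.video_from_path("clip.mp4", decode_audio=False)  # file -> EncodedVideo
frames = dataset.video_from_path("frames_dir", fps=30)  # directory -> FrameVideo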
Example #3
    def test_video_with_longer_audio_works(self):
        audio_rate = 10000
        fps = 5
        num_frames = 5
        num_audio_samples = 40000
        with temp_encoded_video_with_audio(
                num_frames=num_frames,
                fps=fps,
                num_audio_samples=num_audio_samples,
                audio_rate=audio_rate,
        ) as (file_name, video_data, audio_data):
            test_video = EncodedVideo.from_path(file_name)

            # All frames and audio (0 - 4 seconds)
            clip = test_video.get_clip(0, test_video.duration)
            frames, audio_samples = clip["video"], clip["audio"]
            self.assertTrue(frames.equal(video_data))
            self.assertTrue(audio_samples.equal(audio_data))

            # No frames or audio (the clip starts past the end of both streams)
            clip = test_video.get_clip(test_video.duration + 1,
                                       test_video.duration + 2)
            frames, audio_samples = clip["video"], clip["audio"]
            self.assertEqual(frames, None)
            self.assertEqual(audio_samples, None)

            test_video.close()
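The stream durations this test constructs, worked out explicitly (EncodedVideo.duration is the max of the two streams, as Example #5 asserts):

video_duration = num_frames / fps  # 5 / 5 = 1.0 s
audio_duration = num_audio_samples / audio_rate  # 40000 / 10000 = 4.0 s
# test_video.duration == max(1.0, 4.0) == 4.0 s, so a clip starting at
# duration + 1 lies past the end of both streams.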
Example #4
def x3dpred(video_path):

    # Select the duration of the clip to load by specifying the start and end duration
    # The start_sec should correspond to where the action occurs in the video
    start_sec = 0
    end_sec = start_sec + clip_duration

    # Initialize an EncodedVideo helper class and load the video
    video = EncodedVideo.from_path(video_path)

    # Load the desired clip
    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

    # Apply a transform to normalize the video input
    video_data = transform(video_data)

    # Move the inputs to the desired device
    inputs = video_data["video"]
    inputs = inputs.to(device)

    # Pass the input clip through the model
    preds = model(inputs[None, ...])

    # Get the predicted classes
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)
    pred_classes = preds.topk(k=5).indices[0]

    # Map the predicted classes to the label names
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
    return "%s" % ", ".join(pred_class_names)
Example #5
    def test_video_with_shorter_audio_works(self):
        num_audio_samples = 8000
        num_frames = 5
        fps = 5
        audio_rate = 8000
        with temp_encoded_video_with_audio(
                num_frames=num_frames,
                fps=fps,
                num_audio_samples=num_audio_samples,
                audio_rate=audio_rate,
        ) as (file_name, video_data, audio_data):
            test_video = EncodedVideo.from_path(file_name)

            # Duration is the max of the two streams, so the video duration (num_frames / fps) is expected.
            self.assertEqual(test_video.duration, num_frames / fps)

            # All frames and audio (0 - 1 seconds)
            clip = test_video.get_clip(0, test_video.duration)
            frames, audio_samples = clip["video"], clip["audio"]
            self.assertTrue(frames.equal(video_data))
            self.assertTrue(audio_samples.equal(audio_data))

            # Half frames
            clip = test_video.get_clip(0, test_video.duration / 2)
            frames, audio_samples = clip["video"], clip["audio"]

            self.assertTrue(frames.equal(video_data[:, :num_frames // 2]))
            self.assertTrue(audio_samples.equal(audio_data))

            test_video.close()
Example #6
    def test_video_works(self):
        num_frames = 11
        fps = 5
        with temp_encoded_video(num_frames=num_frames,
                                fps=fps) as (file_name, data):
            test_video = EncodedVideo.from_path(file_name)
            self.assertAlmostEqual(test_video.duration, num_frames / fps)

            # All frames (0 - test_video.duration seconds)
            clip = test_video.get_clip(0, test_video.duration)
            frames, audio_samples = clip["video"], clip["audio"]
            self.assertTrue(frames.equal(data))
            self.assertEqual(audio_samples, None)

            # Half frames
            clip = test_video.get_clip(0, test_video.duration / 2)
            frames, audio_samples = clip["video"], clip["audio"]
            self.assertTrue(frames.equal(data[:, :round(num_frames / 2)]))
            self.assertEqual(audio_samples, None)

            # No frames
            clip = test_video.get_clip(test_video.duration + 1,
                                       test_video.duration + 3)
            frames, audio_samples = clip["video"], clip["audio"]
            self.assertEqual(frames, None)
            self.assertEqual(audio_samples, None)
            test_video.close()
Example #7
    def _load_encoded_videos(
        encoded_video_manifest_file_path: str,
        video_infos: Dict[str, VideoInfo],
    ):
        encoded_video_infos: Dict[
            str, EncodedVideoInfo] = load_dataclass_dict_from_csv(
                encoded_video_manifest_file_path, EncodedVideoInfo, "video_id")
        VideoDataset._remove_video_info_missing_or_incomplete_videos(
            encoded_video_infos, video_infos)

        return {
            video_id: EncodedVideo.from_path(encoded_video_info.file_path)
            for video_id, encoded_video_info in encoded_video_infos.items()
        }
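For orientation, a manifest CSV consumed by _load_encoded_videos could look like the following; the header row mirrors the two EncodedVideoInfo fields used above (video_id, file_path), and the paths are made up:

video_id,file_path
video_1,/data/videos/video_1.mp4
video_2,/data/videos/video_2.mp4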
Example #8
    def test_decode_audio_is_false(self):
        audio_rate = 10000
        fps = 5
        num_frames = 5
        num_audio_samples = 40000
        with temp_encoded_video_with_audio(
                num_frames=num_frames,
                fps=fps,
                num_audio_samples=num_audio_samples,
                audio_rate=audio_rate,
        ) as (file_name, video_data, audio_data):
            test_video = EncodedVideo.from_path(file_name, decode_audio=False)

            # All frames; audio is not decoded
            clip = test_video.get_clip(0, test_video.duration)
            frames, audio_samples = clip["video"], clip["audio"]
            self.assertTrue(frames.equal(video_data))
            self.assertEqual(audio_samples, None)

            test_video.close()
Example #9
    def predict_load_sample(self, video_path: str) -> Dict[str, Any]:
        return self._encoded_video_to_dict(EncodedVideo.from_path(video_path))
Example #10
    def predict_load_sample(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        video_path = sample[DefaultDataKeys.INPUT]
        sample.update(self._encoded_video_to_dict(EncodedVideo.from_path(video_path)))
        sample[DefaultDataKeys.METADATA] = {"filepath": video_path}
        return sample
Example #11
    def predict_load_sample(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        return self._encoded_video_to_dict(
            EncodedVideo.from_path(sample[DefaultDataKeys.INPUT]))
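Examples #9-#11 all delegate to a private _encoded_video_to_dict helper that is not shown; the following is a hedged sketch of what such a helper could do, modeled on Example #1 rather than taken from any library:

    def _encoded_video_to_dict(self, video: "EncodedVideo") -> Dict[str, Any]:
        # Sketch only: sample a single clip starting from the beginning of the
        # video, as in Example #1; the key names are assumptions.
        clip_start, clip_end, clip_index, aug_index, is_last_clip = self._clip_sampler(
            0.0, video.duration, None
        )
        clip = video.get_clip(clip_start, clip_end)
        return {
            "video": clip["video"],
            "video_name": video.name,
            "clip_index": clip_index,
            "aug_index": aug_index,
        }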
Example #12
    def __next__(self) -> dict:
        """
        Retrieves the next clip based on the clip sampling strategy and video sampler.

        Returns:
            A dictionary with the following format.

            .. code-block:: text

                {
                    'video': <video_tensor>,
                    'label': <index_label>,
                    'video_label': <index_label>,
                    'video_index': <video_index>,
                    'clip_index': <clip_index>,
                    'aug_index': <aug_index>,
                }
        """
        if not self._video_sampler_iter:
            # Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
            self._video_sampler_iter = iter(
                MultiProcessSampler(self._video_sampler))

        for i_try in range(self._MAX_CONSECUTIVE_FAILURES):
            # Reuse previously stored video if there are still clips to be sampled from
            # the last loaded video.
            if self._loaded_video_label:
                video, info_dict, video_index = self._loaded_video_label
            else:
                video_index = next(self._video_sampler_iter)
                try:
                    video_path, info_dict = self._labeled_videos[video_index]
                    video = EncodedVideo.from_path(
                        video_path,
                        decode_audio=self._decode_audio,
                        decoder=self._decoder,
                    )
                    self._loaded_video_label = (video, info_dict, video_index)
                except Exception as e:
                    logger.debug(
                        "Failed to load video with error: {}; trial {}".format(
                            e,
                            i_try,
                        ))
                    continue

            (
                clip_start,
                clip_end,
                clip_index,
                aug_index,
                is_last_clip,
            ) = self._clip_sampler(self._next_clip_start_time, video.duration)
            # Only load the clip once and reuse previously stored clip if there are multiple
            # views for augmentations to perform on the same clip.
            if aug_index == 0:
                self._loaded_clip = video.get_clip(clip_start, clip_end)
            self._next_clip_start_time = clip_end

            clip_is_null = (self._loaded_clip is None
                            or self._loaded_clip["video"] is None
                            or (self._loaded_clip["audio"] is None
                                and self._decode_audio))
            if is_last_clip or clip_is_null:
                # Close the loaded encoded video and reset the last sampled clip time ready
                # to sample a new video on the next iteration.
                self._loaded_video_label[0].close()
                self._loaded_video_label = None
                self._next_clip_start_time = 0.0

                if clip_is_null:
                    logger.debug("Failed to load clip {}; trial {}".format(
                        video.name, i_try))
                    continue

            frames = self._loaded_clip["video"]
            audio_samples = self._loaded_clip["audio"]
            sample_dict = {
                "video": frames,
                "video_name": video.name,
                "video_index": video_index,
                "clip_index": clip_index,
                "aug_index": aug_index,
                **info_dict,
                **({
                    "audio": audio_samples
                } if audio_samples is not None else {}),
            }
            if self._transform is not None:
                sample_dict = self._transform(sample_dict)

                # User can force dataset to continue by returning None in transform.
                if sample_dict is None:
                    continue

            return sample_dict
        else:
            raise RuntimeError(
                f"Failed to load video after {self._MAX_CONSECUTIVE_FAILURES} retries."
            )
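This __next__ belongs to a LabeledVideoDataset-style iterable; a minimal consumption sketch, with hypothetical file names and an assumed 2-second random clip sampler:

from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler

dataset = LabeledVideoDataset(
    labeled_video_paths=[("a.mp4", {"label": 0}), ("b.mp4", {"label": 1})],
    clip_sampler=make_clip_sampler("random", 2.0),
    decode_audio=False,
)
sample = next(iter(dataset))  # dict with "video", "label", "video_index", ...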
Example #13
    # The opening of this snippet was truncated in the source; the
    # ApplyTransformToKey / Compose wrapper below is a reconstruction based on
    # the standard PyTorchVideo inference tutorial this code follows.
    transform = ApplyTransformToKey(
        key="video",
        transform=Compose([
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x / 255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=side_size),
            CenterCropVideo(crop_size=(crop_size, crop_size)),
        ]))

    clip_duration = (num_frames * sampling_rate) / frames_per_second

    # Load Video
    video_path = 'archery.mp4'
    start_sec = 0
    end_sec = start_sec + clip_duration

    video = EncodedVideo.from_path(video_path)
    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

    video_data = transform(video_data)

    inputs = video_data['video']
    inputs = inputs.to(device)

    # Predict
    preds = model(inputs[None, ...])
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)
    pred_classes = preds.topk(k=5).indices

    pred_class_names = [
        kinetics_id_to_classname[int(i)] for i in pred_classes[0]
    ]
Example #14
    def test_decode_video_failure(self):
        with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
            f.write(b"This is not an mp4 file")
            f.flush()  # ensure the bytes hit disk before reopening by name
            with pytest.raises(RuntimeError):
                test_video = EncodedVideo.from_path(f.name)
                test_video.close()
Example #15
    def test_open_video_failure(self):
        with pytest.raises(FileNotFoundError):
            test_video = EncodedVideo.from_path("non_existent_file.txt")
            test_video.close()
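Taken together, the last two tests suggest a guarded loading pattern; the path here is hypothetical:

try:
    video = EncodedVideo.from_path("maybe_missing_or_corrupt.mp4")
except FileNotFoundError:
    video = None  # the path does not exist
except RuntimeError:
    video = None  # the file exists but could not be decoded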