def run_distributed(rank, size, decoder, clip_duration, data_name,
                    return_dict):
    """
    This function is run by each distributed process. It samples videos
    based on the distributed split (determined by the
    DistributedSampler) and returns the dataset clips in the return_dict.
    """
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=size)
    clip_sampler = make_clip_sampler("uniform", clip_duration)
    labeled_video_paths = LabeledVideoPaths.from_path(data_name)
    dataset = LabeledVideoDataset(
        labeled_video_paths,
        clip_sampler=clip_sampler,
        video_sampler=DistributedSampler,
        decode_audio=False,
        decoder=decoder,
    )
    test_dataloader = DataLoader(dataset, batch_size=None, num_workers=1)

    # Run two epochs, simulating use in a training loop
    dataset.video_sampler.set_epoch(0)
    epoch_1 = [(sample["label"], sample["video"])
               for sample in test_dataloader]
    dataset.video_sampler.set_epoch(1)
    epoch_2 = [(sample["label"], sample["video"])
               for sample in test_dataloader]
    return_dict[rank] = {"epoch_1": epoch_1, "epoch_2": epoch_2}
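
# A minimal launcher sketch (not part of the snippet above) showing how
# run_distributed might be driven: one process per rank, with a
# multiprocessing Manager dict collecting each rank's clips. The helper
# name launch_run_distributed is hypothetical.
import torch.multiprocessing as mp

def launch_run_distributed(world_size, decoder, clip_duration, data_name):
    manager = mp.Manager()
    return_dict = manager.dict()
    processes = []
    for rank in range(world_size):
        p = mp.Process(
            target=run_distributed,
            args=(rank, world_size, decoder, clip_duration, data_name,
                  return_dict),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    # Each rank's entry holds its "epoch_1" and "epoch_2" clip lists.
    return dict(return_dict)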
    def test_video_name_with_whitespace_works(self, decoder):
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=num_frames,
                                fps=fps,
                                prefix="pre fix") as (
                                    video_file_name,
                                    data,
                                ):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
                f.write(f"{video_file_name} 0\n".encode())
                f.write(f"{video_file_name} 1\n".encode())

            total_duration = num_frames / fps
            clip_sampler = make_clip_sampler("uniform", total_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(f.name)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            expected = [(0, data), (1, data)]
            for i, sample in enumerate(dataset):
                self.assertTrue(sample["video"].equal(expected[i][1]))
                self.assertEqual(sample["label"], expected[i][0])
    def test_sampling_with_more_processes_than_videos(self, decoder):
        with mock_encoded_video_dataset_file() as (
                mock_csv,
                label_videos,
                total_duration,
        ):
            half_duration = total_duration / 2 - self._EPS
            clip_sampler = make_clip_sampler("uniform", half_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            # Split each full video into two clips.
            expected = []
            for label, data in label_videos:
                num_frames = data.shape[0]
                half_frames = num_frames // 2
                first_half_data = data[:, :half_frames]
                second_half_data = data[:, half_frames:]
                expected.append((label, first_half_data))
                expected.append((label, second_half_data))

            test_dataloader = DataLoader(dataset,
                                         batch_size=None,
                                         num_workers=16)
            actual = [(sample["label"], sample["video"])
                      for sample in test_dataloader]
            assert_unordered_list_compare_true(self, expected, actual)
Example No. 4
def Ava(
    frame_paths_file: str,
    frame_labels_file: str,
    video_path_prefix: str = "",
    label_map_file: Optional[str] = None,
    clip_sampler: Callable = ClipSampler,
    video_sampler: Type[
        torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
    transform: Optional[Callable[[dict], Any]] = None,
) -> LabeledVideoDataset:
    """
    Args:
        frame_paths_file (str): Path to a file containing relative paths
            to all the frames in the video. Each line in the file is of the
            form <original_vido_id video_id frame_id rel_path labels>

        frame_labels_file (str): Path to the file containing labels
            per key frame. Acceptable file formats are:
            Type 1:
                <original_vido_id, frame_time_stamp, bbox_x_1, bbox_y_1, ...
                bbox_x_2, bbox_y_2, action_label, detection_iou>
            Type 2:
                <original_vido_id, frame_time_stamp, bbox_x_1, bbox_y_1, ...
                bbox_x_2, bbox_y_2, action_label, person_label>

        video_path_prefix (str): Prefix prepended to each relative frame path to
            obtain the absolute frame path.

        label_map_file (str): Path to a .pbtxt containing class id's
            and class names. If not set, label_map is not loaded and bbox labels are
            not pruned based on allowable class_id's in label_map.

        clip_sampler (ClipSampler): Defines how clips should be sampled from each
                video.

        video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
                video container. This defines the order videos are decoded and,
                if necessary, the distributed split.

        transform (Optional[Callable]): This callable is evaluated on the clip output
            and the corresponding bounding boxes before they are returned. It can be
            used for user-defined preprocessing and augmentations of the clips. If
            transform is None, the clip and bounding boxes are returned as-is.
    """
    labeled_video_paths = AvaLabeledVideoFramePaths.from_csv(
        frame_paths_file,
        frame_labels_file,
        video_path_prefix,
        label_map_file,
    )
    return LabeledVideoDataset(
        labeled_video_paths=labeled_video_paths,
        clip_sampler=TimeStampClipSampler(clip_sampler),
        transform=transform,
        video_sampler=video_sampler,
        decode_audio=False,
    )
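
# Hedged usage sketch for the Ava helper above. The file paths are
# placeholders, and make_clip_sampler / SequentialSampler are assumed to be
# the same pytorchvideo and torch utilities used in the other examples here.
ava_dataset = Ava(
    frame_paths_file="frame_lists/train.csv",   # placeholder path
    frame_labels_file="annotations/train.csv",  # placeholder path
    video_path_prefix="/data/ava/frames",       # placeholder prefix
    clip_sampler=make_clip_sampler("random", 2.0),
    video_sampler=torch.utils.data.SequentialSampler,
)
# LabeledVideoDataset is an iterable-style dataset, so clips can be pulled
# directly or through a DataLoader with batch_size=None.
sample = next(iter(ava_dataset))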
    def test_reading_from_directory_structure(self, decoder):
        # For an unknown reason this import has to be here for `buck test` to work.
        import torchvision.io as io

        with tempfile.TemporaryDirectory() as root_dir:

            # Create test directory structure with two classes and a video in each.
            root_dir_name = pathlib.Path(root_dir)
            test_class_1 = root_dir_name / "running"
            test_class_1.mkdir()
            data_1 = create_dummy_video_frames(15, 10, 10)
            test_class_2 = root_dir_name / "cleaning windows"
            test_class_2.mkdir()
            data_2 = create_dummy_video_frames(20, 15, 15)
            with tempfile.NamedTemporaryFile(
                    suffix=".mp4",
                    dir=test_class_1) as f_1, tempfile.NamedTemporaryFile(
                        suffix=".mp4", dir=test_class_2) as f_2:
                f_1.close()
                f_2.close()

                # Write lossless video for each class.
                io.write_video(
                    f_1.name,
                    data_1,
                    fps=30,
                    video_codec="libx264rgb",
                    options={"crf": "0"},
                )
                io.write_video(
                    f_2.name,
                    data_2,
                    fps=30,
                    video_codec="libx264rgb",
                    options={"crf": "0"},
                )

                clip_sampler = make_clip_sampler("uniform", 3)
                labeled_video_paths = LabeledVideoPaths.from_path(root_dir)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                # Videos are sorted alphabetically so "cleaning windows" (i.e. data_2)
                # will be first.
                sample_1 = next(dataset)
                self.assertEqual(sample_1["label"], 0)
                self.assertTrue(sample_1["video"].equal(
                    thwc_to_cthw(data_2).to(torch.float32)))

                sample_2 = next(dataset)
                self.assertEqual(sample_2["label"], 1)
                self.assertTrue(sample_2["video"].equal(
                    thwc_to_cthw(data_1).to(torch.float32)))
    def test_sampling_with_non_divisible_processes_by_clips(self, decoder):

        # Make one video with 15 frames and one with 10 frames, producing 3 clips and 2
        # clips respectively.
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
                video_file_name_1,
                data_1,
        ):
            with temp_encoded_video(num_frames=num_frames, fps=fps) as (
                    video_file_name_2,
                    data_2,
            ):
                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix=".txt") as f:
                    f.write(f"{video_file_name_1} 0\n".encode())
                    f.write(f"{video_file_name_2} 1\n".encode())

                total_duration = num_frames / fps
                half_duration = total_duration / 2 - self._EPS
                clip_sampler = make_clip_sampler("uniform", half_duration)
                labeled_video_paths = LabeledVideoPaths.from_path(f.name)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                half_frames = num_frames // 2
                expected = {
                    (0, data_1[:, half_frames * 2:]),  # 3/3 clip
                    (0, data_1[:, half_frames:half_frames * 2]),  # 2/3 clip
                    (0, data_1[:, :half_frames]),  # 1/3 clip
                    (1, data_2[:, :half_frames]),  # First half
                    (1, data_2[:, half_frames:]),  # Second half
                }

                test_dataloader = DataLoader(dataset,
                                             batch_size=None,
                                             num_workers=2)
                actual = [(sample["label"], sample["video"])
                          for sample in test_dataloader]
                assert_unordered_list_compare_true(self, expected, actual)
    def test_constant_clips_per_video_sampling_works(self, decoder):
        # Make one video with 15 frames and one with 10 frames; the
        # constant_clips_per_video sampler below draws 2 clips from each.
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
                video_file_name_1,
                data_1,
        ):
            with temp_encoded_video(num_frames=num_frames, fps=fps) as (
                    video_file_name_2,
                    data_2,
            ):
                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix=".txt") as f:
                    f.write(f"{video_file_name_1} 0\n".encode())
                    f.write(f"{video_file_name_2} 1\n".encode())

                clip_frames = 2
                duration_for_frames = clip_frames / fps - self._EPS
                clip_sampler = make_clip_sampler("constant_clips_per_video",
                                                 duration_for_frames, 2)
                labeled_video_paths = LabeledVideoPaths.from_path(f.name)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                # Dataset has 2 videos. Each video has two evenly spaced clips of size
                # clip_frames sampled. The first clip of each video will always be
                # sampled at second 0. The second clip of the video is the next frame
                # from time: (total_duration - clip_duration) / 2
                half_frames_1 = math.ceil((data_1.shape[1] - clip_frames) / 2)
                half_frames_2 = math.ceil((data_2.shape[1] - clip_frames) / 2)
                expected = [
                    (0, data_1[:, :clip_frames]),
                    (0, data_1[:, half_frames_1:half_frames_1 + clip_frames]),
                    (1, data_2[:, :clip_frames]),
                    (1, data_2[:, half_frames_2:half_frames_2 + clip_frames]),
                ]
                for i, sample in enumerate(dataset):
                    self.assertTrue(sample["video"].equal(expected[i][1]))
                    self.assertEqual(sample["label"], expected[i][0])
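
# Worked check of the "constant_clips_per_video" spacing described in the
# comment above, assuming the same two temp videos (15 and 10 frames at
# 5 fps) and clip_frames = 2: the second clip starts at
# ceil(((total_duration - clip_duration) / 2) * fps) frames.
import math

fps = 5
clip_frames = 2
clip_duration = clip_frames / fps
for total_frames in (15, 10):
    total_duration = total_frames / fps
    start_sec = (total_duration - clip_duration) / 2
    start_frame = math.ceil(start_sec * fps)
    print(total_frames, start_frame)  # prints "15 7" then "10 4"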
Example No. 8
    def _make_encoded_video_dataset(
            self, data: SampleCollection) -> "LabeledVideoDataset":
        classes = self._get_classes(data)
        label_to_class_mapping = dict(enumerate(classes))
        class_to_label_mapping = {
            c: lab
            for lab, c in label_to_class_mapping.items()
        }

        filepaths = data.values("filepath")
        labels = data.values(self.label_field + ".label")
        targets = [class_to_label_mapping[lab] for lab in labels]
        labeled_video_paths = LabeledVideoPaths(list(zip(filepaths, targets)))

        ds: LabeledVideoDataset = LabeledVideoDataset(
            labeled_video_paths,
            self.clip_sampler,
            video_sampler=self.video_sampler,
            decode_audio=self.decode_audio,
            decoder=self.decoder,
        )
        return ds
    def test_random_clip_sampling_works(self, decoder):
        with mock_encoded_video_dataset_file() as (
                mock_csv,
                label_videos,
                total_duration,
        ):
            half_duration = total_duration / 2 - self._EPS
            clip_sampler = make_clip_sampler("random", half_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            expected_labels = [label for label, _ in label_videos]
            for i, sample in enumerate(dataset):
                expected_t_shape = 5
                self.assertEqual(sample["video"].shape[1], expected_t_shape)
                self.assertEqual(sample["label"], expected_labels[i])
Example No. 10
    def load_data(
        self,
        files: List[PATH_TYPE],
        targets: List[Any],
        clip_sampler: Union[str, "ClipSampler"] = "random",
        clip_duration: float = 2,
        clip_sampler_kwargs: Optional[Dict[str, Any]] = None,
        video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
        decode_audio: bool = False,
        decoder: str = "pyav",
        target_formatter: Optional[TargetFormatter] = None,
    ) -> "LabeledVideoDataset":
        dataset = LabeledVideoDataset(
            LabeledVideoPaths(list(zip(files, targets))),
            _make_clip_sampler(clip_sampler, clip_duration, clip_sampler_kwargs),
            video_sampler=video_sampler,
            decode_audio=decode_audio,
            decoder=decoder,
        )
        if not self.predicting:
            self.load_target_metadata(
                [sample[1] for sample in dataset._labeled_videos._paths_and_labels],
                target_formatter=target_formatter,
            )
        return dataset
Example No. 11
def clip_recognition_dataset(
    data_path: str,
    clip_sampler: ClipSampler,
    video_sampler: Type[
        torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
    transform: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
    video_path_prefix: str = "",
    decode_audio: bool = True,
    decoder: str = "pyav",
):
    """
    Builds a LabeledVideoDataset with noun, verb annotations from a json file with the following
    format:

        .. code-block:: text

            {
              "video_name1": {
                  "benchmarks": {
                      "forecasting_hands_objects": [
                          {
                              "critical_frame_selection_parent_start_sec": <start_sec>,
                              "critical_frame_selection_parent_end_sec": <end_sec>,
                              "taxonomy": {
                                  "noun": <label>,
                                  "verb": <label>,
                              }
                          },
                          {
                              ...
                          }
                      ]
                  }
              },
              "video_name2": {...},
              ....
              "video_nameN": {...}
            }

    Args:
        data_path (str): Path to the json file containing the clip annotations in
            the format shown above.

        video_path_prefix (str): Prefix prepended to each video name in the json
            file to obtain the full video path.

        clip_sampler (ClipSampler): Defines how clips should be sampled from each
            video. See the clip sampling documentation for more information.

        video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
            video container. This defines the order videos are decoded and,
            if necessary, the distributed split.

        transform (Callable): This callable is evaluated on the clip output before
            the clip is returned. It can be used for user defined preprocessing and
            augmentations on the clips. The clip output format is described in __next__().

        decode_audio (bool): If True, also decode audio from video.

        decoder (str): Defines the type of decoder used to decode the video. Not
            used for frame videos.
    """
    if g_pathmgr.isfile(data_path):
        try:
            with g_pathmgr.open(data_path, "r") as f:
                annotations = json.load(f)
        except Exception:
            raise FileNotFoundError(
                f"{data_path} must be json for Ego4D dataset")

        # LabeledVideoDataset requires the data to be a list of tuples of the form
        # (video_path, annotation_dict).
        untrimmed_clip_annotations = []
        for video_name, child in annotations.items():
            video_path = os.path.join(video_path_prefix, video_name)
            for clip_annotation in child["benchmarks"][
                    "forecasting_hands_objects"]:
                clip_start = clip_annotation[
                    "critical_frame_selection_parent_start_sec"]
                clip_end = clip_annotation[
                    "critical_frame_selection_parent_end_sec"]
                taxonomy = clip_annotation["taxonomy"]
                noun_label = taxonomy["noun"]
                verb_label = taxonomy["verb"]
                verb_unsure = taxonomy["verb_unsure"]
                noun_unsure = taxonomy["noun_unsure"]
                if (noun_label is None or verb_label is None or verb_unsure
                        or noun_unsure):
                    continue

                untrimmed_clip_annotations.append((
                    video_path,
                    {
                        "clip_start_sec": clip_start,
                        "clip_end_sec": clip_end,
                        "noun_label": noun_label,
                        "verb_label": verb_label,
                    },
                ))
    else:
        raise FileNotFoundError(f"{data_path} not found.")

    # Map noun and verb keywords to unique indices.
    def map_labels_to_index(label_name):
        labels = list(
            {info[label_name]
             for _, info in untrimmed_clip_annotations})
        label_to_idx = {label: i for i, label in enumerate(labels)}
        for i in range(len(untrimmed_clip_annotations)):
            label = untrimmed_clip_annotations[i][1][label_name]
            untrimmed_clip_annotations[i][1][label_name] = label_to_idx[label]

    map_labels_to_index("noun_label")
    map_labels_to_index("verb_label")

    dataset = LabeledVideoDataset(
        untrimmed_clip_annotations,
        UntrimmedClipSampler(clip_sampler),
        video_sampler,
        transform,
        decode_audio=decode_audio,
        decoder=decoder,
    )
    return dataset
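
# Hedged usage sketch for clip_recognition_dataset above. The annotation
# path and video prefix are placeholders; make_clip_sampler and
# SequentialSampler are the same utilities used in the earlier examples.
ego4d_dataset = clip_recognition_dataset(
    data_path="annotations/forecasting.json",   # placeholder path
    clip_sampler=make_clip_sampler("random", 2.0),
    video_sampler=torch.utils.data.SequentialSampler,
    video_path_prefix="/data/ego4d/videos",     # placeholder prefix
    decode_audio=False,
)
# Each yielded clip dict should carry the remapped integer annotations set
# above (e.g. "noun_label" and "verb_label") alongside the decoded "video".
sample = next(iter(ego4d_dataset))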
Example No. 12
def video_only_dataset(
    data_path: str,
    clip_sampler: ClipSampler,
    video_sampler: Type[
        torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
    transform: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
    video_path_prefix: str = "",
    decode_audio: bool = True,
    decoder: str = "pyav",
):
    """
    Builds a LabeledVideoDataset with no annotations from a json file with the following
    format:

        .. code-block:: text

            {
              "video_name1": {...}
              "video_name2": {...}
              ....
              "video_nameN": {...}
            }

    Args:
        data_path (str): Path to the json file listing the video names, in the
            format shown above.

        video_path_prefix (str): Prefix prepended to each video name in the json
            file to obtain the full video path.

        clip_sampler (ClipSampler): Defines how clips should be sampled from each
            video. See the clip sampling documentation for more information.

        video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
            video container. This defines the order videos are decoded and,
            if necessary, the distributed split.

        transform (Callable): This callable is evaluated on the clip output before
            the clip is returned. It can be used for user defined preprocessing and
            augmentations on the clips. The clip output format is described in __next__().

        decode_audio (bool): If True, also decode audio from video.

        decoder (str): Defines the type of decoder used to decode the video. Not
            used for frame videos.
    """

    torch._C._log_api_usage_once(
        "PYTORCHVIDEO.dataset.json_dataset.video_only_dataset")

    if g_pathmgr.isfile(data_path):
        try:
            with g_pathmgr.open(data_path, "r") as f:
                annotations = json.load(f)
        except Exception:
            raise FileNotFoundError(
                f"{data_path} must be json for Ego4D dataset")

        # LabeledVideoDataset requires the data to be a list of tuples of the form
        # (video_path, annotation_dict); with no annotations we just pass an empty dict.
        video_paths = [(os.path.join(video_path_prefix, x), {})
                       for x in annotations.keys()]
    else:
        raise FileNotFoundError(f"{data_path} not found.")

    dataset = LabeledVideoDataset(
        video_paths,
        clip_sampler,
        video_sampler,
        transform,
        decode_audio=decode_audio,
        decoder=decoder,
    )
    return dataset
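
# Hedged usage sketch for video_only_dataset above, feeding unlabeled clips
# through a DataLoader. Paths are placeholders; a "uniform" clip sampler
# walks each video end to end in fixed-duration clips.
from torch.utils.data import DataLoader

unlabeled_dataset = video_only_dataset(
    data_path="annotations/videos.json",        # placeholder path
    clip_sampler=make_clip_sampler("uniform", 2.0),
    video_sampler=torch.utils.data.SequentialSampler,
    video_path_prefix="/data/ego4d/videos",     # placeholder prefix
    decode_audio=False,
)
loader = DataLoader(unlabeled_dataset, batch_size=None, num_workers=0)
for clip in loader:
    video_tensor = clip["video"]  # decoded clip tensor, channels first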