Example #1
    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
    ) -> IterDataPipe[Dict[str, Any]]:
        archive_dp = resource_dps[0]

        splits_dp, joint_categories_dp, images_dp = Demultiplexer(
            archive_dp, 3, self._classify_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE
        )

        splits_dp = Filter(splits_dp, path_comparator("name", f"{config.split}{config.fold}.txt"))
        splits_dp = LineReader(splits_dp, decode=True, return_path=False)
        splits_dp = Shuffler(splits_dp, buffer_size=INFINITE_BUFFER_SIZE)
        splits_dp = hint_sharding(splits_dp)

        joint_categories_dp = CSVParser(joint_categories_dp, delimiter=" ")

        dp = IterKeyZipper(
            splits_dp,
            joint_categories_dp,
            key_fn=getitem(),
            ref_key_fn=getitem(0),
            buffer_size=INFINITE_BUFFER_SIZE,
        )
        dp = IterKeyZipper(
            dp,
            images_dp,
            key_fn=getitem(0),
            ref_key_fn=self._image_key_fn,
            buffer_size=INFINITE_BUFFER_SIZE,
        )
        return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
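The Demultiplexer call above fans a single archive stream out into several pipes: the classifier returns the index of the target pipe, or None to drop the item. A minimal runnable sketch of that routing, using toy path strings in place of the (path, stream) tuples that self._classify_archive actually receives (classify and its rules are hypothetical stand-ins):

    from typing import Optional

    from torchdata.datapipes.iter import Demultiplexer, IterableWrapper

    archive_dp = IterableWrapper(["labels/train1.txt", "joint.csv", "images/dog_0001.jpg"])

    def classify(item: str) -> Optional[int]:
        # Index of the output pipe this item belongs to, or None to drop it.
        if item.startswith("labels/"):
            return 0
        if item.endswith(".csv"):
            return 1
        if item.endswith(".jpg"):
            return 2
        return None

    splits_dp, joint_categories_dp, images_dp = Demultiplexer(
        archive_dp, 3, classify, drop_none=True, buffer_size=100
    )
    print(list(splits_dp))  # ['labels/train1.txt']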
Example #2
    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
    ) -> IterDataPipe[Dict[str, Any]]:
        archive_dp, extra_split_dp = resource_dps

        split_dp, images_dp, anns_dp = Demultiplexer(
            archive_dp,
            3,
            self._classify_archive,
            buffer_size=INFINITE_BUFFER_SIZE,
            drop_none=True,
        )
        if config.split == "train_noval":
            split_dp = extra_split_dp

        split_dp = Filter(split_dp,
                          path_comparator("name", f"{config.split}.txt"))
        split_dp = LineReader(split_dp, decode=True)
        split_dp = hint_sharding(split_dp)
        split_dp = hint_shuffling(split_dp)

        dp = split_dp
        for level, data_dp in enumerate((images_dp, anns_dp)):
            dp = IterKeyZipper(
                dp,
                data_dp,
                key_fn=getitem(*[0] * level, 1),
                ref_key_fn=path_accessor("stem"),
                buffer_size=INFINITE_BUFFER_SIZE,
            )
        return Mapper(dp, self._prepare_sample)
Example #3
    def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
        archive_dp = resource_dps[0]
        split_dp, images_dp, anns_dp = Demultiplexer(
            archive_dp,
            3,
            self._classify_archive,
            drop_none=True,
            buffer_size=INFINITE_BUFFER_SIZE,
        )

        split_dp = Filter(split_dp, functools.partial(self._is_in_folder, name=self._split_folder))
        split_dp = Filter(split_dp, path_comparator("name", f"{self._split}.txt"))
        split_dp = LineReader(split_dp, decode=True)
        split_dp = hint_shuffling(split_dp)
        split_dp = hint_sharding(split_dp)

        dp = split_dp
        for level, data_dp in enumerate((images_dp, anns_dp)):
            dp = IterKeyZipper(
                dp,
                data_dp,
                key_fn=getitem(*[0] * level, 1),
                ref_key_fn=path_accessor("stem"),
                buffer_size=INFINITE_BUFFER_SIZE,
            )
        return Mapper(dp, self._prepare_sample)
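Examples #2 and #3 reuse one subtle trick: key_fn=getitem(*[0] * level, 1). Each IterKeyZipper wraps its two inputs in a plain 2-tuple, so after `level` zips the original (path, line) pair from split_dp sits `level` tuple-levels deep at index 0, with the line itself at index 1. A pure-Python sketch, with a toy getitem standing in for the helper from torchvision's prototype utilities:

    def getitem(*indices):
        # Successively index into an object: getitem(0, 1)(x) == x[0][1].
        def fn(obj):
            for index in indices:
                obj = obj[index]
            return obj
        return fn

    level0 = ("train.txt", "2007_000027")               # (path, line) from LineReader
    level1 = (level0, ("2007_000027.jpg", "<stream>"))  # after the first zip

    assert getitem(*[0] * 0, 1)(level0) == "2007_000027"  # key against images_dp
    assert getitem(*[0] * 1, 1)(level1) == "2007_000027"  # key against anns_dp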
Example #4
    def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
        dp = Decompressor(resource_dps[0])
        dp = LineReader(dp, decode=True, return_path=False)
        dp = hint_shuffling(dp)
        dp = hint_sharding(dp)
        return Mapper(dp, self._prepare_sample)
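Example #4 is the smallest complete pipeline: decompress the single resource, read it line by line, and build one sample per line. A runnable sketch of the same chain against an in-memory gzip file instead of a downloaded resource (the torchvision-internal hint_* helpers are omitted):

    import gzip
    import io

    from torchdata.datapipes.iter import Decompressor, IterableWrapper, LineReader

    payload = gzip.compress(b"first sample\nsecond sample\n")
    dp = IterableWrapper([("data.txt.gz", io.BytesIO(payload))])
    dp = Decompressor(dp)  # infers the compression algorithm from the suffix
    dp = LineReader(dp, decode=True, return_path=False)
    print(list(dp))  # ['first sample', 'second sample']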
Example #5
    def _make_datapipe(self, resource_dps: List[IterDataPipe], *,
                       config: DatasetConfig) -> IterDataPipe[Dict[str, Any]]:
        if config.split in {"train", "test"}:
            dp = resource_dps[0]

            # the train archive is a tar of tars
            if config.split == "train":
                dp = TarArchiveReader(dp)

            dp = hint_sharding(dp)
            dp = hint_shuffling(dp)
            dp = Mapper(
                dp,
                self._prepare_train_data if config.split == "train" else self._prepare_test_data,
            )
        else:  # config.split == "val"
            images_dp, devkit_dp = resource_dps

            meta_dp, label_dp = Demultiplexer(devkit_dp,
                                              2,
                                              self._classifiy_devkit,
                                              drop_none=True,
                                              buffer_size=INFINITE_BUFFER_SIZE)

            meta_dp = Mapper(meta_dp, self._extract_categories_and_wnids)
            _, wnids = zip(*next(iter(meta_dp)))

            label_dp = LineReader(label_dp, decode=True, return_path=False)
            label_dp = Mapper(
                label_dp,
                functools.partial(self._imagenet_label_to_wnid, wnids=wnids))
            label_dp: IterDataPipe[Tuple[int, str]] = Enumerator(label_dp, 1)
            label_dp = hint_sharding(label_dp)
            label_dp = hint_shuffling(label_dp)

            dp = IterKeyZipper(
                label_dp,
                images_dp,
                key_fn=getitem(0),
                ref_key_fn=self._val_test_image_key,
                buffer_size=INFINITE_BUFFER_SIZE,
            )
            dp = Mapper(dp, self._prepare_val_data)

        return Mapper(dp, self._prepare_sample)
Example #6
    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
    ) -> IterDataPipe[Dict[str, Any]]:
        images_dp, devkit_dp = resource_dps

        if config.split == "train":
            # the train archive is a tar of tars
            dp = TarArchiveReader(images_dp)
            dp = hint_sharding(dp)
            dp = hint_shuffling(dp)
            dp = Mapper(dp, self._collate_train_data)
        elif config.split == "val":
            devkit_dp = Filter(
                devkit_dp,
                path_comparator("name",
                                "ILSVRC2012_validation_ground_truth.txt"))
            devkit_dp = LineReader(devkit_dp, return_path=False)
            devkit_dp = Mapper(devkit_dp, int)
            devkit_dp = Enumerator(devkit_dp, 1)
            devkit_dp = hint_sharding(devkit_dp)
            devkit_dp = hint_shuffling(devkit_dp)

            dp = IterKeyZipper(
                devkit_dp,
                images_dp,
                key_fn=getitem(0),
                ref_key_fn=self._val_test_image_key,
                buffer_size=INFINITE_BUFFER_SIZE,
            )
            dp = Mapper(dp, self._collate_val_data)
        else:  # config.split == "test"
            dp = hint_sharding(images_dp)
            dp = hint_shuffling(dp)
            dp = Mapper(dp, self._collate_test_data)

        return Mapper(
            dp,
            functools.partial(self._collate_and_decode_sample,
                              decoder=decoder))
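The val branches in Examples #5 and #6 join the ground-truth file to the image archive by position: Enumerator numbers the labels starting at 1, and IterKeyZipper matches that number against the index encoded in each validation file name. A sketch with toy data; image_key is a hypothetical stand-in for self._val_test_image_key:

    from torchdata.datapipes.iter import Enumerator, IterableWrapper, IterKeyZipper

    label_dp = Enumerator(IterableWrapper([490, 361]), 1)  # -> (1, 490), (2, 361)

    images_dp = IterableWrapper([
        ("ILSVRC2012_val_00000001.JPEG", "<stream>"),
        ("ILSVRC2012_val_00000002.JPEG", "<stream>"),
    ])

    def image_key(data):
        # Pull the running index out of a name like ILSVRC2012_val_00000001.JPEG.
        path, _ = data
        return int(path.rsplit("_", 1)[1].split(".")[0])

    dp = IterKeyZipper(
        label_dp,
        images_dp,
        key_fn=lambda label: label[0],
        ref_key_fn=image_key,
    )
    print(list(dp))
    # [((1, 490), ('ILSVRC2012_val_00000001.JPEG', '<stream>')),
    #  ((2, 361), ('ILSVRC2012_val_00000002.JPEG', '<stream>'))]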
Example #7
File: voc.py Project: nairbv/vision
    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
    ) -> IterDataPipe[Dict[str, Any]]:
        archive_dp = resource_dps[0]
        split_dp, images_dp, anns_dp = Demultiplexer(
            archive_dp,
            3,
            functools.partial(self._classify_archive, config=config),
            drop_none=True,
            buffer_size=INFINITE_BUFFER_SIZE,
        )

        split_dp = Filter(
            split_dp,
            functools.partial(self._is_in_folder,
                              name=self._SPLIT_FOLDER[config.task]))
        split_dp = Filter(split_dp,
                          path_comparator("name", f"{config.split}.txt"))
        split_dp = LineReader(split_dp, decode=True)
        split_dp = hint_sharding(split_dp)
        split_dp = hint_shuffling(split_dp)

        dp = split_dp
        for level, data_dp in enumerate((images_dp, anns_dp)):
            dp = IterKeyZipper(
                dp,
                data_dp,
                key_fn=getitem(*[0] * level, 1),
                ref_key_fn=path_accessor("stem"),
                buffer_size=INFINITE_BUFFER_SIZE,
            )
        return Mapper(
            dp,
            functools.partial(
                self._prepare_sample,
                prepare_ann_fn=self._prepare_detection_ann
                if config.task == "detection"
                else self._prepare_segmentation_ann,
            ),
        )
Example #8
    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
    ) -> IterDataPipe[Dict[str, Any]]:
        archive_dp, extra_split_dp = resource_dps

        split_dp, images_dp, anns_dp = Demultiplexer(
            archive_dp,
            3,
            self._classify_archive,
            buffer_size=INFINITE_BUFFER_SIZE,
            drop_none=True,
        )

        if config.split == "train_noval":
            split_dp = extra_split_dp
        split_dp = Filter(split_dp, path_comparator("stem", config.split))
        split_dp = LineReader(split_dp, decode=True)
        split_dp = hint_sharding(split_dp)
        split_dp = hint_shuffling(split_dp)

        dp = split_dp
        for level, data_dp in enumerate((images_dp, anns_dp)):
            dp = IterKeyZipper(
                dp,
                data_dp,
                key_fn=getitem(*[0] * level, 1),
                ref_key_fn=path_accessor("stem"),
                buffer_size=INFINITE_BUFFER_SIZE,
            )
        return Mapper(
            dp,
            functools.partial(self._collate_and_decode_sample,
                              config=config,
                              decoder=decoder))
Example #9
    def _generate_categories(self) -> Tuple[str, ...]:
        resources = self._resources()

        dp = resources[0].load(self._root)
        dp = Filter(dp, path_comparator("name", "category_names.m"))
        dp = LineReader(dp)
        dp = Mapper(dp, bytes.decode, input_col=1)
        lines = tuple(zip(*iter(dp)))[1]

        pattern = re.compile(r"\s*'(?P<category>\w+)';\s*%(?P<label>\d+)")
        categories_and_labels = cast(
            List[Tuple[str, ...]],
            [
                pattern.match(line).groups()  # type: ignore[union-attr]
                # the first and last line contain no information
                for line in lines[1:-1]
            ],
        )
        categories_and_labels.sort(key=lambda category_and_label: int(category_and_label[1]))
        categories, _ = zip(*categories_and_labels)

        return categories
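The regex above targets MATLAB-style lines in category_names.m, which pair a quoted category name with its numeric label in a trailing comment. A quick check on one made-up line:

    import re

    pattern = re.compile(r"\s*'(?P<category>\w+)';\s*%(?P<label>\d+)")
    match = pattern.match("    'bear';   %8")
    assert match is not None
    assert match.groups() == ("bear", "8")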
Example #10
    def _datapipe(
            self,
            resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
        archive_dp = resource_dps[0]
        images_dp, split_dp = Demultiplexer(archive_dp,
                                            2,
                                            self._classify_archive,
                                            drop_none=True,
                                            buffer_size=INFINITE_BUFFER_SIZE)
        split_dp = Filter(split_dp,
                          path_comparator("name", f"{self._split}.txt"))
        split_dp = LineReader(split_dp, decode=True, return_path=False)
        split_dp = hint_sharding(split_dp)
        split_dp = hint_shuffling(split_dp)

        dp = IterKeyZipper(
            split_dp,
            images_dp,
            key_fn=getitem(),
            ref_key_fn=self._image_key,
            buffer_size=INFINITE_BUFFER_SIZE,
        )

        return Mapper(dp, self._prepare_sample)
Example #11
    def _generate_categories(self) -> List[str]:
        resources = self._resources()
        dp = resources[0].load(self._root)
        dp = Filter(dp, path_comparator("name", "classes.txt"))
        dp = LineReader(dp, decode=True, return_path=False)
        return list(dp)
Example #12
    def _datapipe(
            self,
            resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
        prepare_ann_fn: Callable
        if self._year == "2011":
            archive_dp, segmentations_dp = resource_dps
            images_dp, split_dp, image_files_dp, bounding_boxes_dp = Demultiplexer(
                archive_dp,
                4,
                self._2011_classify_archive,
                drop_none=True,
                buffer_size=INFINITE_BUFFER_SIZE)

            image_files_dp = CSVParser(image_files_dp, dialect="cub200")
            image_files_map = {
                image_id: rel_posix_path.rsplit("/", maxsplit=1)[1]
                for image_id, rel_posix_path in image_files_dp
            }

            split_dp = CSVParser(split_dp, dialect="cub200")
            split_dp = Filter(split_dp, self._2011_filter_split)
            split_dp = Mapper(split_dp, getitem(0))
            split_dp = Mapper(split_dp, image_files_map.get)

            bounding_boxes_dp = CSVParser(bounding_boxes_dp, dialect="cub200")
            bounding_boxes_dp = Mapper(bounding_boxes_dp,
                                       image_files_map.get,
                                       input_col=0)

            anns_dp = IterKeyZipper(
                bounding_boxes_dp,
                segmentations_dp,
                key_fn=getitem(0),
                ref_key_fn=self._2011_segmentation_key,
                keep_key=True,
                buffer_size=INFINITE_BUFFER_SIZE,
            )

            prepare_ann_fn = self._2011_prepare_ann
        else:  # self._year == "2010"
            split_dp, images_dp, anns_dp = resource_dps

            split_dp = Filter(split_dp,
                              path_comparator("name", f"{self._split}.txt"))
            split_dp = LineReader(split_dp, decode=True, return_path=False)
            split_dp = Mapper(split_dp, self._2010_split_key)

            anns_dp = Mapper(anns_dp, self._2010_anns_key)

            prepare_ann_fn = self._2010_prepare_ann

        split_dp = hint_shuffling(split_dp)
        split_dp = hint_sharding(split_dp)

        dp = IterKeyZipper(
            split_dp,
            images_dp,
            getitem(),
            path_accessor("name"),
            buffer_size=INFINITE_BUFFER_SIZE,
        )
        dp = IterKeyZipper(
            dp,
            anns_dp,
            getitem(0),
            buffer_size=INFINITE_BUFFER_SIZE,
        )
        return Mapper(
            dp,
            functools.partial(self._prepare_sample,
                              prepare_ann_fn=prepare_ann_fn))
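The image_files_map built in the 2011 branch comes from images.txt, whose rows pair an image id with a relative path; only the file name is kept so it can later be matched against other id-keyed files. A sketch with an in-memory file, using a plain space delimiter in place of the "cub200" CSV dialect registered elsewhere in the module:

    import io

    from torchdata.datapipes.iter import CSVParser, IterableWrapper

    raw = b"1 001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg\n"
    dp = IterableWrapper([("images.txt", io.BytesIO(raw))])
    dp = CSVParser(dp, delimiter=" ")  # one parsed row per line

    image_files_map = {
        image_id: rel_posix_path.rsplit("/", maxsplit=1)[1]
        for image_id, rel_posix_path in dp
    }
    print(image_files_map)  # {'1': 'Black_Footed_Albatross_0046_18.jpg'}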
Example #13
    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
    ) -> IterDataPipe[Dict[str, Any]]:
        if config.year == "2011":
            archive_dp, segmentations_dp = resource_dps
            images_dp, split_dp, image_files_dp, bounding_boxes_dp = Demultiplexer(
                archive_dp,
                4,
                self._2011_classify_archive,
                drop_none=True,
                buffer_size=INFINITE_BUFFER_SIZE)

            image_files_dp = CSVParser(image_files_dp, dialect="cub200")
            image_files_map = {
                image_id: rel_posix_path.rsplit("/", maxsplit=1)[1]
                for image_id, rel_posix_path in image_files_dp
            }

            split_dp = CSVParser(split_dp, dialect="cub200")
            split_dp = Filter(
                split_dp,
                functools.partial(self._2011_filter_split, split=config.split))
            split_dp = Mapper(split_dp, getitem(0))
            split_dp = Mapper(split_dp, image_files_map.get)

            bounding_boxes_dp = CSVParser(bounding_boxes_dp, dialect="cub200")
            bounding_boxes_dp = Mapper(bounding_boxes_dp,
                                       image_files_map.get,
                                       input_col=0)

            anns_dp = IterKeyZipper(
                bounding_boxes_dp,
                segmentations_dp,
                key_fn=getitem(0),
                ref_key_fn=self._2011_segmentation_key,
                keep_key=True,
                buffer_size=INFINITE_BUFFER_SIZE,
            )
        else:  # config.year == "2010"
            split_dp, images_dp, anns_dp = resource_dps

            split_dp = Filter(split_dp,
                              path_comparator("name", f"{config.split}.txt"))
            split_dp = LineReader(split_dp, decode=True, return_path=False)
            split_dp = Mapper(split_dp, self._2010_split_key)

            anns_dp = Mapper(anns_dp, self._2010_anns_key)

        split_dp = hint_sharding(split_dp)
        split_dp = hint_shuffling(split_dp)

        dp = IterKeyZipper(
            split_dp,
            images_dp,
            getitem(),
            path_accessor("name"),
            buffer_size=INFINITE_BUFFER_SIZE,
        )
        dp = IterKeyZipper(
            dp,
            anns_dp,
            getitem(0),
            buffer_size=INFINITE_BUFFER_SIZE,
        )
        return Mapper(
            dp,
            functools.partial(self._collate_and_decode_sample,
                              year=config.year,
                              decoder=decoder))