Esempio n. 1
0
    def classes(self,
                annotation_type: str,
                release_name: Optional[str] = None):
        """
        Lists the classes of type `annotation_type` for one release of the dataset

        When `release_name` is None or "latest", the concrete release name is
        resolved through `self.get_release("latest")` before the lookup.

        Parameters
        ----------
        annotation_type: str
            The type of annotation classes, e.g. 'tag' or 'polygon'
        release_name: str
            Version of the dataset (None and "latest" both select the latest release)

        Returns
        -------
        classes: list
            List of classes in the dataset of type `annotation_type`,
            as produced by `get_classes`
        """
        # The lookup reads from the local copy of the dataset, so it has to exist
        assert self.local_path.exists()

        wants_latest = release_name is None or release_name == "latest"
        resolved_name = self.get_release("latest").name if wants_latest else release_name

        return get_classes(self.local_path, release_name=resolved_name, annotation_type=annotation_type)
Esempio n. 2
0
    def classes(self, annotation_type: str):
        """
        Lists the classes of type `annotation_type` found in the local dataset

        Parameters
        ----------
        annotation_type: str
            The type of annotation classes, e.g. 'tag' or 'polygon'

        Returns
        -------
        classes: list
            List of classes in the dataset of type `annotation_type`,
            as produced by `get_classes`
        """
        dataset_root = self.local_path
        # The lookup reads from the local copy of the dataset, so it has to exist
        assert dataset_root.exists()
        return get_classes(dataset_root, annotation_type=annotation_type)
Esempio n. 3
0
    def __init__(
        self,
        dataset_path: Path,
        annotation_type: str,
        partition: Optional[str] = None,
        split: str = "default",
        split_type: str = "random",
        release_name: Optional[str] = None,
    ):
        """ Creates a dataset

        Collects, for the selected release (and optionally one partition of a split),
        the annotation files and their matching images, together with the list of
        classes of the requested annotation type.

        Parameters
        ----------
        dataset_path: Path, str
            Path to the location of the dataset on the file system
        annotation_type: str
            The type of annotation classes [tag, bounding_box, polygon]
        partition: str
            Selects one of the partitions [train, val, test]. When omitted, every
            `*.json` file in the release's annotations directory is used
        split: str
            Selects the split that defines the percentages used (use 'default' to select the default split)
        split_type: str
            Heuristic used to do the split [random, stratified]
        release_name: str
            Version of the dataset

        Raises
        ------
        AssertionError
            If `dataset_path` is None, or if the release's "annotations" directory or
            the dataset's "images" directory does not exist
        ValueError
            If `partition`, `split_type` or `annotation_type` has an unsupported value,
            if an annotation has no image or more than one image, or if no image is
            found at all
        FileNotFoundError
            If a partition is requested but its split list file does not exist
        """
        assert dataset_path is not None, "dataset_path must not be None"
        # The docstring allows plain strings, but the `/` joins below require a Path
        if isinstance(dataset_path, str):
            dataset_path = Path(dataset_path)

        release_path = get_release_path(dataset_path, release_name)
        annotations_dir = release_path / "annotations"
        assert annotations_dir.exists(), f"Annotations directory not found: {annotations_dir}"
        images_dir = dataset_path / "images"
        assert images_dir.exists(), f"Images directory not found: {images_dir}"

        if partition not in ["train", "val", "test", None]:
            raise ValueError("partition should be either 'train', 'val', or 'test'")
        if split_type not in ["random", "stratified"]:
            raise ValueError("split_type should be either 'random', 'stratified'")
        if annotation_type not in ["tag", "polygon", "bounding_box"]:
            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")

        self.dataset_path = dataset_path
        self.annotation_type = annotation_type
        self.images_path: List[Path] = []
        self.annotations_path: List[Path] = []
        self.original_classes = None
        self.original_images_path: Optional[List[Path]] = None
        self.original_annotations_path: Optional[List[Path]] = None

        # Get the list of classes
        self.classes = get_classes(
            self.dataset_path, release_name, annotation_type=self.annotation_type, remove_background=True
        )
        self.num_classes = len(self.classes)

        # Get the list of stems
        if partition:
            # Get the split: stratified lists are stored per annotation type, random ones are not
            if split_type == "random":
                split_file = f"{split_type}_{partition}.txt"
            elif split_type == "stratified":
                split_file = f"{split_type}_{annotation_type}_{partition}.txt"
            split_path = release_path / "lists" / split / split_file
            if not split_path.is_file():
                raise FileNotFoundError(
                    "could not find a dataset partition. "
                    "Split the dataset using `split_dataset()` from `darwin.dataset.utils`"
                ) from None
            # Read the whole list eagerly so that the file handle is closed right away
            with split_path.open() as split_list:
                stems = [line.strip() for line in split_list]
        else:
            # If the partition is not specified, get all the annotations
            stems = [e.stem for e in annotations_dir.glob("*.json")]

        # Find all the annotations and their corresponding images
        for stem in stems:
            annotation_path = annotations_dir / f"{stem}.json"
            candidates = [images_dir / f"{stem}{ext}" for ext in SUPPORTED_IMAGE_EXTENSIONS]
            images = [image_path for image_path in candidates if image_path.exists()]
            if not images:
                raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")
            if len(images) > 1:
                raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.")
            self.images_path.append(images[0])
            self.annotations_path.append(annotation_path)

        if not self.images_path:
            # Single message string: two separate arguments would be rendered as a tuple
            raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file in {images_dir}")

        assert len(self.images_path) == len(self.annotations_path)
Esempio n. 4
0
def detectron2_register_dataset(
    dataset: str,
    release_name: Optional[str] = "latest",
    partition: Optional[str] = None,
    split: Optional[str] = "default",
    split_type: Optional[str] = "stratified",
    evaluator_type: Optional[str] = None,
) -> str:
    """Registers a local Darwin-formatted dataset in Detectron2

    `dataset` is used directly when it is an existing directory. Otherwise it is
    parsed as a dataset identifier and matched by slug against the local datasets
    of the identified team. Polygon annotations are registered in COCO format and
    the polygon classes are stored as `thing_classes` in the catalog metadata.

    Parameters
    ----------
    dataset: str
        Dataset slug, or path to a local dataset directory
    release_name: str
        Version of the dataset. A version embedded in the dataset identifier
        takes precedence over this value
    partition: str
        Selects one of the partitions [train, val, test]
    split: str
        Selects the split that defines the percentages used (use 'default' to select the default split)
    split_type: str
        Heuristic used to do the split [random, stratified]
    evaluator_type: str
        Evaluator to be used in the val and test sets

    Returns
    -------
    str
        Name the dataset was registered under: "darwin_<directory name>",
        followed by "_<partition>" when a partition is given
    """
    try:
        from detectron2.data import DatasetCatalog, MetadataCatalog
    except ImportError:
        print("Detectron2 not found.")
        sys.exit(1)
    from darwin.dataset.utils import get_annotations, get_classes

    if os.path.isdir(dataset):
        dataset_path: Optional[Path] = Path(dataset)
    else:
        identifier = DatasetIdentifier.parse(dataset)
        release_name = identifier.version or release_name

        client = _load_client(offline=True)
        local_matches = [
            candidate
            for candidate in client.list_local_datasets(team_slug=identifier.team_slug)
            if identifier.dataset_slug == candidate.name
        ]
        # When several local datasets carry the same slug, the last one listed is used
        dataset_path = local_matches[-1] if local_matches else None

        # NOTE(review): the code after this check assumes `_error` does not return - confirm
        if not dataset_path:
            _error(
                f"Dataset '{identifier.dataset_slug}' does not exist locally. "
                f"Use 'darwin dataset remote' to see all the available datasets, "
                f"and 'darwin dataset pull' to pull them.")

    partition_suffix = f"_{partition}" if partition else ""
    catalog_name = f"darwin_{dataset_path.name}{partition_suffix}"

    classes = get_classes(dataset_path=dataset_path, release_name=release_name, annotation_type="polygon")

    def _load_records(partition=partition):
        # Runs each time Detectron2 invokes the registered callable for this catalog entry
        records = get_annotations(
            dataset_path,
            partition=partition,
            split=split,
            split_type=split_type,
            release_name=release_name,
            annotation_type="polygon",
            annotation_format="coco",
            ignore_inconsistent_examples=True,
        )
        return list(records)

    DatasetCatalog.register(catalog_name, _load_records)
    MetadataCatalog.get(catalog_name).set(thing_classes=classes)
    if evaluator_type:
        MetadataCatalog.get(catalog_name).set(evaluator_type=evaluator_type)
    return catalog_name