Example #1
def dataset_convert(dataset_identifier: str,
                    format: str,
                    output_dir: Optional[PathLike] = None) -> None:
    """
    Converts the annotations from the given dataset to the given format.
    Exits the application if no dataset with the given slug exists or no releases for the dataset
    were previously pulled.

    Parameters
    ----------
    dataset_identifier: str
        The dataset identifier, normally in the "<team-slug>/<dataset-slug>:<version>" form.
    format: str
        The format we want to convert to.
    output_dir: Optional[PathLike]
        The folder where the exported annotation files will be placed. If None, the files go
        under 'other_formats/{format}' inside the dataset's release folder. Defaults to None.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_identifier)
    client: Client = _load_client(team_slug=identifier.team_slug)

    try:
        parser: ExportParser = get_exporter(format)
        dataset: RemoteDataset = client.get_remote_dataset(
            dataset_identifier=identifier)
        if not dataset.local_path.exists():
            _error(
                f"No annotations downloaded for dataset f{dataset}, first pull a release using "
                f"'darwin dataset pull {identifier}'")

        release_path: Path = get_release_path(dataset.local_path,
                                              identifier.version)
        annotations_path: Path = release_path / "annotations"
        if output_dir is None:
            output_dir = release_path / "other_formats" / format
        else:
            output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        export_annotations(parser, [annotations_path], output_dir)
    except (ExporterNotFoundError, AttributeError):
        _error(
            f"Unsupported export format: {format}, currently supported: {export_formats}"
        )
    except NotFound as e:
        _error(f"No dataset with name '{e.name}'")
Example #2
def dataset_convert(dataset_slug: str, format: str, output_dir: Optional[Union[str, Path]]):
    client = _load_client()
    parser = find_supported_format(format, darwin.exporter.formats.supported_formats)

    try:
        dataset = client.get_remote_dataset(dataset_identifier=dataset_slug)
        if not dataset.local_path.exists():
            _error(
                f"No annotations downloaded for dataset f{dataset}, first pull a release using "
                f"'darwin dataset pull {dataset_slug}'"
            )

        release_path = get_release_path(dataset.local_path)
        annotations_path = release_path / "annotations"
        if output_dir is None:
            output_dir = release_path / "other_formats" / format
        else:
            output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        exporter.export_annotations(parser, [annotations_path], output_dir)
    except NotFound as e:
        _error(f"No dataset with name '{e.name}'")
Example #3
    def __init__(
        self,
        dataset_path: Path,
        annotation_type: str,
        partition: Optional[str] = None,
        split: str = "default",
        split_type: str = "random",
        release_name: Optional[str] = None,
    ):
        """ Creates a dataset

        Parameters
        ----------
        dataset_path: Path
            Path to the location of the dataset on the file system
        annotation_type: str
            The type of annotation classes [tag, bounding_box, polygon]
        partition: str
            Selects one of the partitions [train, val, test]
        split: str
            Selects the split that defines the percentages used (use 'default' to select the default split)
        split_type: str
            Heuristic used to do the split [random, stratified]
        release_name: str
            Version of the dataset
        """
        assert dataset_path is not None
        release_path = get_release_path(dataset_path, release_name)
        annotations_dir = release_path / "annotations"
        assert annotations_dir.exists()
        images_dir = dataset_path / "images"
        assert images_dir.exists()

        if partition not in ["train", "val", "test", None]:
            raise ValueError("partition should be either 'train', 'val', 'test', or None")
        if split_type not in ["random", "stratified"]:
            raise ValueError("split_type should be either 'random' or 'stratified'")
        if annotation_type not in ["tag", "polygon", "bounding_box"]:
            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")

        self.dataset_path = dataset_path
        self.annotation_type = annotation_type
        self.images_path: List[Path] = []
        self.annotations_path: List[Path] = []
        self.original_classes = None
        self.original_images_path: Optional[List[Path]] = None
        self.original_annotations_path: Optional[List[Path]] = None

        # Get the list of classes
        self.classes = get_classes(
            self.dataset_path, release_name, annotation_type=self.annotation_type, remove_background=True
        )
        self.num_classes = len(self.classes)

        # Get the list of stems
        if partition:
            # Get the split
            if split_type == "random":
                split_file = f"{split_type}_{partition}.txt"
            elif split_type == "stratified":
                split_file = f"{split_type}_{annotation_type}_{partition}.txt"
            split_path = release_path / "lists" / split / split_file
            if split_path.is_file():
                stems = (e.strip() for e in split_path.open())
            else:
                raise FileNotFoundError(
                    f"could not find a dataset partition. "
                    f"Split the dataset using `split_dataset()` from `darwin.dataset.utils`"
                ) from None
        else:
            # If the partition is not specified, get all the annotations
            stems = [e.stem for e in annotations_dir.glob("*.json")]

        # Find all the annotations and their corresponding images
        for stem in stems:
            annotation_path = annotations_dir / f"{stem}.json"
            images = []
            for ext in SUPPORTED_IMAGE_EXTENSIONS:
                image_path = images_dir / f"{stem}{ext}"
                if image_path.exists():
                    images.append(image_path)
            if len(images) < 1:
                raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")
            if len(images) > 1:
                raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.")
            assert len(images) == 1
            self.images_path.append(images[0])
            self.annotations_path.append(annotation_path)

        if len(self.images_path) == 0:
            raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", f" in {images_dir}")

        assert len(self.images_path) == len(self.annotations_path)
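The snippet does not name the class this constructor belongs to; in darwin-py it matches LocalDataset, which is assumed below together with a placeholder dataset path.

from pathlib import Path

# Hypothetical instantiation: load the stratified train partition of the
# latest pulled release, keeping only polygon annotations.
dataset = LocalDataset(
    dataset_path=Path.home() / ".darwin" / "datasets" / "my-team" / "my-dataset",
    annotation_type="polygon",
    partition="train",
    split_type="stratified",
)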
Example #4
def it_uses_provided_version_name_otherwise(team_dataset_path: Path):
    test_release_path = team_dataset_path / "releases" / "test"
    test_release_path.mkdir(parents=True)
    assert get_release_path(team_dataset_path, "test") == test_release_path
Example #5
def it_defaults_to_latest_version_if_no_version_provided(team_dataset_path: Path):
    latest_release_path = team_dataset_path / "releases" / "latest"
    latest_release_path.mkdir(parents=True)
    assert get_release_path(team_dataset_path) == latest_release_path
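Examples #4 and #5 together pin down the contract of get_release_path: it resolves <dataset_path>/releases/<release_name>, falling back to "latest" when no name is given. A minimal sketch consistent with those two tests (get_release_path_sketch is a hypothetical name; the real function may also validate that the path exists, which the tests do not show):

from pathlib import Path
from typing import Optional

def get_release_path_sketch(dataset_path: Path, release_name: Optional[str] = None) -> Path:
    # Fall back to the "latest" release when no version is provided,
    # as asserted in Example #5.
    if not release_name:
        release_name = "latest"
    return dataset_path / "releases" / release_name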
Example #6
def split_dataset(
    dataset_path: PathLike,
    release_name: Optional[str] = None,
    val_percentage: float = 0.1,
    test_percentage: float = 0.2,
    split_seed: int = 0,
    make_default_split: bool = True,
    stratified_types: List[str] = ["bounding_box", "polygon", "tag"],
) -> Path:
    """
    Given a local dataset (pulled from Darwin), split it by creating lists of filenames.
    The partitions to split the dataset into are called train, val and test.

    The dataset is always split randomly, and can be additionally split according to the
    stratified strategy by providing a list of stratified types.

    Parameters
    ----------
    dataset_path : Path
        Local path to the dataset
    release_name : str
        Version of the dataset
    val_percentage : float
        Percentage of images used in the validation set
    test_percentage : float
        Percentage of images used in the test set
    split_seed : int
        Fix seed for random split creation
    make_default_split : bool
        Makes this split the default split
    stratified_types : List[str]
        List of annotation types to split with the stratified strategy

    Returns
    -------
    split_path : Path
        Path to the folder containing the generated split files
    """
    # Requirements: scikit-learn
    try:
        import sklearn  # noqa
    except ImportError:
        raise ImportError(
            "Darwin requires scikit-learn to split a dataset. Install it using: pip install scikit-learn"
        ) from None

    _validate_split(val_percentage, test_percentage)

    # Infer release path
    if isinstance(dataset_path, str):
        dataset_path = Path(dataset_path)
    release_path = get_release_path(dataset_path, release_name)

    # List all annotation files in release
    annotation_path = release_path / "annotations"
    assert annotation_path.exists()
    annotation_files = list(annotation_path.glob("**/*.json"))

    # Prepare the "lists" folder, which is where we are going to save the split files
    lists_path = release_path / "lists"
    lists_path.mkdir(parents=True, exist_ok=True)

    # Compute sizes of each dataset partition
    dataset_size: int = len(annotation_files)
    val_size: int = int(val_percentage * dataset_size)
    test_size: int = int(test_percentage * dataset_size)
    train_size: int = dataset_size - val_size - test_size
    split_id = f"{train_size}_{val_size}_{test_size}"

    # Compute split id, a combination of val precentage, test percentage and split seed
    # The split id is used to create a folder with the same name in the "lists" folder
    if split_seed != 0:
        split_id += f"_s{split_seed}"
    split_path = lists_path / split_id

    # Build a split paths dictionary. The split paths are indexed by strategy (e.g. random
    # or stratified), and by partition (train/val/test)
    split = _build_split(split_path, stratified_types)
    assert split.is_valid()

    # Do the actual splitting
    split_path.mkdir(exist_ok=True)

    if split.random:
        _random_split(
            annotation_path=annotation_path,
            annotation_files=annotation_files,
            split=split.random,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size,
            split_seed=split_seed,
        )

    if split.stratified:
        _stratified_split(
            annotation_path=annotation_path,
            split=split.stratified,
            annotation_files=annotation_files,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size,
            stratified_types=stratified_types,
            split_seed=split_seed,
        )

    # Create symlink for default split
    default_split_path = lists_path / "default"
    if make_default_split or not default_split_path.exists():
        if default_split_path.exists():
            default_split_path.unlink()
        default_split_path.symlink_to(f"./{split_id}")

    return split_path
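A hypothetical end-to-end call, assuming the dataset was already pulled with 'darwin dataset pull'; the local path is a placeholder:

from pathlib import Path

# 70/10/20 train/val/test split of the latest release, also made the default split.
split_path = split_dataset(
    Path.home() / ".darwin" / "datasets" / "my-team" / "my-dataset",
    val_percentage=0.1,
    test_percentage=0.2,
    make_default_split=True,
)
print(split_path)  # e.g. <dataset>/releases/latest/lists/700_100_200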