Beispiel #1
0
 def special_characters_are_replaced_with_underscores():
     """Each of the characters < > " / \\ | ? * must come back as an underscore."""
     expected = "2020-06-18T08_50_13.14815Z.json"
     # Same characters, in the same order, as the timestamp separators under test.
     for special in ("<", ">", '"', "/", "\\", "|", "?", "*"):
         raw_name = f"2020-06-18T08{special}50{special}13.14815Z.json"
         assert sanitize_filename(raw_name) == expected
Beispiel #2
0
def download_image_from_json_annotation(api_key: str, api_url: str,
                                        annotation_path: Path,
                                        image_path: Path, use_folders: bool,
                                        video_frames: bool) -> None:
    """
    Helper function: downloads the image (or the video frames) referenced
    by a .json annotation file.

    Parameters
    ----------
    api_key : str
        API Key of the current team, forwarded to ``download_image``
    api_url : str
        Url of the darwin API (e.g. 'https://darwin.v7labs.com/api/').
        Not used by this function.
    annotation_path : Path
        Path where the annotation is located
    image_path : Path
        Root folder where to download the image
    use_folders: bool
        Recreate, below ``image_path``, the folder stored under the
        annotation's ``image.path`` key
    video_frames: bool
        Pulls video frames images instead of video files, when the
        annotation lists ``frame_urls``
    """
    with annotation_path.open() as handle:
        annotation = json.load(handle)

    # The remote folder is only honoured when folders are requested;
    # otherwise everything lands directly in the root image folder.
    if use_folders:
        remote_folder = annotation["image"].get("path", "/")
    else:
        remote_folder = "/"

    # Strip the anchor (leading "/") so the folder nests under image_path
    # instead of replacing it when the two are joined.
    root_dir = Path(image_path)
    folder = Path(remote_folder)
    target_dir = root_dir / folder.relative_to(folder.anchor)
    target_dir.mkdir(exist_ok=True, parents=True)

    image_info = annotation["image"]
    if not (video_frames and "frame_urls" in image_info):
        # Single file download, stored under its sanitized filename.
        url = image_info["url"]
        destination = target_dir / sanitize_filename(image_info["filename"])
        download_image(url, destination, api_key)
        return

    # One sub-folder per video (named after the annotation file), holding
    # one zero-padded, sequentially numbered PNG per frame.
    frames_dir: Path = target_dir / annotation_path.stem
    frames_dir.mkdir(exist_ok=True, parents=True)
    for index, frame_url in enumerate(image_info["frame_urls"]):
        download_image(frame_url, frames_dir / f"{index:07d}.png", api_key)
Beispiel #3
0
def download_all_images_from_annotations(
    api_key: str,
    api_url: str,
    annotations_path: Path,
    images_path: Path,
    force_replace: bool = False,
    remove_extra: bool = False,
    annotation_format: str = "json",
    use_folders: bool = False,
    video_frames: bool = False,
) -> Tuple[Callable[[], Iterator[Any]], int]:
    """Helper function: prepares the download of all images of a project.

    Nothing is downloaded here: the returned callable produces one
    ``functools.partial`` per annotation, and running those partials
    performs the actual downloads.

    Parameters
    ----------
    api_key : str
        API Key of the current team
    api_url : str
        Url of the darwin API (e.g. 'https://darwin.v7labs.com/api/')
    annotations_path : Path
        Path where the annotations are located
    images_path : Path
        Path where to download the images (created if missing; its parent
        must already exist)
    force_replace: bool
        Forces the re-download of an existing image
    remove_extra: bool
        Removes existing images for which there is not corresponding annotation
    annotation_format : str
        Format of the annotations. Only "json" and "xml" are accepted.
        NOTE(review): the files are always parsed with ``json.load``, so
        "xml" files presumably fail to parse - confirm intended support.
    use_folders: bool
        Recreate folders (existing images are then searched recursively)
    video_frames: bool
        Pulls video frames images instead of video files

    Returns
    -------
    generator : function
        Zero-argument callable returning a generator of partial functions
        doing the actual downloads,
    count : int
        The files count

    Raises
    ------
    ValueError
        If ``annotation_format`` is neither "json" nor "xml"
    """
    Path(images_path).mkdir(exist_ok=True)
    if annotation_format not in ["json", "xml"]:
        raise ValueError(
            f"Annotation format {annotation_format} not supported")

    # Index the images already on disk by stem so that collisions can be
    # detected with a dictionary lookup.
    unfiltered_files = (images_path.rglob("*")
                        if use_folders else images_path.glob("*"))
    existing_images = {
        image.stem: image
        for image in unfiltered_files
        if is_image_extension_allowed(image.suffix)
    }

    # List the annotation files once; both the collision check and the
    # clean-up below work on this same listing.
    annotation_paths = list(annotations_path.glob(f"*.{annotation_format}"))

    annotations_to_download_path = []
    for annotation_path in annotation_paths:
        with annotation_path.open() as file:
            annotation = json.load(file)
        if not force_replace:
            # Check collisions on image filename, original_filename and json filename on the system
            candidate_stems = (
                Path(annotation["image"]["filename"]).stem,
                Path(annotation["image"]["original_filename"]).stem,
                annotation_path.stem,
            )
            if any(
                    sanitize_filename(stem) in existing_images
                    for stem in candidate_stems):
                continue
        annotations_to_download_path.append(annotation_path)

    if remove_extra:
        # Removes existing images for which there is not corresponding annotation.
        # A set keeps each membership test O(1) instead of scanning a list
        # once per existing image.
        annotation_stems = {a.stem for a in annotation_paths}
        for existing_image in existing_images.values():
            if existing_image.stem not in annotation_stems:
                print(
                    f"Removing {existing_image} as there is no corresponding annotation"
                )
                existing_image.unlink()

    # Create the generator with the partial functions
    count = len(annotations_to_download_path)
    generator = lambda: (functools.partial(
        download_image_from_annotation,
        api_key,
        api_url,
        annotation_path,
        images_path,
        annotation_format,
        use_folders,
        video_frames,
    ) for annotation_path in annotations_to_download_path)
    return generator, count
Beispiel #4
0
 def avoid_replacing_columns_on_non_windows(mock: MagicMock):
     """Colons in a filename must be kept as-is, and the injected mock used once.

     NOTE(review): ``mock`` presumably patches the OS detection used by
     ``sanitize_filename``; the decorator is outside this view - confirm.
     """
     name_with_colons = "2020-06-18T08:50:13.14815Z.json"
     sanitized = sanitize_filename(name_with_colons)
     assert sanitized == "2020-06-18T08:50:13.14815Z.json"
     mock.assert_called_once()
Beispiel #5
0
 def normal_filenames_stay_untouched():
     """A filename without special characters is returned unchanged."""
     plain_name = "test.jpg"
     assert sanitize_filename(plain_name) == plain_name
Beispiel #6
0
    def pull(
        self,
        *,
        release: Optional[Release] = None,
        blocking: bool = True,
        multi_threaded: bool = True,
        only_annotations: bool = False,
        force_replace: bool = False,
        remove_extra: bool = False,
        subset_filter_annotations_function: Optional[Callable] = None,
        subset_folder_name: Optional[str] = None,
        use_folders: bool = False,
        video_frames: bool = False,
    ) -> Tuple[Optional[Callable[[], Iterator[Any]]], int]:
        """
        Downloads a remote dataset (images and annotations) to the datasets directory.

        Parameters
        ----------
        release: Release
            The release to pull. If None, ``self.get_release()`` is used
        blocking : bool
            If False, the dataset is not downloaded and a generator function is returned instead
        multi_threaded : bool
            Uses multiprocessing to download the dataset in parallel. If blocking is False this has no effect.
        only_annotations: bool
            Download only the annotations and no corresponding images
        force_replace: bool
            Forces the re-download of an existing image
        remove_extra: bool
            Removes existing images for which there is not corresponding annotation
        subset_filter_annotations_function: Callable
            This function receives the directory where the annotations are downloaded and can
            perform any operation on them i.e. filtering them with custom rules or else.
            If it needs to receive other parameters is advised to use functools.partial() for it.
        subset_folder_name: str
            Name of the folder with the subset of the dataset. If not provided (and a filtering
            function is given) a timestamp of the form ``YYYY-MM-DD_HH-MM-SS`` is used.
        use_folders: bool
            Recreates folders from the dataset
        video_frames: bool
            Pulls video frames images instead of video files

        Returns
        -------
        generator : function
            Generator for doing the actual downloads. This is None if blocking is True,
            if only_annotations is True, or if there is nothing to download
        count : int
            The files count

        Raises
        ------
        UnsupportedExportFormat
            If the release format is not "json"
        ValueError
            If the team configuration cannot be retrieved
        """
        if release is None:
            release = self.get_release()

        if release.format != "json":
            raise UnsupportedExportFormat(release.format)

        release_dir = self.local_releases_path / release.name
        release_dir.mkdir(parents=True, exist_ok=True)

        with tempfile.TemporaryDirectory() as tmp_dir_str:
            tmp_dir = Path(tmp_dir_str)
            # Download the release from Darwin
            zip_file_path = release.download_zip(tmp_dir / "dataset.zip")
            with zipfile.ZipFile(zip_file_path) as z:
                # Extract annotations
                z.extractall(tmp_dir)
                # If a filtering function is provided, apply it
                if subset_filter_annotations_function is not None:
                    subset_filter_annotations_function(tmp_dir)
                    if subset_folder_name is None:
                        # The timestamp becomes a single path component, so it
                        # must not contain "/" (which would create nested
                        # month/day/year folders) nor ":" (not allowed in
                        # Windows file names).
                        subset_folder_name = datetime.now().strftime(
                            "%Y-%m-%d_%H-%M-%S")
                annotations_dir: Path = release_dir / (subset_folder_name
                                                       or "") / "annotations"
                # Remove existing annotations if necessary
                if annotations_dir.exists():
                    try:
                        shutil.rmtree(annotations_dir)
                    except PermissionError:
                        print(
                            f"Could not remove dataset in {annotations_dir}. Permission denied."
                        )
                # exist_ok=False: if the old folder could not be removed above,
                # this raises FileExistsError rather than mixing old and new
                # annotations.
                annotations_dir.mkdir(parents=True, exist_ok=False)
                # Move the annotations into the right folder and rename them to have the image
                # original filename as contained in the json
                for annotation_path in tmp_dir.glob("*.json"):
                    with annotation_path.open() as file:
                        annotation = json.load(file)
                    filename = sanitize_filename(
                        Path(annotation["image"]["filename"]).stem)
                    destination_name = annotations_dir / f"{filename}{annotation_path.suffix}"
                    shutil.move(str(annotation_path), str(destination_name))

        # Extract the list of classes and create the text files
        make_class_lists(release_dir)

        if release.latest and is_unix_like_os():
            # Best effort: re-point the "latest" symlink to this release.
            try:
                latest_dir: Path = self.local_releases_path / "latest"
                if latest_dir.is_symlink():
                    latest_dir.unlink()

                target_link: Path = self.local_releases_path / release_dir.name
                latest_dir.symlink_to(target_link)
            except OSError:
                self.console.log(
                    f"Could not mark release {release.name} as latest. Continuing..."
                )

        if only_annotations:
            # No images will be downloaded
            return None, 0

        team_config: Optional[Team] = self.client.config.get_team(self.team)
        if not team_config:
            raise ValueError("Unable to get Team configuration.")

        api_key = team_config.api_key

        # Create the generator with the download instructions
        progress, count = download_all_images_from_annotations(
            api_key=api_key,
            api_url=self.client.url,
            annotations_path=annotations_dir,
            images_path=self.local_images_path,
            force_replace=force_replace,
            remove_extra=remove_extra,
            use_folders=use_folders,
            video_frames=video_frames,
        )
        if count == 0:
            return None, count

        # If blocking is selected, download the dataset on the file system
        if blocking:
            exhaust_generator(progress=progress(),
                              count=count,
                              multi_threaded=multi_threaded)
            return None, count

        # Non-blocking: hand the download instructions back to the caller
        return progress, count