Esempio n. 1
0
    def __init__(self,
                 root: Path,
                 split: Path,
                 transform: Optional[List] = None):
        """Create a dataset from a root folder and a split file.

        Parameters
        ----------
        root : Path
            Path to the location of the dataset on the file system. Images are
            looked up in ``root / "images"`` and annotations are expected at
            ``root / "annotations" / "<stem>.json"``.
        split : Path
            Path to the *.txt file containing the list of files for this split
            (one file stem per line).
        transform : list[torchvision.transforms]
            List of PyTorch transforms. A list is wrapped in ``Compose``; any
            other value (including ``None``) is stored unchanged.

        Raises
        ------
        FileNotFoundError
            If the split file does not exist.
        ValueError
            If a stem listed in the split has no image with an allowed
            extension, or if the split does not list any file.
        """
        self.root = root
        self.split = split
        # Compose the transform if necessary (only plain lists are wrapped)
        self.transform = Compose(transform) if isinstance(transform, list) else transform
        self.images_path: List[Path] = []
        self.annotations_path: List[Path] = []
        self.classes = None
        self.original_classes = None
        self.original_images_path: Optional[List[Path]] = None
        self.original_annotations_path: Optional[List[Path]] = None
        self.convert_polygons: Optional[Callable] = None

        # Populate internal lists of annotations and images paths
        if not self.split.exists():
            raise FileNotFoundError(
                f"Could not find partition file: {self.split}")
        # Read the split eagerly inside a context manager so that the file
        # handle is released immediately instead of being left open.
        with self.split.open() as split_file:
            stems = [line.strip() for line in split_file]

        # Single directory listing: stem -> extension of the matching image
        image_extensions_mapping = {
            image.stem: image.suffix
            for image in self.root.glob("images/*")
            if is_image_extension_allowed(image.suffix)
        }
        for stem in stems:
            annotation_path = self.root / f"annotations/{stem}.json"
            try:
                extension = image_extensions_mapping[stem]
            except KeyError:
                # The KeyError is an implementation detail of the lookup, hide it
                raise ValueError(
                    f"Annotation ({annotation_path}) does not have a corresponding image"
                ) from None
            self.images_path.append(self.root / f"images/{stem}{extension}")
            self.annotations_path.append(annotation_path)

        if not self.images_path:
            raise ValueError(
                f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file"
                f" in {self.root / 'images'}")

        # Both lists are filled in lockstep above, this is a pure sanity check
        assert len(self.images_path) == len(self.annotations_path)
Esempio n. 2
0
def sign_upload(client: "Client", image_id: int, key: str, file_path: Path,
                team: str):
    """Requests a signed upload URL from the backend, so that the file can be
    uploaded without holding storage credentials.

    Parameters
    ----------
    client: Client
        Client authenticated to the team where the put request will be made
    image_id: int
        Id of the image (or video) to upload
    key: str
        Path in the s3 bucket
    file_path: Path
        Path to the file to upload on the file system
    team: str
        Team forwarded to ``client.post``

    Returns
    -------
    dict
        Dictionary which contains the server response. ``None`` is returned
        when the file extension is neither an allowed image nor an allowed
        video extension (no request is made in that case).
    """
    extension = file_path.suffix

    # Pick the endpoint and the content type family from the file extension;
    # images take precedence over videos.
    if is_image_extension_allowed(extension):
        endpoint = f"/dataset_images/{image_id}/sign_upload?key={key}"
        content_type = f"image/{extension}"
    elif is_video_extension_allowed(extension):
        endpoint = f"/dataset_videos/{image_id}/sign_upload?key={key}"
        content_type = f"video/{extension}"
    else:
        return None

    payload = {"filePath": str(file_path), "contentType": content_type}
    return client.post(endpoint=endpoint, payload=payload, team=team)
Esempio n. 3
0
def _split_on_file_type(files: List[Path]):
    """Partitions a list of files into images and videos, looking only at the
    file extension.

    Parameters
    ----------
    files : list[Path]
        List of files to split according to their type

    Returns
    -------
    images, videos : list[Path]
        List of image and videos, respectively (input order is preserved)

    Raises
    ------
    UnsupportedFileType
        On the first file whose extension is neither an allowed image nor an
        allowed video extension
    """
    images: List[Path] = []
    videos: List[Path] = []
    for candidate in files:
        extension = candidate.suffix
        # Image extensions are checked first, then video extensions
        if is_image_extension_allowed(extension):
            bucket = images
        elif is_video_extension_allowed(extension):
            bucket = videos
        else:
            raise UnsupportedFileType(candidate)
        bucket.append(candidate)
    return images, videos
Esempio n. 4
0
def download_all_images_from_annotations(
    api_key: str,
    api_url: str,
    annotations_path: Path,
    images_path: Path,
    force_replace: bool = False,
    remove_extra: bool = False,
    annotation_format: str = "json",
    use_folders: bool = False,
    video_frames: bool = False,
) -> Tuple[Callable[[], Iterator[Any]], int]:
    """Helper function: prepares the download of all the images corresponding to a project.

    Nothing is downloaded by this function itself: it returns a factory of
    ``functools.partial`` objects, each of which wraps one call to
    ``download_image_from_annotation``.

    Parameters
    ----------
    api_key : str
        API Key of the current team
    api_url : str
        Url of the darwin API (e.g. 'https://darwin.v7labs.com/api/')
    annotations_path : Path
        Path where the annotations are located
    images_path : Path
        Path where to download the images (created if missing; its parent
        must already exist)
    force_replace: bool
        Forces the re-download of an existing image
    remove_extra: bool
        Removes existing images for which there is not corresponding annotation
    annotation_format : str
        Format of the annotations. Currently only JSON and xml are expected.
        NOTE: every annotation file is parsed with ``json.load`` regardless
        of this value.
    use_folders: bool
        Recreate folders
    video_frames: bool
        Pulls video frames images instead of video files

    Returns
    -------
    generator : function
        Generator for doing the actual downloads,
    count : int
        The files count

    Raises
    ------
    ValueError
        If ``annotation_format`` is neither "json" nor "xml"
    """
    Path(images_path).mkdir(exist_ok=True)
    if annotation_format not in ["json", "xml"]:
        raise ValueError(
            f"Annotation format {annotation_format} not supported")

    # Index the images already present in the images folder by their stem
    unfiltered_files = images_path.rglob("*") if use_folders else images_path.glob("*")
    existing_images = {
        image.stem: image
        for image in unfiltered_files
        if is_image_extension_allowed(image.suffix)
    }

    # List the annotation folder once; the result is reused by `remove_extra`
    annotation_files = list(annotations_path.glob(f"*.{annotation_format}"))

    annotations_to_download_path = []
    for annotation_path in annotation_files:
        with annotation_path.open() as file:
            annotation = json.load(file)
        # Check collisions on image filename, original_filename and json filename on the system.
        # The `or` chain short-circuits, so later keys are only read when needed.
        if not force_replace and (
            sanitize_filename(Path(annotation["image"]["filename"]).stem) in existing_images
            or sanitize_filename(Path(annotation["image"]["original_filename"]).stem) in existing_images
            or sanitize_filename(annotation_path.stem) in existing_images
        ):
            continue
        annotations_to_download_path.append(annotation_path)

    if remove_extra:
        # Removes existing images for which there is not corresponding annotation.
        # A set keeps each membership test O(1) instead of scanning a list.
        annotation_stems = {annotation_file.stem for annotation_file in annotation_files}
        for existing_image in existing_images.values():
            if existing_image.stem not in annotation_stems:
                print(
                    f"Removing {existing_image} as there is no corresponding annotation"
                )
                existing_image.unlink()

    # Create the generator with the partial functions
    count = len(annotations_to_download_path)

    def generator():
        # A fresh generator is built on every call so it can be consumed repeatedly
        return (functools.partial(
            download_image_from_annotation,
            api_key,
            api_url,
            annotation_path,
            images_path,
            annotation_format,
            use_folders,
            video_frames,
        ) for annotation_path in annotations_to_download_path)

    return generator, count
Esempio n. 5
0
 def it_returns_false_for_unknown_extensions():
     """A suffix that is not a known image extension must be rejected."""
     unknown_suffix = ".not_an_image"
     assert not is_image_extension_allowed(unknown_suffix)
Esempio n. 6
0
 def it_returns_true_for_allowed_extensions():
     """The ".png" suffix must be accepted as an image extension."""
     allowed_suffix = ".png"
     assert is_image_extension_allowed(allowed_suffix)
Esempio n. 7
0
def get_annotations(
    dataset,
    partition: str,
    split: str = "split",
    split_type: str = "stratified",
    annotation_type: str = "polygon",
):
    """
    Returns the polygon annotations of a given dataset and split as a list of
    per-image records

    Parameters
    ----------
    dataset
        Path (``Path`` or ``str``) to the location of the dataset on the file
        system, or an object exposing that location as ``local_path``
    partition
        Selects one of the partitions [train, val, test]
    split
        Selects the split that defines the percentages used (use 'split' to select the default split)
    split_type
        Heuristic used to do the split [random, stratified]
    annotation_type
        The type of annotation classes [tag, polygon]

    Returns
    -------
    list[dict]
        One record per image of the split, with the keys ``file_name``,
        ``height``, ``width``, ``image_id`` and ``annotations``. Each entry of
        ``annotations`` holds ``bbox``, ``bbox_mode``, ``segmentation``,
        ``category_id`` and ``iscrowd``. Objects without a "polygon" key, or
        whose polygon has fewer than 3 points, are skipped.

    Raises
    ------
    ValueError
        If ``partition``, ``split_type`` or ``annotation_type`` is not one of
        the accepted values, if a stem of the split has no image or several
        images with an allowed extension, or if the split is empty
    """
    assert dataset is not None
    if isinstance(dataset, (Path, str)):
        dataset_path = Path(dataset)
    else:
        dataset_path = dataset.local_path

    if partition not in ["train", "val", "test"]:
        raise ValueError(
            "partition should be either 'train', 'val', or 'test'")
    if split_type not in ["random", "stratified"]:
        raise ValueError(
            "split_type should be either 'random' or 'stratified'")
    if annotation_type not in ["tag", "polygon"]:
        raise ValueError("annotation_type should be either 'tag' or 'polygon'")

    # Get the list of classes
    classes = get_classes(dataset,
                          annotation_type=annotation_type,
                          remove_background=True)

    # Get the split
    if split_type == "random":
        split_file = f"{split_type}_{partition}.txt"
    else:  # "stratified", the only other value accepted by the check above
        split_file = f"{split_type}_{annotation_type}_{partition}.txt"
    split_path = dataset_path / "lists" / split / split_file
    # Read the stems eagerly so that the file handle is closed right away
    with split_path.open() as split_handle:
        stems = [line.strip() for line in split_handle]

    images_path = []
    annotations_path = []

    # Find all the annotations and their corresponding images
    for stem in stems:
        annotation_path = dataset_path / f"annotations/{stem}.json"
        images = [
            image for image in dataset_path.glob(f"images/{stem}.*")
            if is_image_extension_allowed(image.suffix)
        ]
        if len(images) < 1:
            raise ValueError(f"Annotation ({annotation_path}) does"
                             f" not have a corresponding image")
        if len(images) > 1:
            raise ValueError(
                f"Image ({stem}) is present with multiple extensions."
                f" This is forbidden.")
        images_path.append(images[0])
        annotations_path.append(annotation_path)

    if not images_path:
        raise ValueError(
            f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file"
            f" in {dataset_path / 'images'}")

    assert len(images_path) == len(annotations_path)

    # detectron2 is optional: without it a plain 0 is stored as the bbox mode.
    # The value does not depend on the annotation, so it is resolved once.
    try:
        from detectron2.structures import BoxMode
    except ImportError:
        box_mode = 0
    else:
        box_mode = BoxMode.XYXY_ABS

    # Load and re-format all the annotations
    dataset_dicts = []
    for image_id, (im_path, annot_path) in enumerate(zip(images_path, annotations_path)):
        with annot_path.open() as f:
            data = json.load(f)

        height, width = data["image"]["height"], data["image"]["width"]
        annotations = data["annotations"]

        objs = []
        for obj in annotations:
            if "polygon" not in obj:
                continue
            path = obj["polygon"]["path"]
            px = [point["x"] for point in path]
            py = [point["y"] for point in path]
            if len(px) < 3:  # Discard polygons with less than 3 points
                continue
            # Flatten [(x0, y0), (x1, y1), ...] into [x0, y0, x1, y1, ...]
            poly = list(itertools.chain.from_iterable(zip(px, py)))

            category_id = classes.index(obj["name"])

            objs.append({
                # Axis-aligned bounding box of the polygon: [x_min, y_min, x_max, y_max]
                "bbox": [np.min(px),
                         np.min(py),
                         np.max(px),
                         np.max(py)],
                "bbox_mode": box_mode,
                "segmentation": [poly],
                "category_id": category_id,
                "iscrowd": 0,
            })

        dataset_dicts.append({
            "file_name": str(im_path),
            "height": height,
            "width": width,
            "image_id": image_id,
            "annotations": objs,
        })
    return dataset_dicts