Example #1
def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
    """
    Converts dataset into COCO format and saves it to a json file.
    dataset_name must be registered in DatasetCatalog and in detectron2's standard format.

    Args:
        dataset_name:
            reference from the config file to the catalogs;
            must be registered in DatasetCatalog and in detectron2's standard format
        output_file: path of the json file to be saved
        allow_cached: if the json file is already present, skip the conversion
    """

    # TODO: The dataset or the conversion script *may* change;
    # a checksum would be useful for validating the cached data

    PathManager.mkdirs(os.path.dirname(output_file))
    with file_lock(output_file):
        if PathManager.exists(output_file) and allow_cached:
            logger.warning(
                f"Using previously cached COCO format annotations at '{output_file}'. "
                "You need to clear the cache file if your dataset has been modified."
            )
        else:
            logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...)")
            coco_dict = convert_to_coco_dict(dataset_name)

            logger.info(f"Caching COCO format annotations at '{output_file}' ...")
            tmp_file = output_file + ".tmp"
            with PathManager.open(tmp_file, "w") as f:
                json.dump(coco_dict, f)
            shutil.move(tmp_file, output_file)
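
A minimal call sketch for the function above, assuming it is the helper shipped in detectron2.data.datasets.coco; the dataset name and output path are placeholders:

from detectron2.data.datasets.coco import convert_to_coco_json

# "my_dataset_train" and the output path are hypothetical; the dataset must
# already be registered in DatasetCatalog with detectron2-standard annotations.
convert_to_coco_json(
    "my_dataset_train",
    output_file="/tmp/my_dataset_train_coco.json",
    allow_cached=True,
)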
Example #2
    def _get_local_path(
        self,
        path: str,
        force: bool = False,
        cache_dir: Optional[str] = None,
        **kwargs: Any,
    ) -> str:
        """
        This implementation downloads the remote resource from Google Drive and caches it locally.
        The resource will only be downloaded if not previously requested.
        """
        self._check_kwargs(kwargs)
        if (force or path not in self.cache_map
                or not os.path.exists(self.cache_map[path])):
            logger = logging.getLogger(__name__)
            dirname = get_cache_dir(cache_dir)

            response, filename = _get_response_from_google_drive(path)
            if len(filename) > self.MAX_FILENAME_LEN:
                filename = filename[:100] + "_" + uuid.uuid4().hex

            cached = os.path.join(dirname, filename)
            with file_lock(cached):
                if not os.path.isfile(cached):
                    logger.info("Downloading {} ...".format(path))
                    with open(cached, 'wb') as f:
                        for data in _stream_response(response):
                            f.write(data)
            logger.info("URL {} cached in {}".format(path, cached))
            self.cache_map[path] = cached
        return self.cache_map[path]
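
A handler method like the one above is normally exercised through a PathManager rather than called directly; the sketch below assumes an iopath-style PathManager, and GoogleDrivePathHandler is a hypothetical name for the class that owns this _get_local_path:

from iopath.common.file_io import PathManager

path_manager = PathManager()
# GoogleDrivePathHandler is a placeholder for the handler class shown above;
# it must also declare the URL prefixes it serves for registration to work.
path_manager.register_handler(GoogleDrivePathHandler())
local_file = path_manager.get_local_path("https://drive.google.com/uc?id=FILE_ID")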
Example #3
def cache_url(url: str, cache_dir: str) -> str:
    """
    This implementation downloads the remote resource and caches it locally.
    The resource will only be downloaded if not previously requested.
    """
    parsed_url = urlparse(url)
    dirname = os.path.join(cache_dir, os.path.dirname(parsed_url.path.lstrip("/")))
    makedir(dirname)
    filename = url.split("/")[-1]
    cached = os.path.join(dirname, filename)
    with file_lock(cached):
        if not os.path.isfile(cached):
            logging.info(f"Downloading {url} to {cached} ...")
            cached = download(url, dirname, filename=filename)
    logging.info(f"URL {url} cached in {cached}")
    return cached
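
A usage sketch for cache_url; the URL and cache directory are placeholders, and makedir, download, and file_lock are assumed to be the fvcore/iopath download utilities the snippet already relies on:

# Hypothetical URL and cache directory, for illustration only.
local_path = cache_url(
    "https://example.com/checkpoints/resnet50.pth",
    cache_dir="/tmp/model_cache",
)
# A second call with the same URL returns the cached copy without downloading.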
Example #4
    def _siamese_to_coco(self, siamese_json):
        assert self._output_dir
        save_json = os.path.join(self._output_dir, "siamese2coco.json")
        pm = PathManager()
        pm.mkdirs(os.path.dirname(save_json))
        with file_lock(save_json):
            if pm.exists(save_json):
                logger.warning(
                    f"Using previously cached COCO format annotations at '{save_json}'. "
                    "You need to clear the cache file if your dataset has been modified."
                )
            else:
                logger.info(
                    f"Converting annotations of dataset '{siamese_json}' to COCO format ..."
                )
                with pm.open(siamese_json, "r") as f:
                    siamese_data = json.load(f)
                coco_data = {"data": []}
                exist_imgid = set()
                for key, datas in siamese_data.items():
                    # copy 'info', 'categories'
                    if key != "data":
                        coco_data[key] = datas
                    else:
                        for data in datas:
                            for i in range(2):
                                img_data = data[str(i)]
                                if img_data["image_id"] in exist_imgid:
                                    continue
                                else:
                                    exist_imgid.add(img_data["image_id"])
                                    coco_data[key].append(img_data)
                self._logger.info(
                    f"Number of unique images: {len(exist_imgid)}.")
                coco_data = convert_to_coco_dict(coco_data["data"],
                                                 self._metadata)
                with pm.open(save_json, "w") as f:
                    json.dump(coco_data, f)
        return save_json
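
The loop above implies an input layout roughly like the following (a reconstruction from the access pattern, not a documented schema): top-level keys such as 'info' and 'categories' are copied through, while each entry under 'data' holds two per-image dicts keyed "0" and "1" that are de-duplicated by image_id:

# Assumed structure of the siamese_json file, inferred from the loop above.
siamese_data = {
    "info": {"description": "..."},
    "categories": [{"id": 1, "name": "..."}],
    "data": [
        {
            "0": {"image_id": 1, "file_name": "left.jpg", "annotations": []},
            "1": {"image_id": 2, "file_name": "right.jpg", "annotations": []},
        },
    ],
}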
Example #5
    def _get_local_path(
        self,
        path: str,
        force: bool = False,
        cache_dir: Optional[str] = None,
        **kwargs: Any,
    ) -> str:
        """
        As paddle model stores all files in tar files, we need to extract them
        and get the newly extracted folder path. This function rewrites the base
        function to support the following situations:

        1. If the tar file is not downloaded, it will download the tar file,
            extract it to the target folder, delete the downloaded tar file,
            and return the folder path.
        2. If the extracted target folder is present, and all the necessary model
            files are present (specified in _TAR_FILE_NAME_LIST), it will
            return the folder path.
        3. If the tar file is downloaded, but the extracted target folder is not
            present (or it doesn't contain the necessary files in _TAR_FILE_NAME_LIST),
            it will extract the tar file to the target folder, delete the tar file,
            and return the folder path.

        """
        self._check_kwargs(kwargs)
        if (force or path not in self.cache_map
                or not os.path.exists(self.cache_map[path])):
            logger = logging.getLogger(__name__)
            parsed_url = urlparse(path)
            dirname = os.path.join(
                get_cache_dir(cache_dir),
                os.path.dirname(parsed_url.path.lstrip("/")))
            filename = path.split("/")[-1]
            if len(filename) > self.MAX_FILENAME_LEN:
                filename = filename[:100] + "_" + uuid.uuid4().hex

            cached = os.path.join(dirname, filename)

            if is_cached_folder_exists_and_valid(cached):
                # When the cached folder exists and is valid, we don't need to
                # re-download the tar file.
                self.cache_map[path] = _get_untar_directory(cached)

            else:
                with file_lock(cached):
                    if not os.path.isfile(cached):
                        logger.info("Downloading {} ...".format(path))
                        cached = download(path, dirname, filename=filename)

                    if path.endswith(".tar"):
                        model_dir = _untar_model_weights(cached)
                        try:
                            os.remove(cached)  # remove the redundant tar file
                            # TODO: remove the .lock file.
                        except OSError:
                            logger.warning(
                                f"Not able to remove the cached tar file {cached}"
                            )

                logger.info("URL {} cached in {}".format(path, model_dir))
                self.cache_map[path] = model_dir

        return self.cache_map[path]
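
is_cached_folder_exists_and_valid is not shown in this snippet; from case 2 of the docstring it presumably checks that the extracted target folder exists and contains every file named in _TAR_FILE_NAME_LIST. A hedged reconstruction under that assumption:

import os

# Hypothetical reconstruction; the real helper and _TAR_FILE_NAME_LIST live in
# the surrounding module and may differ in details such as the folder naming.
def is_cached_folder_exists_and_valid(cached_tar_path: str) -> bool:
    extracted_dir = os.path.splitext(cached_tar_path)[0]  # assumed target folder name
    if not os.path.isdir(extracted_dir):
        return False
    return all(
        os.path.isfile(os.path.join(extracted_dir, name))
        for name in _TAR_FILE_NAME_LIST
    )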