コード例 #1
0
    def __init__(
        self,
        download_dir: str,
        extract_dir: Optional[str] = None,
        manual_dir: Optional[str] = None,
        manual_dir_instructions: Optional[str] = None,
        dataset_name: Optional[str] = None,
        force_download: bool = False,
        force_extraction: bool = False,
        force_checksums_validation: bool = False,
        register_checksums: bool = False,
    ):
        """Download manager constructor.

        Args:
          download_dir: Path to directory where downloads are stored.
          extract_dir: Path to directory where artifacts are extracted.
          manual_dir: Path to manually downloaded/extracted data directory.
          manual_dir_instructions: Human readable instructions on how to
            prepare contents of the manual_dir for this dataset.
          dataset_name: Name of dataset this instance will be used for. If
            provided, downloads are tagged with the dataset they serve.
          force_download: If True, always [re]download.
          force_extraction: If True, always [re]extract.
          force_checksums_validation: If True, raise an error when a URL is
            missing its checksum.
          register_checksums: If True, downloaded checksums are recorded to
            file instead of being validated.
        """
        self._dataset_name = dataset_name
        # Expand '~' in the user-provided paths. Extraction defaults to an
        # 'extracted' sub-directory of the download directory.
        self._download_dir = os.path.expanduser(download_dir)
        if extract_dir:
            self._extract_dir = os.path.expanduser(extract_dir)
        else:
            self._extract_dir = os.path.expanduser(
                os.path.join(download_dir, 'extracted'))
        # Left as-is (None/empty) when no manual dir was supplied.
        self._manual_dir = os.path.expanduser(manual_dir) if manual_dir else manual_dir
        self._manual_dir_instructions = manual_dir_instructions
        tf.io.gfile.makedirs(self._download_dir)
        tf.io.gfile.makedirs(self._extract_dir)
        self._force_download = force_download
        self._force_extraction = force_extraction
        self._force_checksums_validation = force_checksums_validation
        self._register_checksums = register_checksums
        # Every URL with a known checksum: {url: (size, checksum)}
        self._url_infos = checksums.get_all_url_infos()
        # URLs actually used during this session: {url: (size, checksum)}
        self._recorded_url_infos = {}
        # Lazily created: these must be dropped when this object is pickled
        # for Beam, then rebuilt on each worker.
        self.__downloader = None
        self.__extractor = None
        # Dedicated single-thread executor for the file I/O surrounding
        # downloads (reading/renaming the downloaded file), so that work never
        # blocks concurrent downloads/extractions. A single thread suffices
        # because those ops are serialized by `build_synchronize_decorator`.
        # This thread is in addition to the download and extraction executors'
        # own threads.
        self._executor = concurrent.futures.ThreadPoolExecutor(1)
コード例 #2
0
    def __init__(
        self,
        *,
        download_dir: str,
        extract_dir: Optional[str] = None,
        manual_dir: Optional[str] = None,
        manual_dir_instructions: Optional[str] = None,
        url_infos: Optional[Dict[str, checksums.UrlInfo]] = None,
        dataset_name: Optional[str] = None,
        force_download: bool = False,
        force_extraction: bool = False,
        force_checksums_validation: bool = False,
        register_checksums: bool = False,
        register_checksums_path: Optional[type_utils.PathLike] = None,
        verify_ssl: bool = True,
    ):
        """Download manager constructor.

    Args:
      download_dir: Path to directory where downloads are stored.
      extract_dir: Path to directory where artifacts are extracted.
      manual_dir: Path to manually downloaded/extracted data directory.
      manual_dir_instructions: Human readable instructions on how to
        prepare contents of the manual_dir for this dataset.
      url_infos: Urls info for the checksums.
      dataset_name: Name of dataset this instance will be used for. If
        provided, downloads are tagged with the dataset they were used for.
      force_download: If True, always [re]download.
      force_extraction: If True, always [re]extract.
      force_checksums_validation: If True, raises an error if a URL does not
        have a checksum.
      register_checksums: If True, downloaded checksums aren't
        checked, but are stored into a file.
      register_checksums_path: Path where to save checksums. Should be set
        if register_checksums is True.
      verify_ssl: `bool`, defaults to True. If True, will verify certificate
        when downloading dataset.

    Raises:
      ValueError: If register_checksums is True but register_checksums_path
        is not set.
      FileNotFoundError: If register_checksums_path does not exist and cannot
        be created (e.g. its parent directory is missing).
    """
        if register_checksums and not register_checksums_path:
            raise ValueError(
                'When register_checksums=True, register_checksums_path should be set.'
            )
        # TODO(tfds): Should use `path = tfds.core.Path(path)`
        if isinstance(register_checksums_path, str):
            register_checksums_path = pathlib.Path(register_checksums_path)
        # No-op at runtime; this only tells the type checker that from here on
        # the value is a path object (or None).
        register_checksums_path = typing.cast(type_utils.ReadOnlyPath,
                                              register_checksums_path)
        if register_checksums_path and not register_checksums_path.exists():
            # Create the file here to make sure user has write access before starting
            # downloads.
            register_checksums_path.touch()

        # Expand '~' in the user-provided paths; extraction defaults to an
        # 'extracted' sub-directory of the download directory.
        self._download_dir = os.path.expanduser(download_dir)
        self._extract_dir = os.path.expanduser(
            extract_dir or os.path.join(download_dir, 'extracted'))
        # Stays None when no manual_dir was given.
        self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
        self._manual_dir_instructions = manual_dir_instructions
        tf.io.gfile.makedirs(self._download_dir)
        tf.io.gfile.makedirs(self._extract_dir)
        self._force_download = force_download
        self._force_extraction = force_extraction
        self._force_checksums_validation = force_checksums_validation
        self._register_checksums = register_checksums
        self._register_checksums_path = register_checksums_path
        self._verify_ssl = verify_ssl
        self._dataset_name = dataset_name

        # All known URLs: {url: UrlInfo(size=, checksum=)}, optionally extended
        # with the caller-supplied entries.
        self._url_infos = checksums.get_all_url_infos()
        if url_infos is not None:
            self._url_infos.update(url_infos)

        # To record what is being used: {url: (size, checksum)}
        self._recorded_url_infos = {}
        # These attributes are lazy-initialized since they must be cleared when this
        # object is pickled for Beam. They are then recreated on each worker.
        self.__downloader = None
        self.__extractor = None
        # Executor to avoid blocking other download/extractions when running I/O
        # operations (reading/renaming download file).
        # Only use a single thread as the read/ops are locked by the
        # `build_synchronize_decorator`.
        # Note: This thread is in addition to the download and extraction
        # executors' threads.
        self._executor = concurrent.futures.ThreadPoolExecutor(1)
コード例 #3
0
    def __init__(
        self,
        *,
        download_dir: epath.PathLike,
        extract_dir: Optional[epath.PathLike] = None,
        manual_dir: Optional[epath.PathLike] = None,
        manual_dir_instructions: Optional[str] = None,
        url_infos: Optional[Dict[str, checksums.UrlInfo]] = None,
        dataset_name: Optional[str] = None,
        force_download: bool = False,
        force_extraction: bool = False,
        force_checksums_validation: bool = False,
        register_checksums: bool = False,
        register_checksums_path: Optional[epath.PathLike] = None,
        verify_ssl: bool = True,
    ):
        """Download manager constructor.

        Args:
          download_dir: Path to directory where downloads are stored.
          extract_dir: Path to directory where artifacts are extracted.
          manual_dir: Path to manually downloaded/extracted data directory.
          manual_dir_instructions: Human readable instructions on how to
            prepare contents of the manual_dir for this dataset.
          url_infos: Urls info for the checksums.
          dataset_name: Name of dataset this instance will be used for. If
            provided, downloads are tagged with the dataset they serve.
          force_download: If True, always [re]download.
          force_extraction: If True, always [re]extract.
          force_checksums_validation: If True, raise an error when a URL has
            no registered checksum.
          register_checksums: If True, downloaded checksums are recorded to
            `register_checksums_path` instead of being validated.
          register_checksums_path: Path where to save checksums. Must be set
            when register_checksums is True.
          verify_ssl: `bool`, defaults to True. If True, will verify
            certificate when downloading dataset.

        Raises:
          ValueError: If register_checksums is True without
            register_checksums_path.
          FileNotFoundError: If register_checksums_path cannot be created
            (e.g. its parent directory does not exist).
        """
        if register_checksums:
            if not register_checksums_path:
                raise ValueError(
                    'When register_checksums=True, register_checksums_path should be set.'
                )
            register_checksums_path = epath.Path(register_checksums_path)
            if register_checksums_path.exists():
                # Rewrite the existing content unchanged: this surfaces missing
                # write access up-front (e.g. TFDS installed by an admin).
                register_checksums_path.write_text(
                    register_checksums_path.read_text())
            else:
                # Create the file now to make sure the user has write access
                # before any download starts.
                register_checksums_path.touch()

        # Normalize the directories; extraction defaults to an 'extracted'
        # sub-directory of the download directory.
        download_dir = epath.Path(download_dir).expanduser()
        extract_dir = (
            epath.Path(extract_dir).expanduser()
            if extract_dir else download_dir / 'extracted')
        if manual_dir:
            manual_dir = epath.Path(manual_dir).expanduser()

        self._download_dir: epath.Path = download_dir
        self._extract_dir: epath.Path = extract_dir
        self._manual_dir: Optional[epath.Path] = manual_dir  # pytype: disable=annotation-type-mismatch  # attribute-variable-annotations
        self._manual_dir_instructions = utils.dedent(manual_dir_instructions)
        for directory in (self._download_dir, self._extract_dir):
            directory.mkdir(parents=True, exist_ok=True)

        self._force_download = force_download
        self._force_extraction = force_extraction
        self._force_checksums_validation = force_checksums_validation
        self._register_checksums = register_checksums
        self._register_checksums_path = register_checksums_path
        self._verify_ssl = verify_ssl
        self._dataset_name = dataset_name

        # Known checksums, possibly extended with caller-supplied entries:
        # {url: UrlInfo(size=, checksum=)}
        self._url_infos = checksums.get_all_url_infos()
        if url_infos is not None:
            self._url_infos.update(url_infos)

        # Checksums recorded during this session:
        # {url: UrlInfo(size, checksum, filename)}
        self._recorded_url_infos: Dict[str, checksums.UrlInfo] = {}
        # Lazily created: these must be dropped when this object is pickled
        # for Beam, then rebuilt on each worker.
        self.__downloader = None
        self.__extractor = None
        # Dedicated single-thread executor for the file I/O surrounding
        # downloads (reading/renaming the downloaded file), so that work never
        # blocks concurrent downloads/extractions. One thread suffices because
        # those ops are serialized by `build_synchronize_decorator`. This
        # thread is in addition to the download and extraction executors'
        # own threads.
        self._executor = concurrent.futures.ThreadPoolExecutor(1)