def __init__(
    self,
    download_dir: str,
    extract_dir: Optional[str] = None,
    manual_dir: Optional[str] = None,
    manual_dir_instructions: Optional[str] = None,
    dataset_name: Optional[str] = None,
    force_download: bool = False,
    force_extraction: bool = False,
    force_checksums_validation: bool = False,
    register_checksums: bool = False,
):
  """Download manager constructor.

  Args:
    download_dir: Path to directory where downloads are stored.
    extract_dir: Path to directory where artifacts are extracted.
    manual_dir: Path to manually downloaded/extracted data directory.
    manual_dir_instructions: Human readable instructions on how to prepare
      contents of the manual_dir for this dataset.
    dataset_name: Name of dataset this instance will be used for. If
      provided, downloads will contain which datasets they were used for.
    force_download: If True, always [re]download.
    force_extraction: If True, always [re]extract.
    force_checksums_validation: If True, raises an error if a URL does not
      have registered checksums.
    register_checksums: If True, download checksums aren't checked, but are
      recorded so they can be stored into a file.
  """
  self._dataset_name = dataset_name
  self._download_dir = os.path.expanduser(download_dir)
  self._extract_dir = os.path.expanduser(
      extract_dir or os.path.join(download_dir, 'extracted'))
  # `manual_dir and ...` keeps None when no manual dir was given.
  self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
  self._manual_dir_instructions = manual_dir_instructions
  tf.io.gfile.makedirs(self._download_dir)
  tf.io.gfile.makedirs(self._extract_dir)
  self._force_download = force_download
  self._force_extraction = force_extraction
  self._force_checksums_validation = force_checksums_validation
  self._register_checksums = register_checksums

  # All known URLs: {url: (size, checksum)}
  self._url_infos = checksums.get_all_url_infos()
  # To record what is being used: {url: (size, checksum)}
  self._recorded_url_infos = {}

  # These attributes are lazy-initialized since they must be cleared when this
  # object is pickled for Beam. They are then recreated on each worker.
  self.__downloader = None
  self.__extractor = None
  # Executor to avoid blocking other download/extractions when running I/O
  # operations (reading/renaming download file).
  # Only use a single thread as the read/ops are locked by the
  # `build_synchronize_decorator`.
  # Note: This thread is in addition to the download and extraction
  # executors' threads.
  self._executor = concurrent.futures.ThreadPoolExecutor(1)
def __init__(
    self,
    *,
    download_dir: str,
    extract_dir: Optional[str] = None,
    manual_dir: Optional[str] = None,
    manual_dir_instructions: Optional[str] = None,
    url_infos: Optional[Dict[str, checksums.UrlInfo]] = None,
    dataset_name: Optional[str] = None,
    force_download: bool = False,
    force_extraction: bool = False,
    force_checksums_validation: bool = False,
    register_checksums: bool = False,
    register_checksums_path: Optional[type_utils.PathLike] = None,
    verify_ssl: bool = True,
):
  """Download manager constructor.

  Args:
    download_dir: Path to directory where downloads are stored.
    extract_dir: Path to directory where artifacts are extracted.
    manual_dir: Path to manually downloaded/extracted data directory.
    manual_dir_instructions: Human readable instructions on how to prepare
      contents of the manual_dir for this dataset.
    url_infos: Urls info for the checksums.
    dataset_name: Name of dataset this instance will be used for. If
      provided, downloads will contain which datasets they were used for.
    force_download: If True, always [re]download.
    force_extraction: If True, always [re]extract.
    force_checksums_validation: If True, raises an error if a URL does not
      have registered checksums.
    register_checksums: If True, download checksums aren't checked, but are
      recorded so they can be stored into a file.
    register_checksums_path: Path where to save checksums. Should be set if
      register_checksums is True.
    verify_ssl: `bool`, defaults to True. If True, will verify certificate
      when downloading dataset.

  Raises:
    ValueError: If register_checksums is True but register_checksums_path
      was not set. (A non-existing register_checksums_path is not an error:
      the file is created below.)
  """
  if register_checksums and not register_checksums_path:
    raise ValueError(
        'When register_checksums=True, register_checksums_path should be set.'
    )
  # TODO(tfds): Should use `path = tfds.core.Path(path)`
  if isinstance(register_checksums_path, str):
    register_checksums_path = pathlib.Path(register_checksums_path)
  register_checksums_path = typing.cast(type_utils.ReadOnlyPath,
                                        register_checksums_path)
  if register_checksums_path and not register_checksums_path.exists():
    # Create the file here to make sure user has write access before starting
    # downloads.
    register_checksums_path.touch()

  self._download_dir = os.path.expanduser(download_dir)
  self._extract_dir = os.path.expanduser(
      extract_dir or os.path.join(download_dir, 'extracted'))
  # `manual_dir and ...` keeps None when no manual dir was given.
  self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
  self._manual_dir_instructions = manual_dir_instructions
  tf.io.gfile.makedirs(self._download_dir)
  tf.io.gfile.makedirs(self._extract_dir)
  self._force_download = force_download
  self._force_extraction = force_extraction
  self._force_checksums_validation = force_checksums_validation
  self._register_checksums = register_checksums
  self._register_checksums_path = register_checksums_path
  self._verify_ssl = verify_ssl
  self._dataset_name = dataset_name

  # All known URLs: {url: UrlInfo(size=, checksum=)}
  self._url_infos = checksums.get_all_url_infos()
  if url_infos is not None:
    self._url_infos.update(url_infos)

  # To record what is being used: {url: (size, checksum)}
  self._recorded_url_infos = {}

  # These attributes are lazy-initialized since they must be cleared when this
  # object is pickled for Beam. They are then recreated on each worker.
  self.__downloader = None
  self.__extractor = None
  # Executor to avoid blocking other download/extractions when running I/O
  # operations (reading/renaming download file).
  # Only use a single thread as the read/ops are locked by the
  # `build_synchronize_decorator`.
  # Note: This thread is in addition to the download and extraction
  # executors' threads.
  self._executor = concurrent.futures.ThreadPoolExecutor(1)
def __init__(
    self,
    *,
    download_dir: epath.PathLike,
    extract_dir: Optional[epath.PathLike] = None,
    manual_dir: Optional[epath.PathLike] = None,
    manual_dir_instructions: Optional[str] = None,
    url_infos: Optional[Dict[str, checksums.UrlInfo]] = None,
    dataset_name: Optional[str] = None,
    force_download: bool = False,
    force_extraction: bool = False,
    force_checksums_validation: bool = False,
    register_checksums: bool = False,
    register_checksums_path: Optional[epath.PathLike] = None,
    verify_ssl: bool = True,
):
  """Download manager constructor.

  Args:
    download_dir: Path to directory where downloads are stored.
    extract_dir: Path to directory where artifacts are extracted.
    manual_dir: Path to manually downloaded/extracted data directory.
    manual_dir_instructions: Human readable instructions on how to prepare
      contents of the manual_dir for this dataset.
    url_infos: Urls info for the checksums.
    dataset_name: Name of dataset this instance will be used for. If
      provided, downloads will contain which datasets they were used for.
    force_download: If True, always [re]download.
    force_extraction: If True, always [re]extract.
    force_checksums_validation: If True, raises an error if a URL does not
      have registered checksums.
    register_checksums: If True, download checksums aren't checked, but are
      recorded so they can be stored into a file.
    register_checksums_path: Path where to save checksums. Should be set if
      register_checksums is True.
    verify_ssl: `bool`, defaults to True. If True, will verify certificate
      when downloading dataset.

  Raises:
    ValueError: If register_checksums is True but register_checksums_path
      was not set. (A non-existing register_checksums_path is not an error:
      the file is created below.)
  """
  if register_checksums:
    if not register_checksums_path:
      raise ValueError(
          'When register_checksums=True, register_checksums_path should be set.'
      )
    register_checksums_path = epath.Path(register_checksums_path)
    if not register_checksums_path.exists():
      # Create the file here to make sure user has write access before
      # starting downloads.
      register_checksums_path.touch()
    else:
      # Make sure the user has write access before downloading any files.
      # (e.g. TFDS installed by admin)
      register_checksums_path.write_text(register_checksums_path.read_text())

  download_dir = epath.Path(download_dir).expanduser()
  if extract_dir:
    extract_dir = epath.Path(extract_dir).expanduser()
  else:
    extract_dir = download_dir / 'extracted'
  if manual_dir:
    manual_dir = epath.Path(manual_dir).expanduser()

  self._download_dir: epath.Path = download_dir
  self._extract_dir: epath.Path = extract_dir
  self._manual_dir: Optional[epath.Path] = manual_dir  # pytype: disable=annotation-type-mismatch  # attribute-variable-annotations
  self._manual_dir_instructions = utils.dedent(manual_dir_instructions)
  self._download_dir.mkdir(parents=True, exist_ok=True)
  self._extract_dir.mkdir(parents=True, exist_ok=True)

  self._force_download = force_download
  self._force_extraction = force_extraction
  self._force_checksums_validation = force_checksums_validation
  self._register_checksums = register_checksums
  self._register_checksums_path = register_checksums_path
  self._verify_ssl = verify_ssl
  self._dataset_name = dataset_name

  # All known URLs: {url: UrlInfo(size=, checksum=)}
  self._url_infos = checksums.get_all_url_infos()
  if url_infos is not None:
    self._url_infos.update(url_infos)

  # To record what is being used: {url: UrlInfo(size, checksum, filename)}
  self._recorded_url_infos: Dict[str, checksums.UrlInfo] = {}

  # These attributes are lazy-initialized since they must be cleared when this
  # object is pickled for Beam. They are then recreated on each worker.
  self.__downloader = None
  self.__extractor = None
  # Executor to avoid blocking other download/extractions when running I/O
  # operations (reading/renaming download file).
  # Only use a single thread as the read/ops are locked by the
  # `build_synchronize_decorator`.
  # Note: This thread is in addition to the download and extraction
  # executors' threads.
  self._executor = concurrent.futures.ThreadPoolExecutor(1)