def download_and_prepare(self, cache_dir=None, dl_manager=None):
  """Downloads and prepares dataset for reading.

  Subclasses must override _download_and_prepare.

  Args:
    cache_dir: (str) Cached directory where to extract the data. If None, a
      default tmp directory will be used.
    dl_manager: (`tfds.download.DownloadManager`) DownloadManager to use. Only
      one of dl_manager and cache_dir can be set.

  Raises:
    ValueError: If the user defines both cache_dir and dl_manager.
  """
  # Both args are set: the two configuration paths are mutually exclusive.
  if cache_dir and dl_manager is not None:
    raise ValueError(
        "Only one of dl_manager and cache_dir can be defined.")
  # None are set. Use the data_dir as cache_dir
  if not cache_dir and dl_manager is None:
    cache_dir = os.path.join(self._data_dir_root, "tmp")

  # Create the download manager
  if cache_dir:
    dl_manager = download.DownloadManager(cache_dir=cache_dir)

  # If the dataset already exists (data_dir not empty) and that we do not
  # overwrite the dataset
  if (self._data_dir and
      dl_manager.mode == download.GenerateMode.REUSE_DATASET_IF_EXISTS):
    tf.logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  # Otherwise, create a new version in a new data_dir.
  curr_date = datetime.datetime.now()
  version_str = curr_date.strftime("v_%Y%m%d_%H%M")
  data_dir = self._get_data_dir(version=version_str)
  self._data_dir = None
  tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

  # Print is intentional: we want this to always go to stdout so user has
  # information needed to cancel download/preparation if needed.
  # This comes right before the progress bar.
  # BUG FIX: the original `"%s GB" % self.SIZE or "?"` parsed as
  # `("%s GB" % self.SIZE) or "?"`. The formatted string is always truthy,
  # so an unknown size printed as "None GB" and the "?" fallback was dead.
  # The fallback must apply to the size value, not the formatted string.
  size_text = termcolor.colored("%s GB" % (self.SIZE or "?"), attrs=["bold"])
  termcolor.cprint("Downloading / extracting dataset %s (%s) to %s..."
                   % (self.name, size_text, data_dir))

  # Wrap the Dataset generation in a .incomplete directory so a partial
  # build is never mistaken for a finished dataset.
  with file_format_adapter.incomplete_dir(data_dir) as data_dir_tmp:
    self._download_and_prepare(dl_manager=dl_manager, data_dir=data_dir_tmp)

  # Update the DatasetInfo metadata (splits info, num samples,...)
  self._data_dir = data_dir
  self.info.update_from_metadata_dir(self._data_dir)
def download_and_prepare(self, cache_dir=None, dl_manager=None):
  """Downloads and prepares dataset for reading.

  Subclasses must override _download_and_prepare.

  Args:
    cache_dir (str): Cached directory where to extract the data. If None, a
      default tmp directory will be used.
    dl_manager (DownloadManager): DownloadManager to use. Only one of
      dl_manager and cache_dir can be set.

  Raises:
    ValueError: If the user defines both cache_dir and dl_manager.
  """
  # The two download-configuration paths are mutually exclusive.
  if cache_dir and dl_manager is not None:
    raise ValueError(
        "Only one of dl_manager and cache_dir can be defined.")

  # Neither was given: default the cache to a tmp dir under the data root.
  if not cache_dir and dl_manager is None:
    cache_dir = os.path.join(self._data_dir_root, "tmp")

  # A cache_dir (given or defaulted) implies building our own manager.
  if cache_dir:
    dl_manager = download.DownloadManager(cache_dir=cache_dir)

  # Reuse an already-generated dataset unless overwriting was requested.
  should_reuse = (
      self._data_dir and
      dl_manager.mode == download.GenerateMode.REUSE_DATASET_IF_EXISTS)
  if should_reuse:
    tf.logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  # Otherwise generate a fresh, timestamp-versioned data_dir.
  version_tag = datetime.datetime.now().strftime("v_%Y%m%d_%H%M")
  new_data_dir = self._get_data_dir(version=version_tag)
  tf.logging.info("Generating dataset %s (%s)", self.name, new_data_dir)

  # Build inside a ".incomplete" directory that is renamed on success.
  with file_format_adapter.incomplete_dir(new_data_dir) as tmp_dir:
    # TODO(epot): Data_dir should be an argument of download_and_prepare.
    # Modify this once a better split API exists.
    self._data_dir = tmp_dir
    self._download_and_prepare(dl_manager)
  self._data_dir = new_data_dir
def download_and_prepare(self, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Args:
    download_dir: `str`, directory where downloaded files are stored.
      Defaults to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing dataset.

  Raises:
    ValueError: If the dataset already exists at the target data_dir and the
      download mode does not allow reuse.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if (data_exists and
      download_config.download_mode == REUSE_DATASET_IF_EXISTS):
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(
            self.name, self._data_dir, self.info.version))
  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  self._log_download_bytes()

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with file_format_adapter.incomplete_dir(
      self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      self._download_and_prepare(
          dl_manager=dl_manager,
          max_examples_per_split=download_config.max_examples_per_split)

      # NOTE: If modifying the lines below to put additional information in
      # DatasetInfo, you'll likely also want to update
      # DatasetInfo.read_from_directory to possibly restore these attributes
      # when reading from package data.

      # Update the DatasetInfo metadata by computing statistics from the data.
      if download_config.compute_stats:
        already_has_stats = bool(self.info.splits.total_num_examples)
        if already_has_stats:
          logging.info("Skipping computing stats because they are already "
                       "populated.")
        else:
          self.info.compute_dynamic_properties()

      # Set checksums for all files downloaded
      self.info.download_checksums = dl_manager.recorded_download_checksums
      # Set size of all files downloaded.
      # FIX: iterate the dict's values view directly instead of building a
      # throwaway list from .items() and discarding every key.
      self.info.size_in_bytes = sum(dl_manager.download_sizes.values())

      # Write DatasetInfo to disk, even if we haven't computed the statistics.
      self.info.write_to_directory(self._data_dir)
def download_and_prepare(self, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Args:
    download_dir: `str`, directory where downloaded files are stored.
      Defaults to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing dataset.

  Raises:
    IOError: if there is not enough disk space available.
    ValueError: if the dataset already exists at the target data_dir and the
      download mode does not allow reuse.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(
            self.name, self._data_dir, self.version))

  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  # Fail fast before downloading anything if the disk cannot hold the
  # expected dataset size.
  if not utils.has_sufficient_disk_space(self.info.size_in_bytes,
                                         directory=self._data_dir_root):
    raise IOError("Not enough disk space. Needed: %s" %
                  units.size_str(self.info.size_in_bytes))
  self._log_download_bytes()

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with file_format_adapter.incomplete_dir(
      self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      if (download_config.try_download_gcs and
          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
        # A prepared copy is hosted on GCS: fetch it instead of generating.
        logging.warning(GCS_HOSTED_MSG, self.name)
        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
        self.info.read_from_directory(self._data_dir)
      else:
        self._download_and_prepare(dl_manager=dl_manager,
                                   download_config=download_config)

        # NOTE: If modifying the lines below to put additional information in
        # DatasetInfo, you'll likely also want to update
        # DatasetInfo.read_from_directory to possibly restore these attributes
        # when reading from package data.

        # Update DatasetInfo metadata by computing statistics from the data.
        # NOTE(review): this condition relies on `and` binding tighter than
        # `or`: it reads as SKIP or (AUTO and stats-already-present).
        if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
            download_config.compute_stats == download.ComputeStatsMode.AUTO
            and bool(self.info.splits.total_num_examples)):
          logging.info("Skipping computing stats for mode %s.",
                       download_config.compute_stats)
        else:  # Mode is forced or stats do not exists yet
          logging.info("Computing statistics.")
          self.info.compute_dynamic_properties()
        self.info.size_in_bytes = dl_manager.downloaded_size

        # Write DatasetInfo to disk, even if we haven't computed statistics.
        self.info.write_to_directory(self._data_dir)
  self._log_download_done()
def download_and_prepare(self, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Args:
    download_dir: `str`, directory where downloaded files are stored.
      Defaults to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing dataset.

  Raises:
    IOError: if there is not enough disk space available.
    AssertionError: if the requested version can only be generated by an
      older TFDS codebase.
    ValueError: if the requested version is not generatable by this code, or
      the dataset already exists at the target data_dir.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  # Versions pinned to an older TFDS codebase cannot be generated here.
  if self.version.tfds_version_to_prepare:
    available_to_prepare = ", ".join(str(v) for v in self.versions
                                     if not v.tfds_version_to_prepare)
    raise AssertionError(
        "The version of the dataset you are trying to use ({}:{}) can only "
        "be generated using TFDS code synced @ {} or earlier. Either sync to "
        "that version of TFDS to first prepare the data or use another "
        "version of the dataset (available for `download_and_prepare`: "
        "{}).".format(
            self.name, self.version, self.version.tfds_version_to_prepare,
            available_to_prepare))

  # Only `cls.VERSION` or `experimental_latest` versions can be generated.
  # Otherwise, users may accidentally generate an old version using the
  # code from newer versions.
  installable_versions = {
      str(v) for v in (self.canonical_version, max(self.versions))
  }
  if str(self.version) not in installable_versions:
    msg = (
        "The version of the dataset you are trying to use ({}) is too "
        "old for this version of TFDS so cannot be generated."
    ).format(self.info.full_name)
    # NOTE(review): this branch looks unreachable — any truthy
    # tfds_version_to_prepare already raised AssertionError above. Confirm
    # before relying on (or removing) it.
    if self.version.tfds_version_to_prepare:
      msg += (
          "{} can only be generated using TFDS code synced @ {} or earlier "
          "Either sync to that version of TFDS to first prepare the data or "
          "use another version of the dataset. ").format(
              self.version, self.version.tfds_version_to_prepare)
    else:
      msg += (
          "Either sync to a previous version of TFDS to first prepare the "
          "data or use another version of the dataset. "
      )
    msg += "Available for `download_and_prepare`: {}".format(
        list(sorted(installable_versions)))
    raise ValueError(msg)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(self.name, self._data_dir,
                                                   self.version))

  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  # Fail fast before downloading anything if the disk cannot hold both the
  # downloads and the generated dataset.
  if not utils.has_sufficient_disk_space(
      self.info.dataset_size + self.info.download_size,
      directory=self._data_dir_root):
    raise IOError(
        "Not enough disk space. Needed: {} (download: {}, generated: {})"
        .format(
            units.size_str(self.info.dataset_size + self.info.download_size),
            units.size_str(self.info.download_size),
            units.size_str(self.info.dataset_size),
        ))
  self._log_download_bytes()

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config)

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with file_format_adapter.incomplete_dir(self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      if (download_config.try_download_gcs and
          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
        # A prepared copy is hosted on GCS: fetch it instead of generating.
        logging.warning(GCS_HOSTED_MSG, self.name)
        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
        self.info.read_from_directory(self._data_dir)
      else:
        self._download_and_prepare(
            dl_manager=dl_manager,
            download_config=download_config)

        # NOTE: If modifying the lines below to put additional information in
        # DatasetInfo, you'll likely also want to update
        # DatasetInfo.read_from_directory to possibly restore these attributes
        # when reading from package data.
        splits = list(self.info.splits.values())
        statistics_already_computed = bool(
            splits and splits[0].statistics.num_examples)
        # Update DatasetInfo metadata by computing statistics from the data.
        # NOTE(review): this condition relies on `and` binding tighter than
        # `or`: it reads as SKIP or (AUTO and stats-already-computed).
        if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
            download_config.compute_stats == download.ComputeStatsMode.AUTO
            and statistics_already_computed
           ):
          logging.info(
              "Skipping computing stats for mode %s.",
              download_config.compute_stats)
        else:  # Mode is forced or stats do not exists yet
          logging.info("Computing statistics.")
          self.info.compute_dynamic_properties()
        self.info.download_size = dl_manager.downloaded_size

        # Write DatasetInfo to disk, even if we haven't computed statistics.
        self.info.write_to_directory(self._data_dir)
  self._log_download_done()