def _log_download_bytes(self):
    """Prints the download/generation sizes before preparation starts.

    Print is intentional: we want this to always go to stdout so user has
    information needed to cancel download/preparation if needed.
    This comes right before the progress bar.
    """
    download = units.size_str(self.info.download_size)
    generated = units.size_str(self.info.dataset_size)
    total = units.size_str(self.info.download_size + self.info.dataset_size)
    msg = (
        "Downloading and preparing dataset {} (download: {}, generated: {}, "
        "total: {}) to {}...".format(
            self.info.full_name, download, generated, total, self._data_dir))
    termcolor.cprint(msg, attrs=["bold"])
def _log_download_bytes(self):
    """Prints the total dataset size before download/extraction starts.

    Print is intentional: we want this to always go to stdout so user has
    information needed to cancel download/preparation if needed.
    This comes right before the progress bar.
    """
    termcolor.cprint(
        "Downloading / extracting dataset %s (%s) to %s..."
        % (self.name, units.size_str(self.info.size_in_bytes), self._data_dir),
        attrs=["bold"])
def download_and_prepare(self, download_dir=None, download_config=None):
    """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
    """
    download_config = download_config or download.DownloadConfig()
    data_exists = tf.io.gfile.exists(self._data_dir)
    # Fast path: data already generated and the caller allows reuse.
    if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
      logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
      return
    dl_manager = self._make_download_manager(
        download_dir=download_dir,
        download_config=download_config)

    # Currently it's not possible to overwrite the data because it would
    # conflict with versioning: If the last version has already been generated,
    # it will always be reloaded and data_dir will be set at construction.
    if data_exists:
      raise ValueError(
          "Trying to overwrite an existing dataset {} at {}. A dataset with "
          "the same version {} already exists. If the dataset has changed, "
          "please update the version number.".format(
              self.name, self._data_dir, self.version))
    logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
    # Fail fast before any download if the target filesystem is too small.
    if not utils.has_sufficient_disk_space(self.info.size_in_bytes,
                                           directory=self._data_dir_root):
      raise IOError("Not enough disk space. Needed: %s" %
                    units.size_str(self.info.size_in_bytes))
    self._log_download_bytes()

    # Create a tmp dir and rename to self._data_dir on successful exit.
    # An exception during generation leaves no partial data at the final path.
    with file_format_adapter.incomplete_dir(self._data_dir) as tmp_data_dir:
      # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
      # it to every sub function.
      with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
        # Prefer the pre-generated copy hosted on GCS when available and
        # allowed by the config; this skips local generation entirely.
        if (download_config.try_download_gcs and
            gcs_utils.is_dataset_on_gcs(self.info.full_name)):
          logging.warning(GCS_HOSTED_MSG, self.name)
          gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
          self.info.read_from_directory(self._data_dir)
        else:
          self._download_and_prepare(
              dl_manager=dl_manager,
              download_config=download_config)

          # NOTE: If modifying the lines below to put additional information in
          # DatasetInfo, you'll likely also want to update
          # DatasetInfo.read_from_directory to possibly restore these attributes
          # when reading from package data.

          # Update DatasetInfo metadata by computing statistics from the data.
          # NOTE: `and` binds tighter than `or`, so this skips when the mode is
          # SKIP, or when the mode is AUTO and stats were already computed.
          if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
              download_config.compute_stats == download.ComputeStatsMode.AUTO
              and bool(self.info.splits.total_num_examples)):
            logging.info("Skipping computing stats for mode %s.",
                         download_config.compute_stats)
          else:  # Mode is forced or stats do not exists yet
            logging.info("Computing statistics.")
            self.info.compute_dynamic_properties()
          self.info.size_in_bytes = dl_manager.downloaded_size
          # Write DatasetInfo to disk, even if we haven't computed statistics.
          self.info.write_to_directory(self._data_dir)
    self._log_download_done()
def download_and_prepare(self, download_dir=None, download_config=None):
    """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
    """
    download_config = download_config or download.DownloadConfig()
    data_exists = tf.io.gfile.exists(self._data_dir)
    # Fast path: data already generated and the caller allows reuse.
    if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
      logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
      return

    # Disable `download_and_prepare` (internally, we are still
    # allowing Py2 for the `dataset_builder_tests.py` & cie
    if _is_py2_download_and_prepare_disabled and six.PY2:
      raise NotImplementedError(
          "TFDS has dropped `builder.download_and_prepare` support for "
          "Python 2. Please update your code to Python 3.")

    # Versions that require older TFDS code can only be read, not generated.
    if self.version.tfds_version_to_prepare:
      available_to_prepare = ", ".join(str(v) for v in self.versions
                                       if not v.tfds_version_to_prepare)
      raise AssertionError(
          "The version of the dataset you are trying to use ({}:{}) can only "
          "be generated using TFDS code synced @ {} or earlier. Either sync to "
          "that version of TFDS to first prepare the data or use another "
          "version of the dataset (available for `download_and_prepare`: "
          "{}).".format(
              self.name, self.version, self.version.tfds_version_to_prepare,
              available_to_prepare))

    # Only `cls.VERSION` or `experimental_latest` versions can be generated.
    # Otherwise, users may accidentally generate an old version using the
    # code from newer versions.
    installable_versions = {
        str(v) for v in (self.canonical_version, max(self.versions))
    }
    if str(self.version) not in installable_versions:
      msg = (
          "The version of the dataset you are trying to use ({}) is too "
          "old for this version of TFDS so cannot be generated.").format(
              self.info.full_name)
      # NOTE(review): this branch appears unreachable — a truthy
      # `tfds_version_to_prepare` already raised AssertionError above.
      # Confirm before removing.
      if self.version.tfds_version_to_prepare:
        msg += (
            "{} can only be generated using TFDS code synced @ {} or earlier "
            "Either sync to that version of TFDS to first prepare the data or "
            "use another version of the dataset. ").format(
                self.version, self.version.tfds_version_to_prepare)
      else:
        msg += (
            "Either sync to a previous version of TFDS to first prepare the "
            "data or use another version of the dataset. ")
      msg += "Available for `download_and_prepare`: {}".format(
          list(sorted(installable_versions)))
      raise ValueError(msg)

    # Currently it's not possible to overwrite the data because it would
    # conflict with versioning: If the last version has already been generated,
    # it will always be reloaded and data_dir will be set at construction.
    if data_exists:
      raise ValueError(
          "Trying to overwrite an existing dataset {} at {}. A dataset with "
          "the same version {} already exists. If the dataset has changed, "
          "please update the version number.".format(
              self.name, self._data_dir, self.version))

    logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
    # Fail fast before any download if the target filesystem is too small for
    # both the raw downloads and the generated dataset.
    if not utils.has_sufficient_disk_space(
        self.info.dataset_size + self.info.download_size,
        directory=self._data_dir_root):
      raise IOError(
          "Not enough disk space. Needed: {} (download: {}, generated: {})"
          .format(
              units.size_str(self.info.dataset_size + self.info.download_size),
              units.size_str(self.info.download_size),
              units.size_str(self.info.dataset_size),
          ))
    self._log_download_bytes()

    dl_manager = self._make_download_manager(
        download_dir=download_dir,
        download_config=download_config)

    # Create a tmp dir and rename to self._data_dir on successful exit.
    # An exception during generation leaves no partial data at the final path.
    with utils.incomplete_dir(self._data_dir) as tmp_data_dir:
      # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
      # it to every sub function.
      with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
        # Prefer the pre-generated copy hosted on GCS when available and
        # allowed by the config; this skips local generation entirely.
        if (download_config.try_download_gcs and
            gcs_utils.is_dataset_on_gcs(self.info.full_name)):
          logging.warning(GCS_HOSTED_MSG, self.name)
          gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
          self.info.read_from_directory(self._data_dir)
        else:
          self._download_and_prepare(
              dl_manager=dl_manager,
              download_config=download_config)

          # NOTE: If modifying the lines below to put additional information in
          # DatasetInfo, you'll likely also want to update
          # DatasetInfo.read_from_directory to possibly restore these attributes
          # when reading from package data.

          # Skip statistics computation if tfdv isn't present
          try:
            import tensorflow_data_validation  # pylint: disable=g-import-not-at-top,import-outside-toplevel,unused-import  # pytype: disable=import-error
            skip_stats_computation = False
          except ImportError:
            skip_stats_computation = True

          splits = list(self.info.splits.values())
          statistics_already_computed = bool(
              splits and splits[0].statistics.num_examples)
          # Update DatasetInfo metadata by computing statistics from the data.
          # NOTE: `and` binds tighter than `or`, so this skips when tfdv is
          # missing, the mode is SKIP, or the mode is AUTO with stats present.
          if (skip_stats_computation or
              download_config.compute_stats == download.ComputeStatsMode.SKIP or
              download_config.compute_stats == download.ComputeStatsMode.AUTO
              and statistics_already_computed):
            logging.info("Skipping computing stats for mode %s.",
                         download_config.compute_stats)
          else:  # Mode is forced or stats do not exists yet
            logging.info("Computing statistics.")
            self.info.compute_dynamic_properties()
          self.info.download_size = dl_manager.downloaded_size
          # Write DatasetInfo to disk, even if we haven't computed statistics.
          self.info.write_to_directory(self._data_dir)
    self._log_download_done()
def test_bytes(self):
    """Sizes below one KiB are rendered with the `bytes` suffix."""
    actual = units.size_str(150)
    self.assertEqual("150 bytes", actual)
def test_normal_sizes(self):
    """Every binary prefix is formatted with two decimal places."""
    cases = [
        ("1.50 PiB", units.PiB),
        ("1.50 TiB", units.TiB),
        ("1.50 GiB", units.GiB),
        ("1.50 MiB", units.MiB),
        ("1.50 KiB", units.KiB),
    ]
    for expected, unit in cases:
      self.assertEqual(expected, units.size_str(1.5 * unit))
def test_none(self):
    """A missing size renders as a human-readable placeholder."""
    actual = units.size_str(None)
    self.assertEqual("Unknown size", actual)
def test_none(self):
    """A missing size renders as the `?? GiB` placeholder."""
    actual = units.size_str(None)
    self.assertEqual("?? GiB", actual)