def test_force_stats(self):
  # Test when stats already exist but compute_stats='force'.
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    # Stats are already populated (restored from the mocked GCS dataset_info).
    builder = testing.DummyMnist(data_dir=tmp_dir)
    self.assertEqual(builder.info.splits.total_num_examples, 40)
    self.assertFalse(self.compute_dynamic_property.called)

    download_config = download.DownloadConfig(
        compute_stats=download.ComputeStatsMode.FORCE,
    )
    builder.download_and_prepare(download_config=download_config)

    # Statistics should have been recomputed.
    self.assertTrue(self.compute_dynamic_property.called)
def test_stats_not_restored_gcs_overwritten(self):
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    # If the splits differ from the restored ones, stats should be recomputed.
    builder = testing.DummyMnist(data_dir=tmp_dir)
    self.assertEqual(builder.info.splits["train"].statistics.num_examples, 20)
    self.assertFalse(self.compute_dynamic_property.called)

    dl_config = download.DownloadConfig(
        max_examples_per_split=5,
        compute_stats=download.ComputeStatsMode.AUTO,
    )
    builder.download_and_prepare(download_config=dl_config)

    # Statistics should have been recomputed (splits differ from the restored
    # ones).
    self.assertTrue(self.compute_dynamic_property.called)
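# --- Usage sketch (illustration, not part of the test suite above) ---
# A minimal, hedged example of how the ComputeStatsMode values exercised by
# these tests would typically be passed by an end user. Assumes a TFDS version
# that still exposes ComputeStatsMode (as in the code above); "mnist" is only
# an illustrative dataset name.
import tensorflow_datasets as tfds

dl_config = tfds.download.DownloadConfig(
    # AUTO reuses already-restored statistics, FORCE always recomputes them,
    # SKIP never computes them.
    compute_stats=tfds.download.ComputeStatsMode.AUTO,
)
tfds.builder("mnist").download_and_prepare(download_config=dl_config)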
def _download_and_prepare_as_dataset(self, builder):
  with absltest.mock.patch.multiple(
      "tensorflow_datasets.core.download.DownloadManager",
      download_and_extract=self._get_dl_extract_result,
      download=self._get_dl_extract_result,
      manual_dir=self.example_dir,
  ):
    if isinstance(builder, dataset_builder.BeamBasedBuilder):
      # TODO(b/129148632): The current apache-beam 2.11.0 does not work with
      # Py3. Update once the new version is out (around April).
      skip_beam_test = bool(six.PY3)
      if skip_beam_test:
        return
      import apache_beam as beam  # pylint: disable=g-import-not-at-top
      # For Beam datasets, set up the runner config.
      beam_runner = None
      beam_options = beam.options.pipeline_options.PipelineOptions()
    else:
      beam_runner = None
      beam_options = None

    # Force statistics computation so that the split info used by the
    # assertions below is populated.
    download_config = download.DownloadConfig(
        compute_stats=download.ComputeStatsMode.FORCE,
        beam_runner=beam_runner,
        beam_options=beam_options,
    )
    builder.download_and_prepare(download_config=download_config)

  with self._subTest("as_dataset"):
    self._assertAsDataset(builder)

  with self._subTest("num_examples"):
    self._assertNumSamples(builder)

  with self._subTest("reload"):
    # When reloading the dataset, the metadata should be reloaded too.
    builder_reloaded = self._make_builder(config=builder.builder_config)
    self._assertNumSamples(builder_reloaded)

    # After reloading, as_dataset should still work.
    with self._subTest("as_dataset"):
      self._assertAsDataset(builder_reloaded)
def _download_and_prepare_as_dataset(self, builder):
  # Provide the manual dir only if the builder has MANUAL_DOWNLOAD_INSTRUCTIONS
  # set.
  missing_dir_mock = mock.PropertyMock(
      side_effect=Exception("Missing MANUAL_DOWNLOAD_INSTRUCTIONS"))
  manual_dir = (self.dummy_data if builder.MANUAL_DOWNLOAD_INSTRUCTIONS
                else missing_dir_mock)
  with mock.patch.multiple(
      "tensorflow_datasets.core.download.DownloadManager",
      download_and_extract=self._get_dl_extract_result,
      download=self._get_dl_download_result,
      download_checksums=self._download_checksums,
      manual_dir=manual_dir,
      download_dir=self.dummy_data,
  ):
    # For Beam datasets, set up the runner config.
    beam_runner = None
    download_config = download.DownloadConfig(
        compute_stats=download.ComputeStatsMode.SKIP,
        beam_runner=beam_runner,
    )
    with self._test_key_not_local_path(builder):
      builder.download_and_prepare(download_config=download_config)

  with self._subTest("as_dataset"):
    self._assertAsDataset(builder)

  with self._subTest("num_examples"):
    self._assertNumSamples(builder)

  with self._subTest("reload"):
    # When reloading the dataset, the metadata should be reloaded too.
    builder_reloaded = self._make_builder(config=builder.builder_config)
    self._assertNumSamples(builder_reloaded)

    # After reloading, as_dataset should still work.
    with self._subTest("as_dataset"):
      self._assertAsDataset(builder_reloaded)

  with self._subTest("config_description"):
    self._test_description_builder_config(builder)
def test_gcs_not_exists(self):
  # By disabling the patch, and because DummyMnist is not on GCS, we can
  # simulate a new dataset starting from scratch.
  self.patch_gcs.stop()
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    builder = testing.DummyMnist(data_dir=tmp_dir)
    # No dataset_info restored, so stats are empty.
    self.assertEqual(builder.info.splits.total_num_examples, 0)
    self.assertFalse(self.compute_dynamic_property.called)

    dl_config = download.DownloadConfig(
        compute_stats=download.ComputeStatsMode.AUTO,
    )
    builder.download_and_prepare(download_config=dl_config)

    # Statistics should have been computed.
    self.assertTrue(self.compute_dynamic_property.called)
  self.patch_gcs.start()
def test_skip_stats(self):
  # Test when stats do not exist yet and compute_stats='skip'.
  # By disabling the patch, and because DummyMnist is not on GCS, we can
  # simulate a new dataset starting from scratch.
  self.patch_gcs.stop()
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    # No dataset_info restored, so stats are empty.
    builder = testing.DummyMnist(data_dir=tmp_dir, num_shards=5)
    self.assertEqual(builder.info.splits.total_num_examples, 0)
    self.assertFalse(self.compute_dynamic_property.called)

    download_config = download.DownloadConfig(
        compute_stats=download.ComputeStatsMode.SKIP,
    )
    builder.download_and_prepare(download_config=download_config)

    # Statistics computation should have been skipped.
    self.assertEqual(builder.info.splits.total_num_examples, 0)
    self.assertFalse(self.compute_dynamic_property.called)
  self.patch_gcs.start()
def _download_and_prepare_as_dataset(self, builder):
  with absltest.mock.patch.multiple(
      "tensorflow_datasets.core.download.DownloadManager",
      download_and_extract=self._get_dl_extract_result,
      download=self._get_dl_extract_result,
      download_checksums=lambda *_: None,
      manual_dir=self.example_dir,
  ):
    if isinstance(builder, dataset_builder.BeamBasedBuilder):
      import apache_beam as beam  # pylint: disable=g-import-not-at-top
      # For Beam datasets, set up the runner config.
      beam_runner = None
      beam_options = beam.options.pipeline_options.PipelineOptions()
    else:
      beam_runner = None
      beam_options = None

    download_config = download.DownloadConfig(
        compute_stats=download.ComputeStatsMode.FORCE,
        beam_runner=beam_runner,
        beam_options=beam_options,
    )
    builder.download_and_prepare(download_config=download_config)

  with self._subTest("as_dataset"):
    self._assertAsDataset(builder)

  with self._subTest("num_examples"):
    self._assertNumSamples(builder)

  with self._subTest("reload"):
    # When reloading the dataset, the metadata should be reloaded too.
    builder_reloaded = self._make_builder(config=builder.builder_config)
    self._assertNumSamples(builder_reloaded)

    # After reloading, as_dataset should still work.
    with self._subTest("as_dataset"):
      self._assertAsDataset(builder_reloaded)
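# --- Usage sketch for Beam-based builders (illustrative assumptions) ---
# The helpers above thread `beam_runner`/`beam_options` through DownloadConfig.
# A caller preparing a Beam-based dataset locally might configure it roughly
# like this; DirectRunner and the empty PipelineOptions are illustrative
# choices, not the only valid ones.
import apache_beam as beam
import tensorflow_datasets as tfds

beam_dl_config = tfds.download.DownloadConfig(
    beam_runner=beam.runners.DirectRunner(),  # local, single-process runner
    beam_options=beam.options.pipeline_options.PipelineOptions(),
)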
def download_and_prepare(self, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Args:
    download_dir: `str`, directory where downloaded files are stored.
      Defaults to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing dataset.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if (data_exists and
      download_config.download_mode == REUSE_DATASET_IF_EXISTS):
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(
            self.name, self._data_dir, self.info.version))

  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  self._log_download_bytes()

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with file_format_adapter.incomplete_dir(self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      self._download_and_prepare(
          dl_manager=dl_manager,
          max_examples_per_split=download_config.max_examples_per_split)

      # NOTE: If modifying the lines below to put additional information in
      # DatasetInfo, you'll likely also want to update
      # DatasetInfo.read_from_directory to possibly restore these attributes
      # when reading from package data.

      # Update the DatasetInfo metadata by computing statistics from the data.
      if download_config.compute_stats:
        already_has_stats = bool(self.info.splits.total_num_examples)
        if already_has_stats:
          logging.info(
              "Skipping computing stats because they are already populated.")
        else:
          self.info.compute_dynamic_properties()

      # Set checksums for all downloaded files.
      self.info.download_checksums = dl_manager.recorded_download_checksums
      # Set the total size of all downloaded files.
      self.info.size_in_bytes = sum(dl_manager.download_sizes.values())

      # Write DatasetInfo to disk, even if we haven't computed the statistics.
      self.info.write_to_directory(self._data_dir)
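# --- Usage sketch: limiting generation for quick debugging (illustration) ---
# `max_examples_per_split`, forwarded to `_download_and_prepare` above, lets a
# caller generate only a handful of examples per split, e.g. for smoke tests.
# "mnist" is only an illustrative dataset name.
import tensorflow_datasets as tfds

debug_config = tfds.download.DownloadConfig(max_examples_per_split=5)
tfds.builder("mnist").download_and_prepare(download_config=debug_config)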
def download_and_prepare(self, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Args:
    download_dir: `str`, directory where downloaded files are stored.
      Defaults to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing dataset.

  Raises:
    IOError: if there is not enough disk space available.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(
            self.name, self._data_dir, self.version))

  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  if not utils.has_sufficient_disk_space(
      self.info.size_in_bytes, directory=self._data_dir_root):
    raise IOError("Not enough disk space. Needed: %s" %
                  units.size_str(self.info.size_in_bytes))
  self._log_download_bytes()

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with file_format_adapter.incomplete_dir(self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      if (download_config.try_download_gcs and
          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
        logging.warning(GCS_HOSTED_MSG, self.name)
        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
        self.info.read_from_directory(self._data_dir)
      else:
        self._download_and_prepare(
            dl_manager=dl_manager,
            download_config=download_config)

        # NOTE: If modifying the lines below to put additional information in
        # DatasetInfo, you'll likely also want to update
        # DatasetInfo.read_from_directory to possibly restore these attributes
        # when reading from package data.

        # Update DatasetInfo metadata by computing statistics from the data.
        if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
            (download_config.compute_stats == download.ComputeStatsMode.AUTO
             and bool(self.info.splits.total_num_examples))):
          logging.info("Skipping computing stats for mode %s.",
                       download_config.compute_stats)
        else:  # Mode is forced or stats do not exist yet.
          logging.info("Computing statistics.")
          self.info.compute_dynamic_properties()

        self.info.size_in_bytes = dl_manager.downloaded_size
        # Write DatasetInfo to disk, even if we haven't computed statistics.
        self.info.write_to_directory(self._data_dir)
  self._log_download_done()
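# --- Caller-side sketch of the reuse/overwrite contract above (illustration) ---
# With REUSE_DATASET_IF_EXISTS, an already-generated dataset is reused and the
# call returns early; any other mode on an existing data_dir raises ValueError.
# tfds.download.GenerateMode is assumed to be the public counterpart of the
# REUSE_DATASET_IF_EXISTS constant used in the implementation above.
import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(
        download_mode=tfds.download.GenerateMode.REUSE_DATASET_IF_EXISTS,
    ))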
def download_and_prepare(self, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Args:
    download_dir: `str`, directory where downloaded files are stored.
      Defaults to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing dataset.

  Raises:
    IOError: if there is not enough disk space available.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  # Disable `download_and_prepare` on Python 2 (internally, we still allow
  # Py2 for `dataset_builder_tests.py` and co).
  if _is_py2_download_and_prepare_disabled and six.PY2:
    raise NotImplementedError(
        "TFDS has dropped `builder.download_and_prepare` support for "
        "Python 2. Please update your code to Python 3.")

  if self.version.tfds_version_to_prepare:
    available_to_prepare = ", ".join(
        str(v) for v in self.versions if not v.tfds_version_to_prepare)
    raise AssertionError(
        "The version of the dataset you are trying to use ({}:{}) can only "
        "be generated using TFDS code synced @ {} or earlier. Either sync to "
        "that version of TFDS to first prepare the data or use another "
        "version of the dataset (available for `download_and_prepare`: "
        "{}).".format(
            self.name, self.version, self.version.tfds_version_to_prepare,
            available_to_prepare))

  # Only `cls.VERSION` or `experimental_latest` versions can be generated.
  # Otherwise, users may accidentally generate an old version using the
  # code from newer versions.
  installable_versions = {
      str(v) for v in (self.canonical_version, max(self.versions))
  }
  if str(self.version) not in installable_versions:
    msg = (
        "The version of the dataset you are trying to use ({}) is too "
        "old for this version of TFDS so cannot be generated.").format(
            self.info.full_name)
    if self.version.tfds_version_to_prepare:
      msg += (
          "{} can only be generated using TFDS code synced @ {} or earlier. "
          "Either sync to that version of TFDS to first prepare the data or "
          "use another version of the dataset. ").format(
              self.version, self.version.tfds_version_to_prepare)
    else:
      msg += (
          "Either sync to a previous version of TFDS to first prepare the "
          "data or use another version of the dataset. ")
    msg += "Available for `download_and_prepare`: {}".format(
        list(sorted(installable_versions)))
    raise ValueError(msg)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(
            self.name, self._data_dir, self.version))

  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  if not utils.has_sufficient_disk_space(
      self.info.dataset_size + self.info.download_size,
      directory=self._data_dir_root):
    raise IOError(
        "Not enough disk space. Needed: {} (download: {}, generated: {})"
        .format(
            units.size_str(self.info.dataset_size + self.info.download_size),
            units.size_str(self.info.download_size),
            units.size_str(self.info.dataset_size),
        ))
  self._log_download_bytes()

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config)

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with utils.incomplete_dir(self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      if (download_config.try_download_gcs and
          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
        logging.warning(GCS_HOSTED_MSG, self.name)
        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
        self.info.read_from_directory(self._data_dir)
      else:
        self._download_and_prepare(
            dl_manager=dl_manager,
            download_config=download_config)

        # NOTE: If modifying the lines below to put additional information in
        # DatasetInfo, you'll likely also want to update
        # DatasetInfo.read_from_directory to possibly restore these attributes
        # when reading from package data.

        # Skip statistics computation if tfdv isn't present.
        try:
          import tensorflow_data_validation  # pylint: disable=g-import-not-at-top,import-outside-toplevel,unused-import  # pytype: disable=import-error
          skip_stats_computation = False
        except ImportError:
          skip_stats_computation = True

        splits = list(self.info.splits.values())
        statistics_already_computed = bool(
            splits and splits[0].statistics.num_examples)
        # Update DatasetInfo metadata by computing statistics from the data.
        if (skip_stats_computation or
            download_config.compute_stats == download.ComputeStatsMode.SKIP or
            (download_config.compute_stats == download.ComputeStatsMode.AUTO
             and statistics_already_computed)):
          logging.info("Skipping computing stats for mode %s.",
                       download_config.compute_stats)
        else:  # Mode is forced or stats do not exist yet.
          logging.info("Computing statistics.")
          self.info.compute_dynamic_properties()

        self.info.download_size = dl_manager.downloaded_size
        # Write DatasetInfo to disk, even if we haven't computed statistics.
        self.info.write_to_directory(self._data_dir)
  self._log_download_done()
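# --- Disk-space precondition, sketched with the standard library (illustration) ---
# The check above relies on TFDS-internal helpers (utils.has_sufficient_disk_space,
# units.size_str). A rough stand-in using only the standard library; the exact
# TFDS behaviour may differ.
import shutil

def has_enough_disk_space(needed_bytes, directory="."):
  """Returns True if the filesystem holding `directory` has enough free space."""
  return shutil.disk_usage(directory).free >= needed_bytes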
def _get_dl_config_if_need_to_run(self):
  return download.DownloadConfig(
      beam_options=beam.options.pipeline_options.PipelineOptions(),
  )
def make_default_config():
  return download.DownloadConfig()