Example 1
    def download_and_prepare(self, cache_dir=None, dl_manager=None):
        """Downloads and prepares dataset for reading.

    Subclasses must override _download_and_prepare.

    Args:
      cache_dir: (str) Cached directory where to extract the data. If None,
        a default tmp directory will be used.
      dl_manager: (`tfds.download.DownloadManager`) DownloadManager to use. Only
        one of dl_manager and cache_dir can be set

    Raises:
      ValueError: If the user defines both cache_dir and dl_manager
    """
        # Both args are set
        if cache_dir and dl_manager is not None:
            raise ValueError(
                "Only one of dl_manager and cache_dir can be defined.")
        # None are set. Use the data_dir as cache_dir
        if not cache_dir and dl_manager is None:
            cache_dir = os.path.join(self._data_dir_root, "tmp")

        # Create the download manager
        if cache_dir:
            dl_manager = download.DownloadManager(cache_dir=cache_dir)

        # If the dataset already exists (data_dir not empty) and we do not
        # overwrite the dataset, there is nothing to generate.
        if (self._data_dir and dl_manager.mode
                == download.GenerateMode.REUSE_DATASET_IF_EXISTS):
            tf.logging.info("Reusing dataset %s (%s)", self.name,
                            self._data_dir)
            return

        # Otherwise, create a new version in a new data_dir.
        curr_date = datetime.datetime.now()
        version_str = curr_date.strftime("v_%Y%m%d_%H%M")
        data_dir = self._get_data_dir(version=version_str)
        self._data_dir = None
        tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

        # Print is intentional: we want this to always go to stdout so user has
        # information needed to cancel download/preparation if needed.
        # This comes right before the progress bar.
        # BUG FIX: "%" binds tighter than "or", so the original expression
        # `"%s GB" % self.SIZE or "?"` could never fall back to "?" and
        # printed "None GB" when SIZE was unset. Parenthesize the fallback.
        size_text = termcolor.colored("%s GB" % (self.SIZE or "?"),
                                      attrs=["bold"])
        termcolor.cprint("Downloading / extracting dataset %s (%s) to %s..." %
                         (self.name, size_text, data_dir))

        # Wrap the Dataset generation in a .incomplete directory so partial
        # generations are never mistaken for a complete dataset.
        with file_format_adapter.incomplete_dir(data_dir) as data_dir_tmp:
            self._download_and_prepare(dl_manager=dl_manager,
                                       data_dir=data_dir_tmp)

        # Update the DatasetInfo metadata (splits info, num samples,...)
        self._data_dir = data_dir
        self.info.update_from_metadata_dir(self._data_dir)
    def download_and_prepare(self, cache_dir=None, dl_manager=None):
        """Downloads and prepares dataset for reading.

    Subclasses must override _download_and_prepare.

    Args:
      cache_dir (str): Cached directory where to extract the data. If None,
        a default tmp directory will be used.
      dl_manager (DownloadManager): DownloadManager to use. Only one of
        dl_manager and cache_dir can be set

    Raises:
      ValueError: If the user defines both cache_dir and dl_manager
    """
        # Reject the ambiguous case where the caller supplied both arguments.
        if cache_dir and dl_manager is not None:
            raise ValueError(
                "Only one of dl_manager and cache_dir can be defined.")
        # Neither given: fall back to a tmp dir under the data_dir root.
        if not cache_dir and dl_manager is None:
            cache_dir = os.path.join(self._data_dir_root, "tmp")

        # A cache_dir (explicit or defaulted) means we build our own manager.
        if cache_dir:
            dl_manager = download.DownloadManager(cache_dir=cache_dir)

        # Nothing to do when the data already exists on disk and the mode asks
        # us to reuse it instead of regenerating.
        if (self._data_dir and dl_manager.mode
                == download.GenerateMode.REUSE_DATASET_IF_EXISTS):
            tf.logging.info("Reusing dataset %s (%s)", self.name,
                            self._data_dir)
            return

        # Fresh generation: derive a timestamped version for the new data_dir.
        now = datetime.datetime.now()
        new_version = now.strftime("v_%Y%m%d_%H%M")
        data_dir = self._get_data_dir(version=new_version)
        tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

        # Generate inside a .incomplete directory so a partial run is never
        # mistaken for a finished dataset.
        with file_format_adapter.incomplete_dir(data_dir) as data_dir_tmp:
            # TODO(epot): Data_dir should be an argument of download_and_prepare.
            # Modify this once a better split API exists.
            self._data_dir = data_dir_tmp
            self._download_and_prepare(dl_manager)
            self._data_dir = data_dir
Example 3
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      ValueError: If the dataset already exists on disk (overwriting is not
        supported because it would conflict with versioning).
    """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        # Nothing to do when the data is on disk and the mode allows reuse.
        if (data_exists
                and download_config.download_mode == REUSE_DATASET_IF_EXISTS):
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.info.version))
        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        self._log_download_bytes()

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with file_format_adapter.incomplete_dir(
                self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                self._download_and_prepare(
                    dl_manager=dl_manager,
                    max_examples_per_split=download_config.
                    max_examples_per_split)

                # NOTE: If modifying the lines below to put additional information in
                # DatasetInfo, you'll likely also want to update
                # DatasetInfo.read_from_directory to possibly restore these attributes
                # when reading from package data.

                # Update the DatasetInfo metadata by computing statistics from the data.
                if download_config.compute_stats:
                    if self.info.splits.total_num_examples:
                        logging.info(
                            "Skipping computing stats because they are already "
                            "populated.")
                    else:
                        self.info.compute_dynamic_properties()

                        # Set checksums for all files downloaded
                        self.info.download_checksums = (
                            dl_manager.recorded_download_checksums)
                        # Set size of all files downloaded. Idiom fix: sum the
                        # dict's values directly instead of building a throwaway
                        # list from .items() and discarding the keys.
                        self.info.size_in_bytes = sum(
                            dl_manager.download_sizes.values())
                # Write DatasetInfo to disk, even if we haven't computed the statistics.
                self.info.write_to_directory(self._data_dir)
Example 4
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
    """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        # Nothing to do when the data is on disk and the mode allows reuse.
        if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.version))
        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        # Fail fast when the target filesystem cannot hold the dataset.
        if not utils.has_sufficient_disk_space(self.info.size_in_bytes,
                                               directory=self._data_dir_root):
            raise IOError("Not enough disk space. Needed: %s" %
                          units.size_str(self.info.size_in_bytes))
        self._log_download_bytes()

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with file_format_adapter.incomplete_dir(
                self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                hosted_on_gcs = (download_config.try_download_gcs
                                 and gcs_utils.is_dataset_on_gcs(
                                     self.info.full_name))
                if hosted_on_gcs:
                    # Prefer fetching the prebuilt copy from GCS over building.
                    logging.warning(GCS_HOSTED_MSG, self.name)
                    gcs_utils.download_gcs_dataset(self.info.full_name,
                                                   self._data_dir)
                    self.info.read_from_directory(self._data_dir)
                else:
                    self._download_and_prepare(dl_manager=dl_manager,
                                               download_config=download_config)

                    # NOTE: If modifying the lines below to put additional information in
                    # DatasetInfo, you'll likely also want to update
                    # DatasetInfo.read_from_directory to possibly restore these attributes
                    # when reading from package data.

                    # Update DatasetInfo metadata by computing statistics from the data.
                    # Stats are skipped for mode SKIP, or for mode AUTO when
                    # they are already populated.
                    stats_mode = download_config.compute_stats
                    skip_stats = (
                        stats_mode == download.ComputeStatsMode.SKIP
                        or (stats_mode == download.ComputeStatsMode.AUTO
                            and bool(self.info.splits.total_num_examples)))
                    if skip_stats:
                        logging.info("Skipping computing stats for mode %s.",
                                     stats_mode)
                    else:  # Mode is forced or stats do not exists yet
                        logging.info("Computing statistics.")
                        self.info.compute_dynamic_properties()
                    self.info.size_in_bytes = dl_manager.downloaded_size
                    # Write DatasetInfo to disk, even if we haven't computed statistics.
                    self.info.write_to_directory(self._data_dir)
        self._log_download_done()
Example 5
  def download_and_prepare(self, download_dir=None, download_config=None):
    """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
      AssertionError: if the requested version can only be generated with an
        older checkout of the TFDS code.
      ValueError: if the dataset already exists on disk, or the requested
        version is too old to be generated by this version of TFDS.
    """

    download_config = download_config or download.DownloadConfig()
    data_exists = tf.io.gfile.exists(self._data_dir)
    # Nothing to do when the data is on disk and the mode allows reuse.
    if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
      logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
      return

    # Some versions can only be generated by an older checkout of the TFDS
    # code: refuse early, listing the versions this code can prepare.
    if self.version.tfds_version_to_prepare:
      available_to_prepare = ", ".join(str(v) for v in self.versions
                                       if not v.tfds_version_to_prepare)
      raise AssertionError(
          "The version of the dataset you are trying to use ({}:{}) can only "
          "be generated using TFDS code synced @ {} or earlier. Either sync to "
          "that version of TFDS to first prepare the data or use another "
          "version of the dataset (available for `download_and_prepare`: "
          "{}).".format(
              self.name, self.version, self.version.tfds_version_to_prepare,
              available_to_prepare))

    # Only `cls.VERSION` or `experimental_latest` versions can be generated.
    # Otherwise, users may accidentally generate an old version using the
    # code from newer versions.
    installable_versions = {
        str(v) for v in (self.canonical_version, max(self.versions))
    }
    if str(self.version) not in installable_versions:
      msg = (
          "The version of the dataset you are trying to use ({}) is too "
          "old for this version of TFDS so cannot be generated."
      ).format(self.info.full_name)
      # Dead-code fix: the original also branched on
      # `self.version.tfds_version_to_prepare` here, but that branch was
      # unreachable — a truthy value already raised AssertionError above —
      # so only the generic message remains.
      msg += (
          "Either sync to a previous version of TFDS to first prepare the "
          "data or use another version of the dataset. "
      )
      # Idiom fix: sorted() already returns a list; the list() wrapper was
      # redundant.
      msg += "Available for `download_and_prepare`: {}".format(
          sorted(installable_versions))
      raise ValueError(msg)

    # Currently it's not possible to overwrite the data because it would
    # conflict with versioning: If the last version has already been generated,
    # it will always be reloaded and data_dir will be set at construction.
    if data_exists:
      raise ValueError(
          "Trying to overwrite an existing dataset {} at {}. A dataset with "
          "the same version {} already exists. If the dataset has changed, "
          "please update the version number.".format(self.name, self._data_dir,
                                                     self.version))

    logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
    # Fail fast when the target filesystem cannot hold both the raw downloads
    # and the generated dataset.
    if not utils.has_sufficient_disk_space(
        self.info.dataset_size + self.info.download_size,
        directory=self._data_dir_root):
      raise IOError(
          "Not enough disk space. Needed: {} (download: {}, generated: {})"
          .format(
              units.size_str(self.info.dataset_size + self.info.download_size),
              units.size_str(self.info.download_size),
              units.size_str(self.info.dataset_size),
          ))
    self._log_download_bytes()

    dl_manager = self._make_download_manager(
        download_dir=download_dir,
        download_config=download_config)

    # Create a tmp dir and rename to self._data_dir on successful exit.
    with file_format_adapter.incomplete_dir(self._data_dir) as tmp_data_dir:
      # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
      # it to every sub function.
      with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
        if (download_config.try_download_gcs and
            gcs_utils.is_dataset_on_gcs(self.info.full_name)):
          # Prefer fetching the prebuilt copy from GCS over building locally.
          logging.warning(GCS_HOSTED_MSG, self.name)
          gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
          self.info.read_from_directory(self._data_dir)
        else:
          self._download_and_prepare(
              dl_manager=dl_manager,
              download_config=download_config)

          # NOTE: If modifying the lines below to put additional information in
          # DatasetInfo, you'll likely also want to update
          # DatasetInfo.read_from_directory to possibly restore these attributes
          # when reading from package data.

          splits = list(self.info.splits.values())
          statistics_already_computed = bool(
              splits and splits[0].statistics.num_examples)
          # Update DatasetInfo metadata by computing statistics from the data.
          # Skip for mode SKIP, or for mode AUTO when stats already exist.
          if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
              download_config.compute_stats == download.ComputeStatsMode.AUTO
              and statistics_already_computed
             ):
            logging.info(
                "Skipping computing stats for mode %s.",
                download_config.compute_stats)
          else:  # Mode is forced or stats do not exists yet
            logging.info("Computing statistics.")
            self.info.compute_dynamic_properties()
          self.info.download_size = dl_manager.downloaded_size
          # Write DatasetInfo to disk, even if we haven't computed statistics.
          self.info.write_to_directory(self._data_dir)
    self._log_download_done()