Example #1
0
    def _make_download_manager(self, download_dir, download_config):
        """Creates a new download manager object."""
        download_dir = download_dir or os.path.join(self._data_dir_root,
                                                    "downloads")
        extract_dir = (download_config.extract_dir
                       or os.path.join(download_dir, "extracted"))

        # Use manual_dir only if MANUAL_DOWNLOAD_INSTRUCTIONS are set.
        if self.MANUAL_DOWNLOAD_INSTRUCTIONS:
            manual_dir = (download_config.manual_dir
                          or os.path.join(download_dir, "manual"))
        else:
            manual_dir = None

        return download.DownloadManager(
            dataset_name=self.name,
            download_dir=download_dir,
            extract_dir=extract_dir,
            manual_dir=manual_dir,
            manual_dir_instructions=utils.dedent(
                self.MANUAL_DOWNLOAD_INSTRUCTIONS),
            force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
            force_extraction=(
                download_config.download_mode == FORCE_REDOWNLOAD),
            force_checksums_validation=(
                download_config.force_checksums_validation),
            register_checksums=download_config.register_checksums,
        )
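A minimal sketch of how the manager created above is typically consumed, inside a
builder's `_split_generators`; this is not the library's code, and the URL and
split layout are placeholders:

    def _split_generators(self, dl_manager):
        # download_and_extract downloads the archive (reusing the cache under
        # download_dir/extract_dir unless force_* is set) and returns the path
        # to the extracted directory.
        extracted = dl_manager.download_and_extract("https://example.com/data.zip")
        return {
            "train": self._generate_examples(os.path.join(extracted, "train")),
        }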
Example #2
0
  def _make_download_manager(self, download_dir, download_config):
    """Creates a new download manager object."""
    download_dir = (
        download_dir or os.path.join(self._data_dir_root, "downloads")
    )
    extract_dir = (
        download_config.extract_dir or os.path.join(download_dir, "extracted")
    )
    manual_dir = (
        download_config.manual_dir or os.path.join(download_dir, "manual")
    )

    if download_config.register_checksums:
      # Note: An error will be raised here if the user tries to record
      # checksums from within a `zipapp`.
      register_checksums_path = utils.to_write_path(self._checksums_path)
    else:
      register_checksums_path = None

    return download.DownloadManager(
        download_dir=download_dir,
        extract_dir=extract_dir,
        manual_dir=manual_dir,
        url_infos=self.url_infos,
        manual_dir_instructions=self.MANUAL_DOWNLOAD_INSTRUCTIONS,
        force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
        force_extraction=(download_config.download_mode == FORCE_REDOWNLOAD),
        force_checksums_validation=download_config.force_checksums_validation,
        register_checksums=download_config.register_checksums,
        register_checksums_path=register_checksums_path,
        verify_ssl=download_config.verify_ssl,
        dataset_name=self.name,
    )
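A sketch of how the register_checksums branch above is typically triggered,
assuming the public tfds API; the builder name is a placeholder. Recording
checksums requires a writable checksums file, which is why the code refuses to
do it from inside a read-only `zipapp`:

    import tensorflow_datasets as tfds

    config = tfds.download.DownloadConfig(register_checksums=True)
    builder = tfds.builder("my_dataset")
    builder.download_and_prepare(download_config=config)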
Example #3
0
    def download_and_prepare(self, cache_dir=None, dl_manager=None):
        """Downloads and prepares dataset for reading.

    Subclasses must override _download_and_prepare.

    Args:
      cache_dir: (str) Cached directory where to extract the data. If None,
        a default tmp directory will be used.
      dl_manager: (`tfds.download.DownloadManager`) DownloadManager to use. Only
        one of dl_manager and cache_dir can be set

    Raises:
      ValueError: If the user defines both cache_dir and dl_manager
    """
        # Both args are set
        if cache_dir and dl_manager is not None:
            raise ValueError(
                "Only one of dl_manager and cache_dir can be defined.")
        # Neither is set: use a tmp directory under the data_dir root as cache_dir.
        if not cache_dir and dl_manager is None:
            cache_dir = os.path.join(self._data_dir_root, "tmp")

        # Create the download manager
        if cache_dir:
            dl_manager = download.DownloadManager(cache_dir=cache_dir)

        # If the dataset already exists (data_dir not empty) and we are not
        # overwriting it, reuse the existing data.
        if (self._data_dir and dl_manager.mode
                == download.GenerateMode.REUSE_DATASET_IF_EXISTS):
            tf.logging.info("Reusing dataset %s (%s)", self.name,
                            self._data_dir)
            return

        # Otherwise, create a new version in a new data_dir.
        curr_date = datetime.datetime.now()
        version_str = curr_date.strftime("v_%Y%m%d_%H%M")
        data_dir = self._get_data_dir(version=version_str)
        self._data_dir = None
        tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

        # Print is intentional: we want this to always go to stdout so user has
        # information needed to cancel download/preparation if needed.
        # This comes right before the progress bar.
        size_text = termcolor.colored("%s GB" % (self.SIZE or "?"),
                                      attrs=["bold"])
        termcolor.cprint("Downloading / extracting dataset %s (%s) to %s..." %
                         (self.name, size_text, data_dir))

        # Wrap the Dataset generation in a .incomplete directory
        with file_format_adapter.incomplete_dir(data_dir) as data_dir_tmp:
            self._download_and_prepare(dl_manager=dl_manager,
                                       data_dir=data_dir_tmp)

        # Update the DatasetInfo metadata (splits info, num samples,...)
        self._data_dir = data_dir
        self.info.update_from_metadata_dir(self._data_dir)
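The `.incomplete` directory wrapper above keeps partially generated data out of
the final data_dir. A simplified illustration of the pattern (not the library's
implementation): generate into a temporary sibling directory, then rename it
into place only on success:

    import contextlib
    import os
    import shutil

    @contextlib.contextmanager
    def incomplete_dir(dirname):
      tmp_dir = dirname + ".incomplete"
      os.makedirs(tmp_dir)
      try:
        yield tmp_dir
        os.rename(tmp_dir, dirname)  # Publish only if generation succeeded.
      finally:
        if os.path.exists(tmp_dir):  # Leftover means generation failed.
          shutil.rmtree(tmp_dir)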
Example #4
0
    def download_and_prepare(self, cache_dir=None, dl_manager=None):
        """Downloads and prepares dataset for reading.

    Subclasses must override _download_and_prepare.

    Args:
      cache_dir (str): Cached directory where to extract the data. If None,
        a default tmp directory will be used.
      dl_manager (DownloadManager): DownloadManager to use. Only one of
        dl_manager and cache_dir can be set

    Raises:
      ValueError: If the user defines both cache_dir and dl_manager
    """
        # Both args are set
        if cache_dir and dl_manager is not None:
            raise ValueError(
                "Only one of dl_manager and cache_dir can be defined.")
        # Neither is set: use a tmp directory under the data_dir root as cache_dir.
        if not cache_dir and dl_manager is None:
            cache_dir = os.path.join(self._data_dir_root, "tmp")

        # Create the download manager
        if cache_dir:
            dl_manager = download.DownloadManager(cache_dir=cache_dir)

        # If the dataset already exists (data_dir not empty) and we are not
        # overwriting it, reuse the existing data.
        if (self._data_dir and dl_manager.mode
                == download.GenerateMode.REUSE_DATASET_IF_EXISTS):
            tf.logging.info("Reusing dataset %s (%s)", self.name,
                            self._data_dir)
            return

        # Otherwise, create a new version in a new data_dir.
        curr_date = datetime.datetime.now()
        version_str = curr_date.strftime("v_%Y%m%d_%H%M")
        data_dir = self._get_data_dir(version=version_str)
        tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

        # Wrap the Dataset generation in a .incomplete directory
        with file_format_adapter.incomplete_dir(data_dir) as data_dir_tmp:
            # TODO(epot): Data_dir should be an argument of download_and_prepare.
            # Modify this once a better split API exists.
            self._data_dir = data_dir_tmp
            self._download_and_prepare(dl_manager)
            self._data_dir = data_dir
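Hypothetical call sites consistent with the signature above (MyDataset and the
paths are placeholders): pass either a cache_dir or a pre-built DownloadManager,
never both:

    builder = MyDataset(data_dir="/path/to/data")
    builder.download_and_prepare(cache_dir="/path/to/cache")

    # Or, equivalently, with an explicit manager (as built on the cache_dir
    # branch above):
    dl_manager = download.DownloadManager(cache_dir="/path/to/cache")
    builder.download_and_prepare(dl_manager=dl_manager)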
Example #5
0
  def _make_download_manager(self, download_dir, download_config):
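    """Creates a new download manager object."""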
    download_dir = download_dir or os.path.join(self._data_dir_root,
                                                "downloads")
    extract_dir = (download_config.extract_dir or
                   os.path.join(download_dir, "extracted"))
    manual_dir = (download_config.manual_dir or
                  os.path.join(download_dir, "manual"))
    manual_dir = os.path.join(manual_dir, self.name)

    return download.DownloadManager(
        dataset_name=self.name,
        download_dir=download_dir,
        extract_dir=extract_dir,
        manual_dir=manual_dir,
        force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
        force_extraction=(download_config.download_mode == FORCE_REDOWNLOAD),
        register_checksums=download_config.register_checksums,
    )
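A rough, hypothetical trace of the default path resolution above, with
placeholder values standing in for self._data_dir_root and self.name; note how
this variant nests manual_dir under a per-dataset subdirectory:

    import os

    data_dir_root = "/data/tensorflow_datasets"  # placeholder for self._data_dir_root
    name = "my_dataset"                          # placeholder for self.name

    download_dir = os.path.join(data_dir_root, "downloads")
    extract_dir = os.path.join(download_dir, "extracted")
    manual_dir = os.path.join(os.path.join(download_dir, "manual"), name)

    print(download_dir)  # /data/tensorflow_datasets/downloads
    print(extract_dir)   # /data/tensorflow_datasets/downloads/extracted
    print(manual_dir)    # /data/tensorflow_datasets/downloads/manual/my_dataset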