Example #1
0
    def _make_download_manager(self, download_dir, download_config):
        """Creates a new download manager object."""
        download_dir = download_dir or os.path.join(self._data_dir_root,
                                                    "downloads")
        extract_dir = (download_config.extract_dir
                       or os.path.join(download_dir, "extracted"))

        # Use manual_dir only if MANUAL_DOWNLOAD_INSTRUCTIONS are set.
        if self.MANUAL_DOWNLOAD_INSTRUCTIONS:
            manual_dir = (download_config.manual_dir
                          or os.path.join(download_dir, "manual"))
        else:
            manual_dir = None

        return download.DownloadManager(
            dataset_name=self.name,
            download_dir=download_dir,
            extract_dir=extract_dir,
            manual_dir=manual_dir,
            manual_dir_instructions=utils.dedent(
                self.MANUAL_DOWNLOAD_INSTRUCTIONS),
            force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
            force_extraction=(
                download_config.download_mode == FORCE_REDOWNLOAD),
            force_checksums_validation=(
                download_config.force_checksums_validation),
            register_checksums=download_config.register_checksums,
        )
Example #2
0
  def _make_download_manager(self, download_dir, download_config):
    """Creates a new download manager object."""
    download_dir = (
        download_dir or os.path.join(self._data_dir_root, "downloads")
    )
    extract_dir = (
        download_config.extract_dir or os.path.join(download_dir, "extracted")
    )
    manual_dir = (
        download_config.manual_dir or os.path.join(download_dir, "manual")
    )

    if download_config.register_checksums:
      # Note: An error will be raised here if the user tries to record
      # checksums from a `zipapp`.
      register_checksums_path = utils.to_write_path(self._checksums_path)
    else:
      register_checksums_path = None

    return download.DownloadManager(
        download_dir=download_dir,
        extract_dir=extract_dir,
        manual_dir=manual_dir,
        url_infos=self.url_infos,
        manual_dir_instructions=utils.dedent(self.MANUAL_DOWNLOAD_INSTRUCTIONS),
        force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
        force_extraction=(download_config.download_mode == FORCE_REDOWNLOAD),
        force_checksums_validation=download_config.force_checksums_validation,
        register_checksums=download_config.register_checksums,
        register_checksums_path=register_checksums_path,
        verify_ssl=download_config.verify_ssl,
        dataset_name=self.name,
    )
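
A minimal usage sketch for the two `_make_download_manager` versions above: both are driven by a `tfds.download.DownloadConfig`, whose fields map onto the arguments read in the code. The paths below are placeholders and `mnist` is just an illustrative dataset name.

import tensorflow_datasets as tfds

# Placeholder paths; adjust to your environment.
config = tfds.download.DownloadConfig(
    extract_dir="/tmp/tfds/extracted",  # overrides the "<download_dir>/extracted" default
    manual_dir="/tmp/tfds/manual",      # only used when MANUAL_DOWNLOAD_INSTRUCTIONS is set
    download_mode=tfds.download.GenerateMode.FORCE_REDOWNLOAD,  # re-download and re-extract
    register_checksums=False,
)

builder = tfds.builder("mnist")
# `download_and_prepare` internally creates the download manager shown above.
builder.download_and_prepare(download_config=config)
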
Example #3
0
  def beam_pipeline(self) -> 'beam.Pipeline':
    """Instanciates and returns Apache Beam pipeline.

    Calling this function starts the Apache Beam mode.

    Returns:
      pipeline: The beam pipeline
    """
    if not self._in_contextmanager:
      raise AssertionError(
          'beam_pipeline has to be created from within `SplitBuilder` '
          'contextmanager.'
      )

    beam = lazy_imports_lib.lazy_imports.apache_beam

    # On Colab, stderr isn't displayed by default, so use `print` instead.
    print_fn = print if utils.is_notebook() else logging.warning
    if not self._beam_runner and not self._beam_options:
      msg = utils.dedent(
          """
          **************************** WARNING *********************************
          Warning: The dataset you're trying to generate is using Apache Beam,
          yet no `beam_runner` nor `beam_options` was explicitly provided.

          Some Beam datasets take weeks to generate, so are usually not suited
          for single machine generation. Please have a look at the instructions
          to setup distributed generation:

          https://www.tensorflow.org/datasets/beam_datasets#generating_a_beam_dataset
          **********************************************************************
          """
      )
      print_fn(msg)

    beam_options = (
        self._beam_options or beam.options.pipeline_options.PipelineOptions()
    )
    # Beam type checking assumes that a transform's multiple outputs are of the
    # same type, which is not our case. It also doesn't handle all types
    # correctly, so we are better off without it.
    beam_options.view_as(
        beam.options.pipeline_options.TypeOptions
    ).pipeline_type_check = False
    # Create the global pipeline object common to all splits.
    pipeline = beam.Pipeline(runner=self._beam_runner, options=beam_options)
    self._beam_pipeline = pipeline.__enter__()
    return self._beam_pipeline
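
A minimal sketch of supplying the runner and options consumed by `beam_pipeline` above, via `tfds.download.DownloadConfig`. The dataset name is illustrative, and the `DirectRunner` flags are only suitable for small datasets, as the warning in the code notes.

import apache_beam as beam
import tensorflow_datasets as tfds

# Passing explicit Beam options avoids the single-machine warning above.
beam_options = beam.options.pipeline_options.PipelineOptions(
    ["--direct_num_workers=4", "--direct_running_mode=multi_processing"])
config = tfds.download.DownloadConfig(beam_options=beam_options)

builder = tfds.builder("my_beam_dataset")  # illustrative: any Beam-based dataset
builder.download_and_prepare(download_config=config)
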
Example #4
0
  def __init__(self,
               builder,
               description=None,
               features=None,
               supervised_keys=None,
               homepage=None,
               citation=None,
               metadata=None,
               redistribution_info=None):
    """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder`, dataset builder for this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict
        of the `tf.data.Dataset()` object from the `builder.as_dataset()`
        method.
      supervised_keys: `tuple` of `(input_key, target_key)`, Specifies the
        input feature and the label for supervised learning, if applicable for
        the dataset. The keys correspond to the feature names to select in
        `info.features`. When calling `tfds.core.DatasetBuilder.as_dataset()`
        with `as_supervised=True`, the `tf.data.Dataset` object will yield
        the (input, target) defined here.
      homepage: `str`, optional, the homepage for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
      metadata: `tfds.core.Metadata`, additional object which will be
        stored/restored with the dataset. This allows for storing additional
        information with the dataset.
      redistribution_info: `dict`, optional, information needed for
        redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
        The content of the `license` subfield will automatically be written to a
        LICENSE file stored with the dataset.
    """
    self._builder = builder

    self._info_proto = dataset_info_pb2.DatasetInfo(
        name=builder.name,
        description=utils.dedent(description),
        version=str(builder._version),  # pylint: disable=protected-access
        citation=utils.dedent(citation),
        redistribution_info=dataset_info_pb2.RedistributionInfo(
            license=utils.dedent(redistribution_info.pop("license")),
            **redistribution_info) if redistribution_info else None)

    if homepage:
      self._info_proto.location.urls[:] = [homepage]

    if features:
      if not isinstance(features, top_level_feature.TopLevelFeature):
        raise ValueError(
            "DatasetInfo.features only supports FeaturesDict or Sequence at "
            "the top-level. Got {}".format(features))
      features._set_top_level()  # pylint: disable=protected-access
    self._features = features
    self._splits = splits_lib.SplitDict(self._builder.name)
    if supervised_keys is not None:
      assert isinstance(supervised_keys, tuple)
      assert len(supervised_keys) == 2
      self._info_proto.supervised_keys.input = supervised_keys[0]
      self._info_proto.supervised_keys.output = supervised_keys[1]

    if metadata and not isinstance(metadata, Metadata):
      raise ValueError(
          "Metadata should be a `tfds.core.Metadata` instance. Received "
          "{}".format(metadata))
    self._metadata = metadata

    # Is this object initialized with both the static and the dynamic data?
    self._fully_initialized = False
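
This constructor is typically invoked from a builder's `_info()` method; a minimal sketch, with the class, feature names, and URLs purely illustrative.

import tensorflow_datasets as tfds

class MyDataset(tfds.core.GeneratorBasedBuilder):
  """Hypothetical builder showing how DatasetInfo is constructed."""

  VERSION = tfds.core.Version("1.0.0")

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        description="A short description of the dataset.",
        features=tfds.features.FeaturesDict({
            "image": tfds.features.Image(shape=(28, 28, 1)),
            "label": tfds.features.ClassLabel(num_classes=10),
        }),
        # Keys must exist in `features`; used by as_dataset(as_supervised=True).
        supervised_keys=("image", "label"),
        homepage="https://example.com",
        citation="""@misc{example2020}""",
    )
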
Example #5
0
    def __init__(
        self,
        *,
        download_dir: epath.PathLike,
        extract_dir: Optional[epath.PathLike] = None,
        manual_dir: Optional[epath.PathLike] = None,
        manual_dir_instructions: Optional[str] = None,
        url_infos: Optional[Dict[str, checksums.UrlInfo]] = None,
        dataset_name: Optional[str] = None,
        force_download: bool = False,
        force_extraction: bool = False,
        force_checksums_validation: bool = False,
        register_checksums: bool = False,
        register_checksums_path: Optional[epath.PathLike] = None,
        verify_ssl: bool = True,
    ):
        """Download manager constructor.

    Args:
      download_dir: Path to directory where downloads are stored.
      extract_dir: Path to directory where artifacts are extracted.
      manual_dir: Path to manually downloaded/extracted data directory.
      manual_dir_instructions: Human readable instructions on how to prepare
        contents of the manual_dir for this dataset.
      url_infos: Urls info for the checksums.
      dataset_name: Name of the dataset this instance will be used for. If
        provided, downloads will record which datasets they were used for.
      force_download: If True, always [re]download.
      force_extraction: If True, always [re]extract.
      force_checksums_validation: If True, raise an error if a URL does not
        have registered checksums.
      register_checksums: If True, download checksums aren't validated, but
        are recorded to file instead.
      register_checksums_path: Path where to save checksums. Should be set if
        `register_checksums` is True.
      verify_ssl: `bool`, defaults to True. If True, verify the SSL certificate
        when downloading the dataset.

    Raises:
      FileNotFoundError: Raised if the register_checksums_path does not exist.
    """
        if register_checksums:
            if not register_checksums_path:
                raise ValueError(
                    'When register_checksums=True, register_checksums_path should be set.'
                )
            register_checksums_path = epath.Path(register_checksums_path)
            if not register_checksums_path.exists():
                # Create the file here to make sure the user has write access
                # before starting downloads.
                register_checksums_path.touch()
            else:
                # Make sure the user has write access before downloading any files.
                # (e.g. TFDS installed by admin)
                register_checksums_path.write_text(
                    register_checksums_path.read_text())

        download_dir = epath.Path(download_dir).expanduser()
        if extract_dir:
            extract_dir = epath.Path(extract_dir).expanduser()
        else:
            extract_dir = download_dir / 'extracted'
        if manual_dir:
            manual_dir = epath.Path(manual_dir).expanduser()

        self._download_dir: epath.Path = download_dir
        self._extract_dir: epath.Path = extract_dir
        self._manual_dir: Optional[epath.Path] = manual_dir  # pytype: disable=annotation-type-mismatch  # attribute-variable-annotations
        self._manual_dir_instructions = utils.dedent(manual_dir_instructions)
        self._download_dir.mkdir(parents=True, exist_ok=True)
        self._extract_dir.mkdir(parents=True, exist_ok=True)

        self._force_download = force_download
        self._force_extraction = force_extraction
        self._force_checksums_validation = force_checksums_validation
        self._register_checksums = register_checksums
        self._register_checksums_path = register_checksums_path
        self._verify_ssl = verify_ssl
        self._dataset_name = dataset_name

        # All known URLs: {url: UrlInfo(size=, checksum=)}
        self._url_infos = checksums.get_all_url_infos()
        if url_infos is not None:
            self._url_infos.update(url_infos)

        # To record what is being used: {url: UrlInfo(size, checksum, filename)}
        self._recorded_url_infos: Dict[str, checksums.UrlInfo] = {}
        # These attributes are lazy-initialized since they must be cleared when this
        # object is pickled for Beam. They are then recreated on each worker.
        self.__downloader = None
        self.__extractor = None
        # Executor to avoid blocking other download/extractions when running
        # I/O operations (reading/renaming the downloaded file).
        # Only use a single thread as the read/ops are locked by the
        # `build_synchronize_decorator`.
        # Note: This thread is in addition to the download and extraction
        # executor threads.
        self._executor = concurrent.futures.ThreadPoolExecutor(1)
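
The manager can also be used directly, outside of a builder; a minimal sketch, with the download directory and URL as placeholders.

import tensorflow_datasets as tfds

# Placeholder download directory and URL.
dl_manager = tfds.download.DownloadManager(
    download_dir="/tmp/tfds_downloads")
# `download_and_extract` accepts a single URL, a list, or a dict of URLs and
# returns the corresponding extracted path(s).
extracted_path = dl_manager.download_and_extract(
    "https://example.com/data.zip")
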
Example #6
0
    def __init__(
            self,
            *,
            builder: Union[DatasetIdentity, Any],
            description: Optional[str] = None,
            features: Optional[feature_lib.FeatureConnector] = None,
            supervised_keys: Optional[SupervisedKeysType] = None,
            disable_shuffling: bool = False,
            homepage: Optional[str] = None,
            citation: Optional[str] = None,
            metadata: Optional[Metadata] = None,
            license: Optional[str] = None,  # pylint: disable=redefined-builtin
            redistribution_info: Optional[Dict[str, str]] = None,
            split_dict: Optional[splits_lib.SplitDict] = None):
        # pyformat: disable
        """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder` or `DatasetIdentity`. The dataset builder or
        identity will be used to populate this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict of
        the `tf.data.Dataset()` object from the `builder.as_dataset()` method.
      supervised_keys: Specifies the input structure for supervised learning, if
        applicable for the dataset, used with "as_supervised". The keys
        correspond to the feature names to select in `info.features`. When
        calling `tfds.core.DatasetBuilder.as_dataset()` with
        `as_supervised=True`, the `tf.data.Dataset` object will yield the
        structure defined by the keys passed here, instead of that defined by
        the `features` argument. Typically this is an `(input_key, target_key)`
        tuple, and the dataset yields a tuple of `(input, target)` tensors.

        To yield a more complex structure, pass a tuple of `tf.nest` compatible
        structures of feature keys. The resulting `Dataset` will yield
        structures with each key replaced by the corresponding tensor. For
        example, passing a triple of keys would return a dataset
        that yields `(feature, target, sample_weights)` triples for keras.
        Using `supervised_keys=({'a':'a','b':'b'}, 'c')` would create a dataset
        yielding a tuple with a dictionary of features in the `features`
        position.

        Note that selecting features in nested `tfds.features.FeaturesDict`
        objects is not supported.
      disable_shuffling: `bool`, whether to disable shuffling of the examples.
      homepage: `str`, optional, the homepage for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
      metadata: `tfds.core.Metadata`, additional object which will be
        stored/restored with the dataset. This allows for storing additional
        information with the dataset.
      license: license of the dataset.
      redistribution_info: information needed for redistribution, as specified
        in `dataset_info_pb2.RedistributionInfo`. The content of the `license`
        subfield will automatically be written to a LICENSE file stored with the
        dataset.
      split_dict: information about the splits in this dataset.
    """
        # pyformat: enable
        self._builder_or_identity = builder
        if isinstance(builder, DatasetIdentity):
            self._identity = builder
        else:
            self._identity = DatasetIdentity.from_builder(builder)

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=self._identity.name,
            description=utils.dedent(description),
            version=str(self._identity.version),
            release_notes=self._identity.release_notes,
            disable_shuffling=disable_shuffling,
            config_name=self._identity.config_name,
            config_description=self._identity.config_description,
            citation=utils.dedent(citation),
            module_name=self._identity.module_name,
            redistribution_info=dataset_info_pb2.RedistributionInfo(
                license=utils.dedent(license
                                     or redistribution_info.pop("license")),
                **redistribution_info) if redistribution_info else None)

        if homepage:
            self._info_proto.location.urls[:] = [homepage]

        if features:
            if not isinstance(features, top_level_feature.TopLevelFeature):
                raise ValueError(
                    "DatasetInfo.features only supports FeaturesDict or Sequence at "
                    "the top-level. Got {}".format(features))
        self._features = features
        self._splits = splits_lib.SplitDict([])
        if split_dict:
            self.set_splits(split_dict)
        if supervised_keys is not None:
            self._info_proto.supervised_keys.CopyFrom(
                _supervised_keys_to_proto(supervised_keys))

        if metadata and not isinstance(metadata, Metadata):
            raise ValueError(
                "Metadata should be a `tfds.core.Metadata` instance. Received "
                "{}".format(metadata))
        self._metadata = metadata

        # Is this object initialized with both the static and the dynamic data?
        self._fully_initialized = False
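
A minimal sketch of consuming the structured `supervised_keys` form described above; the dataset name is hypothetical and is assumed to declare `supervised_keys=({'a': 'a', 'b': 'b'}, 'c')` in its DatasetInfo.

import tensorflow_datasets as tfds

# Hypothetical dataset whose DatasetInfo declares
#   supervised_keys=({'a': 'a', 'b': 'b'}, 'c')
ds = tfds.load("my_dataset", split="train", as_supervised=True)
for features, target in ds.take(1):
  # `features` is a dict with keys 'a' and 'b'; `target` is the 'c' tensor.
  print(features["a"], features["b"], target)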