コード例 #1
0
 def _sync_file_copy(
     self,
     filepath: str,
     destination_path: str,
 ) -> checksums_lib.UrlInfo:
     """Copies `filepath` into `destination_path` and checksums the copy.

     Args:
       filepath: path of the file to copy; its basename is reused as the
         name of the copy.
       destination_path: directory in which the copy is written.

     Returns:
       A `UrlInfo` holding the checksum and size read back from the copy.
     """
     basename = os.path.basename(filepath)
     target = os.path.join(destination_path, basename)
     tf.io.gfile.copy(filepath, target)
     # The digest is computed on the copied file, not on the source.
     digest_and_size = utils.read_checksum_digest(
         target,
         checksum_cls=self._checksumer_cls,
     )
     checksum, num_bytes = digest_and_size
     return checksums_lib.UrlInfo(checksum=checksum, size=num_bytes)
コード例 #2
0
    def test_download_url_info_in_info_file_missmatch(self):
        """Tests failure when downloaded checksums and `.INFO` mismatch.

        The url is downloaded three times:
          1. A first download with the original url info.
          2. A forced re-download returning a different checksum, which is
             expected to raise a `ValueError`.
          3. A forced re-download returning the original url info again,
             which is expected to succeed.
        """

        a = Artifact('x')
        self.dl_results[a.url] = a.url_info

        # Download the url once
        dl_manager = self._get_manager(register_checksums=False)
        dl_manager.download(a.url)

        # The second time, download the url with a different checksum
        self.dl_results[a.url] = checksums_lib.UrlInfo(
            size=a.url_info.size,
            checksum=_sha256('Other content'),
            filename=a.url_info.filename,
        )
        dl_manager = self._get_manager(
            register_checksums=False,
            force_download=True,
        )
        # `assertRaisesRegexp` is a deprecated alias that was removed in
        # Python 3.12; `assertRaisesRegex` is the supported spelling.
        with self.assertRaisesRegex(ValueError,
                                    'contains a different checksum'):
            dl_manager.download(a.url)

        # If the url is re-downloaded with the same hash, no error is raised
        self.dl_results[a.url] = a.url_info
        dl_manager = self._get_manager(
            register_checksums=False,
            force_download=True,
        )
        dl_manager.download(a.url)
コード例 #3
0
    def _sync_download(self,
                       url: str,
                       destination_path: str,
                       verify: bool = True) -> checksums_lib.UrlInfo:
        """Synchronous version of `download` method.

        To download through a proxy, the `HTTP_PROXY`, `HTTPS_PROXY`,
        `REQUESTS_CA_BUNDLE`,... environment variables can be exported, as
        described in:
        https://requests.readthedocs.io/en/master/user/advanced/#proxies

        Args:
          url: url to download
          destination_path: directory in which the downloaded file is written
          verify: whether to verify ssl certificates

        Returns:
          The `checksums_lib.UrlInfo` (checksum, size and filename) of the
          downloaded file. For urls not starting with `http`, the value
          returned by `_sync_file_copy` is returned instead.

        Raises:
          DownloadError: when download fails (not raised directly in this
            method; presumably raised by `_open_url` -- confirm there).
        """
        # If url is on a filesystem that gfile understands, use copy. Otherwise,
        # use requests (http) or urllib (ftp).
        if not url.startswith('http'):
            try:
                return self._sync_file_copy(url, destination_path)
            except tf.errors.UnimplementedError:
                # gfile cannot handle this url: fall through to `_open_url`.
                pass

        with _open_url(url, verify=verify) as (response, iter_content):
            fname = _get_filename(response)
            path = os.path.join(destination_path, fname)
            size = 0

            # Initialize the download size progress bar
            # Despite its name, `size_mb` counts the bytes not yet reported
            # to the progress bar; the bar itself advances in `unit_mb` units.
            size_mb = 0
            unit_mb = units.MiB
            total_size = int(response.headers.get('Content-length',
                                                  0)) // unit_mb
            self._pbar_dl_size.update_total(total_size)
            with tf.io.gfile.GFile(path, 'wb') as file_:
                checksum = self._checksumer_cls()
                for block in iter_content:
                    size += len(block)
                    checksum.update(block)
                    file_.write(block)

                    # Update the download size progress bar
                    size_mb += len(block)
                    if size_mb > unit_mb:
                        # Report the whole units, keep the remainder.
                        self._pbar_dl_size.update(size_mb // unit_mb)
                        size_mb %= unit_mb
        self._pbar_url.update(1)
        return checksums_lib.UrlInfo(
            checksum=checksum.hexdigest(),
            size=size,
            filename=fname,
        )
コード例 #4
0
def _read_url_info(url_path: str) -> checksums.UrlInfo:
    """Loads the `UrlInfo` from the `.INFO` file.

    Args:
      url_path: path handed to `resource_lib.read_info_file` to load the
        `.INFO` content.

    Returns:
      The `UrlInfo` built from the `url_info` entry of the `.INFO` content.

    Raises:
      ValueError: if the `.INFO` content has no `url_info` entry.
    """
    file_info = resource_lib.read_info_file(url_path)
    if 'url_info' not in file_info:
        # The `{}` placeholder must be filled in, otherwise the message
        # contains a literal `{}` instead of the offending path.
        raise ValueError(
            'Could not found `url_info` in {}. This likelly indicates that '
            'the files where downloaded with a previous version of TFDS (<=3.1.0). '
            .format(url_path))
    return checksums.UrlInfo(**file_info['url_info'])
コード例 #5
0
def test_checksums(tmp_path: pathlib.Path):
  """Checks that url infos are unchanged by a save / load round trip."""
  expected = {
      url: checksums.UrlInfo(checksum='abcd', size=1234, filename=filename)
      for url, filename in (
          ('http://abc.org/data', 'a.zip'),
          ('http://edf.org/data', 'b.zip'),
      )
  }
  tsv_path = tmp_path / 'checksums.tsv'

  # Write the url infos to disk, then read them back from the same file.
  checksums.save_url_infos(tsv_path, expected)
  reloaded = checksums.load_url_infos(tsv_path)
  assert reloaded == expected
コード例 #6
0
def _read_url_info(url_path: type_utils.PathLike) -> checksums.UrlInfo:
    """Loads the `UrlInfo` from the `.INFO` file.

    Args:
      url_path: path handed to `resource_lib.read_info_file` to load the
        `.INFO` content.

    Returns:
      The `UrlInfo` built from the `url_info` entry of the `.INFO` content.
      When that entry has no `filename` key, `filename` is set to `None`.

    Raises:
      ValueError: if the `.INFO` content has no `url_info` entry.
    """
    file_info = resource_lib.read_info_file(url_path)
    if 'url_info' not in file_info:
        # The `{}` placeholder must be filled in, otherwise the message
        # contains a literal `{}` instead of the offending path.
        raise ValueError(
            'Could not found `url_info` in {}. This likelly indicates that '
            'the files where downloaded with a previous version of TFDS (<=3.1.0). '
            .format(url_path))
    url_info = file_info['url_info']
    # Entries without a `filename` key get an explicit `None`.
    url_info.setdefault('filename', None)
    return checksums.UrlInfo(**url_info)
コード例 #7
0
 def __init__(self, name, url=None):
     """Builds the url, url info and expected download paths for `name`.

     Args:
       name: name used to derive the fake content and the default url.
       url: optional url; when falsy, `http://foo-bar.ch/<name>` is used.
     """
     if not url:
         url = f'http://foo-bar.ch/{name}'
     fake_content = f'content of {name}'

     self.url = url
     self.url_info = checksums_lib.UrlInfo(
         size=len(fake_content),
         checksum=_sha256(fake_content),
     )

     # File name / path derived from the url and the content checksum.
     content_checksum = self.url_info.checksum
     self.file_name = resource_lib.get_dl_fname(url, content_checksum)
     self.file_path = f'/dl_dir/{self.file_name}'

     # File name / path derived from the url and the hash of the url itself.
     url_checksum = _sha256(url)
     self.url_name = resource_lib.get_dl_fname(url, url_checksum)
     self.url_path = f'/dl_dir/{self.url_name}'
コード例 #8
0
def test_compute_url_info():
  """Checks `compute_url_info` on the `6pixels.png` test file."""
  png_path = utils.tfds_path() / 'testing/test_data/6pixels.png'

  sha256_hex = (
      '04f38ebed34d3b027d2683193766155912fba647158c583c3bdb4597ad8af34c')
  expected = checksums.UrlInfo(
      checksum=sha256_hex,
      size=utils.Size(102),
      filename='6pixels.png',
  )
  actual = checksums.compute_url_info(png_path, checksum_cls=hashlib.sha256)
  assert actual == expected
  # The filename is compared separately (presumably because `UrlInfo`
  # equality does not cover it -- confirm against `UrlInfo`).
  assert actual.filename == expected.filename
コード例 #9
0
 def test_wrong_checksum(self):
     """Checks that a download whose checksum differs from the registered one fails."""
     artifact = Artifact('a.tar.gz')
     wrong_sha = _sha256('content of another file')
     self.dl_results[artifact.url] = artifact.url_info

     # Register the right size but the checksum of some other content.
     registered_info = checksums_lib.UrlInfo(
         size=artifact.url_info.size,
         checksum=wrong_sha,
     )
     manager = self._get_manager(
         register_checksums=False,
         url_infos={artifact.url: registered_info},
     )
     with self.assertRaises(dm.NonMatchingChecksumError):
         manager.download(artifact.url)
コード例 #10
0
 def _sync_file_copy(
     self,
     filepath: str,
     destination_path: str,
 ) -> DownloadResult:
     """Copies `filepath` into `destination_path` and checksums the copy.

     Args:
       filepath: path of the file to copy; its basename is reused as the
         name of the copy.
       destination_path: directory in which the copy is written.

     Returns:
       A `DownloadResult` with the path of the copy and a `UrlInfo` holding
       its checksum, size and filename.
     """
     basename = os.path.basename(filepath)
     target = os.path.join(destination_path, basename)
     tf.io.gfile.copy(filepath, target)

     # The digest is computed on the copied file, not on the source.
     digest_and_size = utils.read_checksum_digest(
         target,
         checksum_cls=self._checksumer_cls,
     )
     checksum, num_bytes = digest_and_size
     copied_info = checksums_lib.UrlInfo(
         checksum=checksum,
         size=num_bytes,
         filename=basename,
     )
     return DownloadResult(path=utils.as_path(target), url_info=copied_info)
コード例 #11
0
ファイル: downloader.py プロジェクト: sezan92/datasets-1
    def _sync_kaggle_download(self, kaggle_url, destination_path):
        """Downloads a file with the Kaggle API and checksums the result.

        Args:
          kaggle_url: url parsed by `kaggle.KaggleFile.from_url`.
          destination_path: destination forwarded to the Kaggle downloader.

        Returns:
          A `UrlInfo` with the checksum and size of the downloaded file.
        """
        parsed = kaggle.KaggleFile.from_url(kaggle_url)
        competition_downloader = self.kaggle_downloader(parsed.competition)
        local_path = competition_downloader.download_file(
            parsed.filename, destination_path)

        num_bytes = tf.io.gfile.stat(local_path).length
        hasher = self._checksumer_cls()
        chunk_size = io.DEFAULT_BUFFER_SIZE
        with tf.io.gfile.GFile(local_path, 'rb') as stream:
            # Feed the file to the hasher chunk by chunk until a read
            # returns nothing.
            chunk = stream.read(chunk_size)
            while chunk:
                hasher.update(chunk)
                chunk = stream.read(chunk_size)
        return checksums_lib.UrlInfo(checksum=hasher.hexdigest(),
                                     size=num_bytes)