Code example #1
0
File: kaggle.py — Project: sezan92/datasets-1
 def download_file(self, fname, output_dir):
   """Downloads a single competition file into `output_dir`.

   Args:
     fname: `str`, name of one of the competition's files.
     output_dir: `str`, directory the file is downloaded (and, if the
       Kaggle API zipped it, extracted) into.

   Returns:
     `str`, path to the downloaded file inside `output_dir`.

   Raises:
     ValueError: if `fname` is not one of the competition's files.
   """
   if fname not in self.competition_files:  # pylint: disable=unsupported-membership-test
     raise ValueError("%s is not one of the competition's "
                      "files: %s" % (fname, self.competition_files))
   cmd = [
       "kaggle",
       self._kaggle_type.download_cmd,
       "download",
       "--file", fname,
       "--path", output_dir,
       self._kaggle_type.dl_flag,
       self._competition_name,
   ]
   extra = self._kaggle_type.extra_flag
   if extra:
     cmd.append(extra)
   _run_kaggle_command(cmd, self._competition_name)
   # The Kaggle API silently compresses some files into `.zip` archives;
   # unpack transparently so callers always find the plain file.
   # TODO(tfds): use --unzip once supported by kaggle
   # (https://github.com/Kaggle/kaggle-api/issues/9)
   zipped_path = os.path.join(output_dir, fname + ".zip")
   if zipfile.is_zipfile(zipped_path):
     unzipper = extractor.get_extractor()
     with unzipper.tqdm():
       unzipper.extract(zipped_path, resource.ExtractMethod.ZIP,
                        output_dir).get()
   return os.path.join(output_dir, fname)
Code example #2
0
    def __init__(self,
                 dataset_name,
                 download_dir=None,
                 extract_dir=None,
                 manual_dir=None,
                 checksums=None,
                 force_download=False,
                 force_extraction=False):
        """Download manager constructor.

    Args:
      dataset_name: `str`, name of dataset this instance will be used for.
      download_dir: `str`, path to directory where downloads are stored.
      extract_dir: `str`, path to directory where artifacts are extracted.
        Defaults to `<download_dir>/extracted`.
      manual_dir: `str`, path to manually downloaded/extracted data directory.
        May be `None` when the dataset needs no manual data.
      checksums: `dict<str url, str sha256>`, url to sha256 of resource.
        Only URLs present are checked.
      force_download: `bool`, default to False. If True, always [re]download.
      force_extraction: `bool`, default to False. If True, always [re]extract.
    """
        self._dataset_name = dataset_name
        self._checksums = checksums or {}
        self._recorded_download_checksums = {}
        self._download_sizes = {}
        # `download_dir` is effectively required: it is expanded and created
        # below.
        self._download_dir = os.path.expanduser(download_dir)
        # Bug fix: `os.path.expanduser(None)` raises TypeError, yet
        # `extract_dir` and `manual_dir` default to None. Match the other
        # DownloadManager variants: fall back to `<download_dir>/extracted`,
        # and only expand `manual_dir` when it is provided.
        self._extract_dir = os.path.expanduser(
            extract_dir or os.path.join(download_dir, 'extracted'))
        self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
        tf.gfile.MakeDirs(self._download_dir)
        tf.gfile.MakeDirs(self._extract_dir)
        self._force_download = force_download
        self._force_extraction = force_extraction
        self._extractor = extractor.get_extractor()
        self._downloader = downloader.get_downloader()
Code example #3
0
  def setUp(self):
    """Prepares an extractor and the (not-yet-existing) target paths."""
    super(ExtractorTest, self).setUp()
    self.extractor = extractor.get_extractor()
    # Directory the archive will be extracted into; it must not exist
    # before the test runs.
    self.to_path = os.path.join(self.tmp_dir, 'extracted_arch')
    self.assertFalse(tf.gfile.Exists(self.to_path))
    # Expected location of the extracted test fixture.
    self.result_path = os.path.join(self.to_path, '6pixels.png')
Code example #4
0
File: download_manager.py — Project: tom1484/datasets
    def __init__(self,
                 download_dir,
                 extract_dir=None,
                 manual_dir=None,
                 manual_dir_instructions=None,
                 dataset_name=None,
                 force_download=False,
                 force_extraction=False,
                 register_checksums=False):
        """Download manager constructor.

    Args:
      download_dir: `str`, path to directory where downloads are stored.
      extract_dir: `str`, path to directory where artifacts are extracted.
        Defaults to `<download_dir>/extracted`.
      manual_dir: `str`, path to manually downloaded/extracted data directory.
      manual_dir_instructions: `str`, human readable instructions on how to
        prepare contents of the manual_dir for this dataset.
      dataset_name: `str`, name of dataset this instance will be used for. If
        provided, downloads will contain which datasets they were used for.
      force_download: `bool`, default to False. If True, always [re]download.
      force_extraction: `bool`, default to False. If True, always [re]extract.
      register_checksums: `bool`, default to False. If True, dl checksums aren't
        checked, but stored into file.
    """
        self._dataset_name = dataset_name
        # Resolve and create the working directories.
        self._download_dir = os.path.expanduser(download_dir)
        default_extract_dir = os.path.join(download_dir, 'extracted')
        self._extract_dir = os.path.expanduser(
            extract_dir or default_extract_dir)
        self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
        self._manual_dir_instructions = manual_dir_instructions
        tf.io.gfile.makedirs(self._download_dir)
        tf.io.gfile.makedirs(self._extract_dir)
        # Behaviour flags.
        self._force_download = force_download
        self._force_extraction = force_extraction
        self._register_checksums = register_checksums
        # Shared download/extract workers.
        self._extractor = extractor.get_extractor()
        self._downloader = downloader.get_downloader()
        # All known URLs: {url: (size, checksum)}
        self._sizes_checksums = checksums.get_all_sizes_checksums()
        # To record what is being used: {url: (size, checksum)}
        self._recorded_sizes_checksums = {}
Code example #5
0
File: kaggle.py — Project: mbbessa/datasets
def _download_competition_or_dataset(competition_or_dataset: str,
                                     output_dir: str) -> None:
  """Downloads the data and extracts it if it was zipped by the kaggle api.

  Args:
    competition_or_dataset: Name of the kaggle competition/dataset.
    output_dir: Path to the dir where the data is to be downloaded.
  """
  command = [
      'kaggle',
      _get_kaggle_type(competition_or_dataset),
      'download',
      '--path',
      output_dir,
      competition_or_dataset,
  ]
  _run_command(command)
  # The Kaggle API may deliver zipped files; unpack any archive it left
  # behind so callers see the plain data.
  for entry in tf.io.gfile.listdir(output_dir):
    entry_path = os.path.join(output_dir, entry)
    if zipfile.is_zipfile(entry_path):
      unzipper = extractor.get_extractor()
      with unzipper.tqdm():
        unzipper.extract(entry_path, resource.ExtractMethod.ZIP,
                         output_dir).get()
Code example #6
0
    def __init__(self,
                 download_dir,
                 extract_dir=None,
                 manual_dir=None,
                 dataset_name=None,
                 checksums=None,
                 force_download=False,
                 force_extraction=False):
        """Download manager constructor.

    Args:
      download_dir: `str`, path to directory where downloads are stored.
      extract_dir: `str`, path to directory where artifacts are extracted.
        Defaults to `<download_dir>/extracted`.
      manual_dir: `str`, path to manually downloaded/extracted data directory.
      dataset_name: `str`, name of dataset this instance will be used for. If
        provided, downloads will contain which datasets they were used for.
      checksums: `dict<str url, str sha256>`, url to sha256 of resource.
        Only URLs present are checked.
        If empty, checksum of (already) downloaded files is computed and can
        then be retrieved using `recorded_download_checksums` property.
      force_download: `bool`, default to False. If True, always [re]download.
      force_extraction: `bool`, default to False. If True, always [re]extract.
    """
        self._dataset_name = dataset_name
        # When no checksums are supplied, record them instead of checking.
        self._checksums = checksums or {}
        self._record_checksum_size = not checksums
        self._recorded_download_checksums = {}
        self._download_sizes = {}
        # Resolve and create the working directories.
        self._download_dir = os.path.expanduser(download_dir)
        default_extract_dir = os.path.join(download_dir, 'extracted')
        self._extract_dir = os.path.expanduser(
            extract_dir or default_extract_dir)
        self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
        tf.io.gfile.makedirs(self._download_dir)
        tf.io.gfile.makedirs(self._extract_dir)
        # Behaviour flags.
        self._force_download = force_download
        self._force_extraction = force_extraction
        # Shared download/extract workers.
        self._extractor = extractor.get_extractor()
        self._downloader = downloader.get_downloader()
Code example #7
0
 def _extractor(self):
     """Returns the shared extractor, creating it lazily on first access."""
     cached = self.__extractor
     if not cached:
         cached = extractor.get_extractor()
         self.__extractor = cached
     return cached