def download_file(self, fname, output_dir):
  """Downloads competition file to output_dir."""
  # Reject names the competition does not publish before shelling out.
  if fname not in self.competition_files:  # pylint: disable=unsupported-membership-test
    raise ValueError("%s is not one of the competition's "
                     "files: %s" % (fname, self.competition_files))
  kaggle_type = self._kaggle_type
  command = ["kaggle", kaggle_type.download_cmd, "download",
             "--file", fname, "--path", output_dir,
             kaggle_type.dl_flag, self._competition_name]
  if kaggle_type.extra_flag:
    command += [kaggle_type.extra_flag]
  _run_kaggle_command(command, self._competition_name)
  # kaggle silently compresses some files into '.zip' archives.
  # TODO(tfds): use --unzip once supported by kaggle
  # (https://github.com/Kaggle/kaggle-api/issues/9)
  maybe_zipped = os.path.join(output_dir, fname + ".zip")
  if zipfile.is_zipfile(maybe_zipped):
    arch_extractor = extractor.get_extractor()
    with arch_extractor.tqdm():
      arch_extractor.extract(
          maybe_zipped, resource.ExtractMethod.ZIP, output_dir).get()
  return os.path.join(output_dir, fname)
def __init__(self,
             dataset_name,
             download_dir=None,
             extract_dir=None,
             manual_dir=None,
             checksums=None,
             force_download=False,
             force_extraction=False):
  """Download manager constructor.

  Args:
    dataset_name: `str`, name of dataset this instance will be used for.
    download_dir: `str`, path to directory where downloads are stored.
    extract_dir: `str`, path to directory where artifacts are extracted.
    manual_dir: `str`, path to manually downloaded/extracted data directory.
      May be `None` when the dataset needs no manually fetched data.
    checksums: `dict<str url, str sha256>`, url to sha256 of resource. Only
      URLs present are checked.
    force_download: `bool`, default to False. If True, always [re]download.
    force_extraction: `bool`, default to False. If True, always [re]extract.
  """
  self._dataset_name = dataset_name
  self._checksums = checksums or {}
  self._recorded_download_checksums = {}
  self._download_sizes = {}
  self._download_dir = os.path.expanduser(download_dir)
  self._extract_dir = os.path.expanduser(extract_dir)
  # FIX: `manual_dir` defaults to None, and `os.path.expanduser(None)`
  # raises TypeError. Only expand it when provided (matches the other
  # DownloadManager constructors in this codebase).
  self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
  tf.gfile.MakeDirs(self._download_dir)
  tf.gfile.MakeDirs(self._extract_dir)
  self._force_download = force_download
  self._force_extraction = force_extraction
  self._extractor = extractor.get_extractor()
  self._downloader = downloader.get_downloader()
def setUp(self):
  """Creates the extractor plus the (not-yet-existing) output paths."""
  super(ExtractorTest, self).setUp()
  self.extractor = extractor.get_extractor()
  # Directory into which the archive will be extracted by the test.
  extracted_dir = os.path.join(self.tmp_dir, 'extracted_arch')
  self.to_path = extracted_dir
  # The target must not pre-exist, otherwise the test proves nothing.
  self.assertFalse(tf.gfile.Exists(extracted_dir))
  self.result_path = os.path.join(extracted_dir, '6pixels.png')
def __init__(self,
             download_dir,
             extract_dir=None,
             manual_dir=None,
             manual_dir_instructions=None,
             dataset_name=None,
             force_download=False,
             force_extraction=False,
             register_checksums=False):
  """Download manager constructor.

  Args:
    download_dir: `str`, path to directory where downloads are stored.
    extract_dir: `str`, path to directory where artifacts are extracted.
    manual_dir: `str`, path to manually downloaded/extracted data directory.
    manual_dir_instructions: `str`, human readable instructions on how to
      prepare contents of the manual_dir for this dataset.
    dataset_name: `str`, name of dataset this instance will be used for.
      If provided, downloads will contain which datasets they were used for.
    force_download: `bool`, default to False. If True, always [re]download.
    force_extraction: `bool`, default to False. If True, always [re]extract.
    register_checksums: `bool`, default to False. If True, dl checksums
      aren't checked, but stored into file.
  """
  self._dataset_name = dataset_name
  self._download_dir = os.path.expanduser(download_dir)
  # Extraction defaults to an 'extracted' directory beside the downloads.
  self._extract_dir = os.path.expanduser(
      extract_dir if extract_dir else os.path.join(download_dir, 'extracted'))
  # `manual_dir` is optional, so only expand it when it was given.
  self._manual_dir = os.path.expanduser(manual_dir) if manual_dir else manual_dir
  self._manual_dir_instructions = manual_dir_instructions
  tf.io.gfile.makedirs(self._download_dir)
  tf.io.gfile.makedirs(self._extract_dir)
  self._force_download = force_download
  self._force_extraction = force_extraction
  self._extractor = extractor.get_extractor()
  self._downloader = downloader.get_downloader()
  self._register_checksums = register_checksums
  # All known URLs: {url: (size, checksum)}
  self._sizes_checksums = checksums.get_all_sizes_checksums()
  # To record what is being used: {url: (size, checksum)}
  self._recorded_sizes_checksums = {}
def _download_competition_or_dataset(competition_or_dataset: str,
                                     output_dir: str) -> None:
  """Downloads the data and extracts it if it was zipped by the kaggle api.

  Args:
    competition_or_dataset: Name of the kaggle competition/dataset.
    output_dir: Path to the dir where the data is to be downloaded.
  """
  command = [
      'kaggle',
      _get_kaggle_type(competition_or_dataset),
      'download',
      '--path',
      output_dir,
      competition_or_dataset,
  ]
  _run_command(command)
  # The kaggle CLI may deliver some files zipped; expand those in place.
  for entry in tf.io.gfile.listdir(output_dir):
    candidate = os.path.join(output_dir, entry)
    if not zipfile.is_zipfile(candidate):
      continue
    zip_extractor = extractor.get_extractor()
    with zip_extractor.tqdm():
      zip_extractor.extract(
          candidate, resource.ExtractMethod.ZIP, output_dir).get()
def __init__(self,
             download_dir,
             extract_dir=None,
             manual_dir=None,
             dataset_name=None,
             checksums=None,
             force_download=False,
             force_extraction=False):
  """Download manager constructor.

  Args:
    download_dir: `str`, path to directory where downloads are stored.
    extract_dir: `str`, path to directory where artifacts are extracted.
    manual_dir: `str`, path to manually downloaded/extracted data directory.
    dataset_name: `str`, name of dataset this instance will be used for.
      If provided, downloads will contain which datasets they were used for.
    checksums: `dict<str url, str sha256>`, url to sha256 of resource. Only
      URLs present are checked. If empty, checksum of (already) downloaded
      files is computed and can then be retrieved using
      `recorded_download_checksums` property.
    force_download: `bool`, default to False. If True, always [re]download.
    force_extraction: `bool`, default to False. If True, always [re]extract.
  """
  self._dataset_name = dataset_name
  self._checksums = checksums or {}
  # Without caller-supplied checksums, record sizes/checksums of whatever
  # gets downloaded instead of verifying them.
  self._record_checksum_size = not checksums
  self._recorded_download_checksums = {}
  self._download_sizes = {}
  self._download_dir = os.path.expanduser(download_dir)
  # Extraction defaults to an 'extracted' directory beside the downloads.
  self._extract_dir = os.path.expanduser(
      extract_dir if extract_dir else os.path.join(download_dir, 'extracted'))
  # `manual_dir` is optional, so only expand it when it was given.
  self._manual_dir = os.path.expanduser(manual_dir) if manual_dir else manual_dir
  tf.io.gfile.makedirs(self._download_dir)
  tf.io.gfile.makedirs(self._extract_dir)
  self._force_download = force_download
  self._force_extraction = force_extraction
  self._extractor = extractor.get_extractor()
  self._downloader = downloader.get_downloader()
def _extractor(self):
  """Lazily creates and caches the shared extractor instance."""
  if self.__extractor:
    return self.__extractor
  self.__extractor = extractor.get_extractor()
  return self.__extractor