def download_dataset(self, hq_files=True):
    """
    Make sure the Carvana competition data is available locally and index it.

    The expected output locations are checked first; if any of them is
    missing, every archive is downloaded, decompressed and then deleted.
    Afterwards the train/test/mask folders are recorded on the instance
    together with their sorted file listings.

    Args:
        hq_files (bool): Whether to use the "_hq" variant of the train and
            test archives
    Returns:
        list: [train_data, test_data, metadata_csv, train_masks_csv, train_masks_data]
    """
    competition_name = "carvana-image-masking-challenge"
    here = os.path.dirname(os.path.abspath(__file__))
    # The input folder sits two directory levels above this file's directory.
    destination_path = os.path.join(here, '../../input/')
    prefix = "_hq" if hq_files else ""

    archive_names = [
        "train" + prefix + ".zip",
        "test" + prefix + ".zip",
        "metadata.csv.zip",
        "train_masks.csv.zip",
        "train_masks.zip",
    ]
    datasets_path = [
        destination_path + name
        for name in (
            "train" + prefix,
            "test" + prefix,
            "metadata.csv",
            "train_masks.csv",
            "train_masks",
        )
    ]

    # Hacky but sufficient: if every expected path already exists, assume the
    # archives were downloaded and extracted by a previous run.
    missing = [path for path in datasets_path if not os.path.exists(path)]

    if missing:
        # Credentials are read from the $KAGGLE_USER and $KAGGLE_PASSWD
        # environment variables.
        downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"),
                                          os.getenv("KAGGLE_PASSWD"),
                                          competition_name)
        for archive_name in archive_names:
            archive_path = downloader.download_dataset(archive_name,
                                                       destination_path)
            downloader.decompress(archive_path, destination_path)
            os.remove(archive_path)
    else:
        print("All datasets are present.")

    (self.train_data, self.test_data, _metadata_csv, _masks_csv,
     self.train_masks_data) = datasets_path
    self.train_files = sorted(os.listdir(self.train_data))
    self.test_files = sorted(os.listdir(self.test_data))
    self.train_masks_files = sorted(os.listdir(self.train_masks_data))
    return datasets_path
def download_dataset(url: str, output_folder: str, decompress: bool):
    """
    Download an archive into ``output_folder`` and optionally decompress it.

    Nothing is downloaded (and nothing is decompressed) when a file with the
    archive's name already exists in ``output_folder``.

    Args:
        url (str): Http link to the archive
        output_folder (str): Path to save the downloaded files
        decompress (bool): To uncompress the downloaded archive
    Returns:
        tuple: (file_name, file_path) where file_name is the last path
            component of ``url`` and file_path is its location inside
            ``output_folder``
    """
    file_name = os.path.split(url)[-1]
    output_file_arch = os.path.join(output_folder, file_name)

    # Guard clause: an existing archive short-circuits the whole download.
    if os.path.exists(output_file_arch):
        print("File already exists.")
        return file_name, output_file_arch

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    print('Beginning file download...')
    with TqdmUpTo(unit='B', unit_scale=True, miniters=1,
                  desc=f"Downloading {file_name}") as t:
        archive_path, _ = urllib.request.urlretrieve(
            url, output_file_arch, reporthook=t.update_to)

    if decompress:
        # Only announce the extraction when it is actually performed.
        print("Unzipping file...")
        KaggleDataDownloader.decompress(archive_path, output_folder)
    return file_name, output_file_arch
def download_dataset(self, hq_files=True):
    """
    Make sure the RSNA bone age data is available locally and index it.

    The expected output locations are checked first; if any of them is
    missing, every archive is downloaded, decompressed and then deleted.
    Afterwards the train/test/mask folders, their sorted file listings and
    the ids derived from the file names are recorded on the instance.

    Args:
        hq_files (bool): Accepted for interface compatibility; not used by
            this implementation
    Returns:
        list: [train_data, test_data, metadata_csv, train_masks_csv, train_masks_data]
    """
    competition_name = "RSNA_Bone_Age"
    here = os.path.dirname(os.path.abspath(__file__))
    destination_path = os.path.join(here, '../../input/')

    archive_names = [
        "train.zip",
        "test.zip",
        "metadata.csv.zip",
        "train_masks.csv.zip",
        "train_masks.zip",
    ]
    datasets_path = [
        destination_path + name
        for name in (
            "boneage-training-dataset",
            "boneage-test-dataset",
            "boneage-test-dataset.csv",
            "boneage-training-dataset.csv",
            "masks",
        )
    ]

    # Hacky but sufficient: if every expected path already exists, assume the
    # archives were downloaded and extracted by a previous run.
    missing = [path for path in datasets_path if not os.path.exists(path)]

    if missing:
        # Credentials are read from the $KAGGLE_USER and $KAGGLE_PASSWD
        # environment variables.
        downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"),
                                          os.getenv("KAGGLE_PASSWD"),
                                          competition_name)
        for archive_name in archive_names:
            archive_path = downloader.download_dataset(archive_name,
                                                       destination_path)
            downloader.decompress(archive_path, destination_path)
            os.remove(archive_path)
    else:
        print("All datasets are present.")

    (self.train_data, self.test_data, _test_csv, _train_csv,
     self.train_masks_data) = datasets_path
    self.train_files = sorted(os.listdir(self.train_data))
    self.test_files = sorted(os.listdir(self.test_data))
    self.train_masks_files = sorted(os.listdir(self.train_masks_data))

    # An id is whatever precedes the first underscore of a file name;
    # duplicates are collapsed through a set.
    self.train_ids = list({name.split("_")[0] for name in self.train_files})
    self.masks_ids = list(
        {name.split("_")[0] for name in self.train_masks_files})
    self.test_ids = list({name.split("_")[0] for name in self.test_files})
    return datasets_path
def test_download_data(self):
    """
    Download and unpack one dataset archive and the labels archive of the
    "planet-understanding-the-amazon-from-space" competition into "input/".

    Credentials are read from the $KAGGLE_USER and $KAGGLE_PASSWD
    environment variables.
    """
    destination_path = "input/"
    downloader = KaggleDataDownloader(
        os.getenv("KAGGLE_USER"),
        os.getenv("KAGGLE_PASSWD"),
        "planet-understanding-the-amazon-from-space",
    )

    # The dataset is decompressed twice: once for the downloaded .tar.7z and
    # once for the .tar that the first pass presumably leaves in the
    # destination folder.
    archive_path = downloader.download_dataset("test-jpg-additional.tar.7z",
                                               destination_path)
    downloader.decompress(archive_path, destination_path)
    tar_path = destination_path + "test-jpg-additional.tar"
    downloader.decompress(tar_path, destination_path)

    labels_archive_path = downloader.download_dataset("train_v2.csv.zip",
                                                      destination_path)
    downloader.decompress(labels_archive_path, destination_path)
def download_dataset(competition_name: str, competition_files: list,
                     competition_files_ext: list, output_folder: str):
    """
    Downloads the dataset and return the input paths.
    Do not download again if the data is already present.
    You need to define $KAGGLE_USER and $KAGGLE_PASSWD in your environment
    and you must accept the competition rules beforehand.

    This downloader uses https://github.com/EKami/kaggle-data-downloader
    and assumes everything is properly installed.

    Args:
        competition_name (str): The name of the competition
        competition_files (list): List of files for the competition (in their
            uncompressed format)
        competition_files_ext (list): List of extensions for the competition
            files in the same order as competition_files. Ex: 'zip', '7z', 'xz'
        output_folder (str): Path to save the downloaded files; a trailing
            path separator is optional
    Returns:
        tuple: (file_names, files_path)
    Raises:
        AssertionError: If competition_files and competition_files_ext do not
            have the same length
    """
    # Kept as an assert so callers keep seeing the same exception type.
    assert len(competition_files) == len(competition_files_ext), \
        "Length of competition_files and competition_files_ext do not match"

    # os.path.join yields the same strings as plain concatenation when
    # output_folder already ends with a separator, and correct paths when it
    # does not.
    datasets_path = [os.path.join(output_folder, name)
                     for name in competition_files]

    if all(os.path.exists(path) for path in datasets_path):
        print("All datasets are present.")
        return competition_files, datasets_path

    # Put your Kaggle user name and password in a $KAGGLE_USER and
    # $KAGGLE_PASSWD env vars respectively
    downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"),
                                      os.getenv("KAGGLE_PASSWD"),
                                      competition_name)

    archive_names = [
        name + "." + ext
        for name, ext in zip(competition_files, competition_files_ext)
    ]
    # Download everything first, then extract and delete each archive.
    for archive_name in archive_names:
        downloader.download_dataset(archive_name, output_folder)

    for archive_name in archive_names:
        archive_path = os.path.join(output_folder, archive_name)
        downloader.decompress(archive_path, output_folder)
        os.remove(archive_path)

    return competition_files, datasets_path
def download(user_pwd, competition_name, data_file_name, directory=None,
             file_name=""):
    """
    Fetch a competition archive (unless already downloaded) and decompress it.

    Args:
        user_pwd: Indexable pair; element 0 and element 1 are passed to
            KaggleDataDownloader as the first two arguments (presumably user
            name and password)
        competition_name: The name of the competition
        data_file_name: Name of the archive to download into ``directory``
        directory: Folder receiving the archive; defaults to the current
            working directory
        file_name: Sub-folder of ``directory`` the archive is decompressed
            into; created when missing
    """
    directory = os.getcwd() if directory is None else directory

    extract_dir = os.path.join(directory, file_name)
    if not os.path.exists(extract_dir):
        os.makedirs(extract_dir)
    print(os.path.abspath(extract_dir))

    # The data cannot be downloaded without the user's credentials.
    downloader = KaggleDataDownloader(user_pwd[0], user_pwd[1],
                                      competition_name)

    archive_path = os.path.join(directory, data_file_name)
    if not os.path.exists(archive_path):
        downloader.download_dataset(data_file_name, directory)
    else:
        print("Data exists")

    # NOTE(review): decompression is treated as unconditional here, since the
    # downloader is built before the existence check - confirm against the
    # original indentation.
    downloader.decompress(archive_path, extract_dir)
test, test_u = "test-jpg.tar.7z", "test-jpg.tar" test_additional, test_additional_u = "test-jpg-additional.tar.7z", "test-jpg-additional.tar" test_labels = "train_v2.csv.zip" destination_path = "../input/" is_datasets_present = False # If the folders already exists then the files may already be extracted # This is a bit hacky but it's sufficient for our needs datasets_path = data_helper.get_jpeg_data_files_paths() for dir_path in datasets_path: if os.path.exists(dir_path): is_datasets_present = True if not is_datasets_present: # Put your Kaggle user name and password in a $KAGGLE_USER and $KAGGLE_PASSWD env vars respectively downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"), os.getenv("KAGGLE_PASSWD"), competition_name) train_output_path = downloader.download_dataset(train, destination_path) downloader.decompress(train_output_path, destination_path) # Outputs a tar file downloader.decompress(destination_path + train_u, destination_path) # Extract the content of the previous tar file os.remove(train_output_path) # Removes the 7z file os.remove(destination_path + train_u) # Removes the tar file test_output_path = downloader.download_dataset(test, destination_path) downloader.decompress(test_output_path, destination_path) # Outputs a tar file downloader.decompress(destination_path + test_u, destination_path) # Extract the content of the previous tar file os.remove(test_output_path) # Removes the 7z file os.remove(destination_path + test_u) # Removes the tar file test_add_output_path = downloader.download_dataset(test_additional, destination_path) downloader.decompress(test_add_output_path, destination_path) # Outputs a tar file