# Import paths are inferred from the patch target below; adjust to the
# project layout if the exceptions module lives elsewhere.
from unittest.mock import patch

import pytest

from datasetinsights.io.download import validate_checksum
from datasetinsights.io.exceptions import ChecksumError


def test_validate_checksum():
    expected_checksum = 123456
    wrong_checksum = 123455
    with patch("datasetinsights.io.download.compute_checksum") as mocked:
        mocked.return_value = wrong_checksum
        with pytest.raises(ChecksumError):
            validate_checksum("filepath/not/important", expected_checksum)
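# A minimal companion sketch for the happy path: when the mocked
# compute_checksum returns the expected value, validate_checksum should
# return without raising. This test is illustrative and not part of the
# original excerpt.
def test_validate_checksum_matches():
    expected_checksum = 123456
    with patch("datasetinsights.io.download.compute_checksum") as mocked:
        mocked.return_value = expected_checksum
        # Should not raise when the computed checksum equals the expected one.
        validate_checksum("filepath/not/important", expected_checksum)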
def _download_http(source_uri, dest_path, version):
    """Download dataset from a public HTTP URL.

    Args:
        source_uri (str): source url where the file should be downloaded
        dest_path (str): destination path of the file
        version (str): version of the GroceriesReal dataset, e.g. "v3"

    Raises:
        DownloadError if the file download failed
        ChecksumError if the checksum of the downloaded file does not match
    """
    try:
        logger.info("Downloading the dataset.")
        download_file(source_uri=source_uri, dest_path=dest_path)
    except DownloadError as e:
        logger.info(
            f"The request download from {source_uri} -> {dest_path} can't "
            f"be completed."
        )
        raise e

    expected_checksum = GroceriesReal.GROCERIES_REAL_DATASET_TABLES[
        version
    ].checksum
    try:
        validate_checksum(dest_path, expected_checksum)
    except ChecksumError as e:
        logger.info("Checksum mismatch. Delete the downloaded files.")
        os.remove(dest_path)
        raise e
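# compute_checksum itself is not shown in this excerpt. Below is a minimal,
# illustrative sketch of what a streaming CRC32-based helper could look like;
# the actual algorithm and signature used by datasetinsights.io.download may
# differ, so treat this as an assumption rather than the library's API.
import zlib


def compute_crc32_checksum(filepath, chunk_size=1024 * 1024):
    """Compute a CRC32 checksum of a file by reading it in chunks."""
    checksum = 0
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            # zlib.crc32 accepts the running value as its second argument.
            checksum = zlib.crc32(chunk, checksum)
    return checksum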
def download(self, source_uri, output, checksum_file=None, **kwargs):
    """Download the dataset from an HTTP or HTTPS url.

    Args:
        source_uri (str): Downloader uri that indicates where the dataset
            should be downloaded from.
        output (str): Path to the directory where the downloaded dataset
            will be stored.
        checksum_file (str): Path of the txt file that contains the checksum
            of the dataset to be downloaded. It can be an HTTP or HTTPS url
            or a local path.

    Raises:
        ChecksumError: Raised if the checksum of the downloaded dataset does
            not match the expected checksum.
    """
    dataset_path = download_file(source_uri, output)

    if checksum_file:
        logger.debug("Reading checksum from checksum file.")
        checksum = get_checksum_from_file(checksum_file)
        try:
            logger.debug("Validating checksum.")
            validate_checksum(dataset_path, int(checksum))
        except ChecksumError as e:
            logger.info("Checksum mismatch. Deleting the downloaded file.")
            os.remove(dataset_path)
            raise e
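# A hedged usage sketch for the download() method above. The class name and
# import path are assumptions based on this excerpt; only the
# download(source_uri, output, checksum_file) signature is taken from the
# code itself, and the URLs and paths are placeholders.
#
# from datasetinsights.io.downloader import HTTPDownloader  # assumed path
#
# downloader = HTTPDownloader()
# downloader.download(
#     source_uri="https://example.com/dataset.zip",
#     output="/tmp/datasets",
#     checksum_file="https://example.com/dataset_checksum.txt",
# )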
def download(data_root, version):
    """Downloads dataset zip file and unzips it.

    Args:
        data_root (str): Path where to download the dataset.
        version (str): version of the synthetic dataset, e.g. "v1"

    Raises:
        ValueError if the dataset version is not supported
        ChecksumError if the download file checksum does not match
        DownloadError if the download file failed

    Note: Synthetic dataset is downloaded and unzipped to
        data_root/synthetic.
    """
    if version not in SynDetection2D.SYNTHETIC_DATASET_TABLES.keys():
        raise ValueError(
            f"A valid dataset version is required. Available versions are: "
            f"{SynDetection2D.SYNTHETIC_DATASET_TABLES.keys()}"
        )
    source_uri = SynDetection2D.SYNTHETIC_DATASET_TABLES[version].source_uri
    expected_checksum = SynDetection2D.SYNTHETIC_DATASET_TABLES[
        version
    ].checksum
    dataset_file = SynDetection2D.SYNTHETIC_DATASET_TABLES[version].filename

    extract_folder = os.path.join(data_root, const.SYNTHETIC_SUBFOLDER)
    dataset_path = os.path.join(extract_folder, dataset_file)

    if os.path.exists(dataset_path):
        logger.info("The dataset file exists. Skip download.")
        try:
            validate_checksum(dataset_path, expected_checksum)
        except ChecksumError:
            logger.info(
                "The checksum of the previous dataset mismatches. "
                "Delete the previously downloaded dataset."
            )
            os.remove(dataset_path)

    if not os.path.exists(dataset_path):
        logger.info(f"Downloading dataset to {extract_folder}.")
        download_file(source_uri, dataset_path)
        try:
            validate_checksum(dataset_path, expected_checksum)
        except ChecksumError as e:
            logger.info("Checksum mismatch. Delete the downloaded files.")
            os.remove(dataset_path)
            raise e

    SynDetection2D.unzip_file(
        filepath=dataset_path, destination=extract_folder
    )
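# SynDetection2D.unzip_file is referenced above but not shown in this excerpt.
# A minimal sketch of what such a helper could look like, using the standard
# zipfile module; the real implementation (e.g. progress reporting or member
# validation) may differ, so this is an assumption for illustration only.
import zipfile


def unzip_file(filepath, destination):
    """Extract a zip archive at filepath into the destination directory."""
    with zipfile.ZipFile(filepath, "r") as archive:
        archive.extractall(destination)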
def _checksum(self, blob, filename):
    """Validate the checksum and delete the file if the checksum does not
    match.

    Raises:
        ChecksumError: Raised if the checksum of the downloaded file does
            not match the expected checksum.
    """
    expected_checksum = blob.md5_hash
    if expected_checksum:
        expected_checksum_hex = self._md5_hex(expected_checksum)
        try:
            validate_checksum(
                filename, expected_checksum_hex, algorithm="MD5"
            )
        except ChecksumError as e:
            logger.exception(
                "Checksum mismatch. Delete the downloaded files."
            )
            os.remove(filename)
            raise e
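# The GCS blob.md5_hash attribute is a base64-encoded MD5 digest, while
# validate_checksum above expects a hex string. A minimal sketch of what a
# conversion helper like _md5_hex could do; the actual implementation in the
# downloader may differ, so treat this as an assumption.
import base64


def md5_b64_to_hex(md5_hash_b64):
    """Convert a base64-encoded MD5 digest (as returned by GCS) to hex."""
    return base64.b64decode(md5_hash_b64).hex()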
def download(data_root, version):
    """Download dataset from a public HTTP URL.

    If the file already exists and the checksum matches, the download step
    is skipped. If the checksum does not match, the previous file is deleted
    and downloaded again. If the file does not exist, it is downloaded.

    Args:
        data_root (str): Root directory prefix of datasets
        version (str): version of GroceriesReal dataset, e.g. "v3"

    Raises:
        ValueError if the dataset version is not supported
        ChecksumError if the download file checksum does not match
        DownloadError if the download file failed
    """
    if version not in GroceriesReal.GROCERIES_REAL_DATASET_TABLES.keys():
        raise ValueError(
            f"A valid dataset version is required. Available versions are: "
            f"{GroceriesReal.GROCERIES_REAL_DATASET_TABLES.keys()}"
        )
    dest_path = os.path.join(
        data_root, GroceriesReal.LOCAL_PATH, f"{version}.zip"
    )
    expected_checksum = GroceriesReal.GROCERIES_REAL_DATASET_TABLES[
        version
    ].checksum
    extract_folder = os.path.join(data_root, GroceriesReal.LOCAL_PATH)

    if os.path.exists(dest_path):
        logger.info("The dataset file exists. Skip download.")
        try:
            validate_checksum(dest_path, expected_checksum)
        except ChecksumError:
            logger.info(
                "The checksum of the previous dataset mismatches. "
                "Delete the previously downloaded dataset."
            )
            os.remove(dest_path)

    if not os.path.exists(dest_path):
        source_uri = GroceriesReal.GROCERIES_REAL_DATASET_TABLES[
            version
        ].source_uri
        GroceriesReal._download_http(source_uri, dest_path, version)

    GroceriesReal._extract_file(dest_path, extract_folder)
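# A hedged usage sketch for GroceriesReal.download above. The import path is
# an assumption; the resulting layout follows from the code itself: the zip
# is saved as <data_root>/<GroceriesReal.LOCAL_PATH>/<version>.zip and then
# extracted into <data_root>/<GroceriesReal.LOCAL_PATH>.
#
# from datasetinsights.datasets import GroceriesReal  # assumed import path
#
# GroceriesReal.download(data_root="/tmp/data", version="v3")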