def test_validate_checksum():
    """Check that validate_checksum raises ChecksumError on a mismatch.

    compute_checksum is patched so that it reports a value different from
    the expected checksum; the file path is never actually read.
    """
    target = "datasetinsights.io.download.compute_checksum"
    expected, computed = 123456, 123455
    # The patched compute_checksum returns a value one off from the expected.
    with patch(target, return_value=computed), pytest.raises(ChecksumError):
        validate_checksum("filepath/not/important", expected)
    def _download_http(source_uri, dest_path, version):
        """Fetch the dataset file over HTTP and verify its checksum.

        Args:
            source_uri (str): URL the file is downloaded from.
            dest_path (str): local path the downloaded file is written to.
            version (str): key into
                ``GroceriesReal.GROCERIES_REAL_DATASET_TABLES`` used to look
                up the expected checksum of the downloaded file.

        Raises:
            DownloadError: re-raised when ``download_file`` fails.
            ChecksumError: re-raised when the downloaded file does not match
                the expected checksum; the file is deleted before raising.
        """
        logger.info("Downloading the dataset.")
        try:
            download_file(source_uri=source_uri, dest_path=dest_path)
        except DownloadError as err:
            logger.info(
                f"The request download from {source_uri} -> {dest_path} "
                f"can't be completed."
            )
            raise err

        # The checksum is only looked up once the download has succeeded.
        table_entry = GroceriesReal.GROCERIES_REAL_DATASET_TABLES[version]
        wanted_checksum = table_entry.checksum
        try:
            validate_checksum(dest_path, wanted_checksum)
        except ChecksumError as err:
            # Do not leave a corrupt file behind for later runs to pick up.
            logger.info("Checksum mismatch. Delete the downloaded files.")
            os.remove(dest_path)
            raise err
Example #3
0
    def download(self, source_uri, output, checksum_file=None, **kwargs):
        """Download a dataset from an HTTP or HTTPS URL.

        When ``checksum_file`` is given, the downloaded file is validated
        against the checksum read from it and removed again on a mismatch.

        Args:
            source_uri (str): downloader URI that indicates where the dataset
                should be downloaded from.
            output (str): path to the directory where the download will
                store the dataset.
            checksum_file (str): path of the txt file that contains the
                checksum of the dataset to be downloaded. It can be an HTTP
                or HTTPS URL or a local path. No validation is done when it
                is omitted or empty.
            **kwargs: accepted but not used by this method.

        Raises:
            ChecksumError: if the checksum of the downloaded file does not
                match the one read from ``checksum_file``.
        """
        downloaded_path = download_file(source_uri, output)

        # Nothing to verify without a checksum source.
        if not checksum_file:
            return

        logger.debug("Reading checksum from checksum file.")
        raw_checksum = get_checksum_from_file(checksum_file)
        try:
            logger.debug("Validating checksum!!")
            validate_checksum(downloaded_path, int(raw_checksum))
        except ChecksumError as err:
            logger.info("Checksum mismatch. Deleting the downloaded file.")
            os.remove(downloaded_path)
            raise err
    def download(data_root, version):
        """Download the synthetic dataset zip file and unzip it.

        A file already present at the target path is kept when its checksum
        matches; otherwise it is deleted and downloaded again. The file is
        unzipped in every case.

        Args:
            data_root (str): path where to download the dataset. The file is
                stored and unzipped under
                ``data_root/<const.SYNTHETIC_SUBFOLDER>``.
            version (str): dataset version; must be a key of
                ``SynDetection2D.SYNTHETIC_DATASET_TABLES``.

        Raises:
            ValueError: if the dataset version is not supported.
            ChecksumError: if the checksum of the freshly downloaded file
                does not match; the file is deleted before raising.
            DownloadError: if downloading the file failed.
        """
        tables = SynDetection2D.SYNTHETIC_DATASET_TABLES
        if version not in tables.keys():
            raise ValueError(
                f"A valid dataset version is required. Available versions are:"
                f"{tables.keys()}"
            )

        entry = tables[version]
        source_uri = entry.source_uri
        wanted_checksum = entry.checksum
        archive_name = entry.filename

        extract_folder = os.path.join(data_root, const.SYNTHETIC_SUBFOLDER)
        archive_path = os.path.join(extract_folder, archive_name)

        # Step 1: an existing file is only reused if it is still valid.
        if os.path.exists(archive_path):
            logger.info("The dataset file exists. Skip download.")
            try:
                validate_checksum(archive_path, wanted_checksum)
            except ChecksumError:
                logger.info(
                    "The checksum of the previous dataset mismatches. "
                    "Delete the previously downloaded dataset."
                )
                os.remove(archive_path)

        # Step 2: download when the file is missing or was just discarded.
        if not os.path.exists(archive_path):
            logger.info(f"Downloading dataset to {extract_folder}.")
            download_file(source_uri, archive_path)
            try:
                validate_checksum(archive_path, wanted_checksum)
            except ChecksumError as err:
                logger.info("Checksum mismatch. Delete the downloaded files.")
                os.remove(archive_path)
                raise err

        SynDetection2D.unzip_file(
            filepath=archive_path, destination=extract_folder
        )
Example #5
0
    def _checksum(self, blob, filename):
        """Validate the MD5 checksum of a file against ``blob.md5_hash``.

        Nothing is checked when ``blob.md5_hash`` is falsy. On a mismatch the
        file is deleted before the error is re-raised.

        Args:
            blob: object exposing an ``md5_hash`` attribute; the value is
                converted with ``self._md5_hex`` before comparison.
            filename: path of the file to validate.

        Raises:
            ChecksumError: if the checksum of ``filename`` does not match.
        """
        blob_md5 = blob.md5_hash
        if not blob_md5:
            return

        blob_md5_hex = self._md5_hex(blob_md5)
        try:
            validate_checksum(filename, blob_md5_hex, algorithm="MD5")
        except ChecksumError as err:
            logger.exception("Checksum mismatch. Delete the downloaded files.")
            os.remove(filename)
            raise err
    def download(data_root, version):
        """Download the GroceriesReal dataset from a public HTTP URL.

        An existing file whose checksum matches is reused and the download
        step is skipped. An existing file whose checksum does not match is
        deleted and downloaded again. A missing file is downloaded. The file
        is extracted in every case.

        Args:
            data_root (str): root directory prefix of datasets.
            version (str): version of GroceriesReal dataset, e.g. "v3".

        Raises:
            ValueError: if the dataset version is not supported.
            ChecksumError: if the download file checksum does not match.
            DownloadError: if the download file failed.
        """
        tables = GroceriesReal.GROCERIES_REAL_DATASET_TABLES
        if version not in tables.keys():
            raise ValueError(
                f"A valid dataset version is required. Available versions are:"
                f"{tables.keys()}"
            )

        zip_path = os.path.join(
            data_root, GroceriesReal.LOCAL_PATH, f"{version}.zip"
        )
        wanted_checksum = tables[version].checksum
        extract_folder = os.path.join(data_root, GroceriesReal.LOCAL_PATH)

        # A file from an earlier run is only kept if it still validates.
        if os.path.exists(zip_path):
            logger.info("The dataset file exists. Skip download.")
            try:
                validate_checksum(zip_path, wanted_checksum)
            except ChecksumError:
                logger.info(
                    "The checksum of the previous dataset mismatches. "
                    "Delete the previously downloaded dataset."
                )
                os.remove(zip_path)

        # Download when the file is missing or was just discarded.
        if not os.path.exists(zip_path):
            source_uri = tables[version].source_uri
            GroceriesReal._download_http(source_uri, zip_path, version)

        GroceriesReal._extract_file(zip_path, extract_folder)