Beispiel #1
0
    def get_collection_by_id(self, collection_id):
        """Build the STAC collection matching the given id

        :param collection_id: product type used as collection ID
        :type collection_id: str
        :returns: collection dictionary
        :rtype: dict
        :raises NotAvailableError: if no known collection has this id
        """
        # keep every collection whose id matches, only the first one is used
        matching = [
            entry
            for entry in self.__get_collection_list()
            if entry["id"] == collection_id
        ]

        try:
            found = matching[0]
        except IndexError:
            raise NotAvailableError("%s collection not found" % collection_id)

        self.update_data(found)
        return self.as_dict()
Beispiel #2
0
    def download(
        self,
        product,
        auth=None,
        progress_callback=None,
        wait=DEFAULT_DOWNLOAD_WAIT,
        timeout=DEFAULT_DOWNLOAD_TIMEOUT,
        **kwargs
    ):
        """Download a product using HTTP protocol.

        The downloaded product is assumed to be a Zip file. If it is not,
        the user is warned, it is renamed to remove the zip extension and
        no further treatment is done (no extraction)

        If the product exposes assets, they are downloaded instead of
        ``product.remote_location``. Otherwise the product is ordered first
        when it is flagged offline and has an ``orderLink``, then the download
        is tried every ``wait`` minutes until it succeeds or ``timeout``
        minutes have elapsed.

        :param product: The product to download
        :param auth: (optional) Authentication object passed to ``requests``
        :param progress_callback: (optional) Progress callback; a new one is
                                  created when omitted
        :param wait: Number of minutes between two download tries
        :param timeout: Number of minutes after which retrying is given up
        :param kwargs: Passed to ``_prepare_download``, ``_download_assets``
                       and ``_finalize``; ``dl_url_params`` (query parameters
                       of the download request) overrides the provider
                       configuration
        :returns: Path returned by ``_prepare_download`` when there is nothing
                  to download, otherwise the path of the downloaded product
        :raises AuthenticationError: if the HTTP status code is one of the
                                     provider ``auth_error_code``
        :raises NotAvailableError: if the product is not available and order is
                                   not enabled, or if the timeout is reached
        """
        fs_path, record_filename = self._prepare_download(product, **kwargs)
        if not fs_path or not record_filename:
            return fs_path

        # progress bar init
        if progress_callback is None:
            progress_callback = get_progress_callback()
        progress_callback.desc = product.properties.get("id", "")
        progress_callback.position = 1

        # download assets if exist instead of remote_location
        try:
            return self._download_assets(
                product,
                fs_path.replace(".zip", ""),
                record_filename,
                auth,
                progress_callback,
                **kwargs
            )
        except NotAvailableError:
            pass

        url = product.remote_location

        # order product if it is offline
        ordered_message = ""
        if (
            "orderLink" in product.properties
            and "storageStatus" in product.properties
            and product.properties["storageStatus"] == OFFLINE_STATUS
        ):
            order_method = getattr(self.config, "order_method", "GET")
            with requests.request(
                method=order_method,
                url=product.properties["orderLink"],
                auth=auth,
                headers=getattr(self.config, "order_headers", {}),
            ) as response:
                try:
                    response.raise_for_status()
                    ordered_message = response.text
                    logger.debug(ordered_message)
                except HTTPError as e:
                    # a failed order is not fatal: the download is still tried
                    logger.warning(
                        "%s could not be ordered, request returned %s",
                        product.properties["title"],
                        e,
                    )

        # initiate retry loop
        start_time = datetime.now()
        stop_time = datetime.now() + timedelta(minutes=timeout)
        product.next_try = start_time
        retry_count = 0
        not_available_info = "The product could not be downloaded"
        # another output for notebooks
        nb_info = NotebookWidgets()

        # resolve the request parameters once: popping them inside the loop
        # would drop user-given parameters from the second try onwards
        params = kwargs.pop("dl_url_params", None) or getattr(
            self.config, "dl_url_params", {}
        )

        # loop until products download succeeds or timeout is reached
        while True:

            if datetime.now() >= product.next_try:
                product.next_try += timedelta(minutes=wait)
                try:
                    with requests.get(
                        url,
                        stream=True,
                        auth=auth,
                        params=params,
                    ) as stream:
                        try:
                            stream.raise_for_status()
                        except HTTPError as e:
                            # check if error is identified as auth_error in provider conf
                            auth_errors = getattr(
                                self.config, "auth_error_code", [None]
                            )
                            if not isinstance(auth_errors, list):
                                auth_errors = [auth_errors]
                            if e.response.status_code in auth_errors:
                                raise AuthenticationError(
                                    "HTTP Error %s returned, %s\nPlease check your credentials for %s"
                                    % (
                                        e.response.status_code,
                                        e.response.text.strip(),
                                        self.provider,
                                    )
                                )
                            # product not available
                            elif (
                                product.properties.get("storageStatus", ONLINE_STATUS)
                                != ONLINE_STATUS
                            ):
                                # prefer the response body, fall back on the
                                # order response when the body is empty
                                msg = (
                                    ordered_message
                                    if ordered_message and not e.response.text
                                    else e.response.text
                                )
                                raise NotAvailableError(
                                    "%s(initially %s) requested, returned: %s"
                                    % (
                                        product.properties["title"],
                                        product.properties["storageStatus"],
                                        msg,
                                    )
                                )
                            else:
                                # any other HTTP error is only logged, the
                                # download is tried again on next iteration
                                import traceback as tb

                                logger.error(
                                    "Error while getting resource :\n%s",
                                    tb.format_exc(),
                                )
                        else:
                            stream_size = int(stream.headers.get("content-length", 0))
                            # an empty answer for a product that is not online
                            # means that it is not ready yet
                            if (
                                stream_size == 0
                                and "storageStatus" in product.properties
                                and product.properties["storageStatus"] != ONLINE_STATUS
                            ):
                                raise NotAvailableError(
                                    "%s(initially %s) ordered, got: %s"
                                    % (
                                        product.properties["title"],
                                        product.properties["storageStatus"],
                                        stream.reason,
                                    )
                                )
                            progress_callback.max_size = stream_size
                            progress_callback.reset()
                            with open(fs_path, "wb") as fhandle:
                                for chunk in stream.iter_content(chunk_size=64 * 1024):
                                    if chunk:
                                        fhandle.write(chunk)
                                        progress_callback(len(chunk), stream_size)

                            with open(record_filename, "w") as fh:
                                fh.write(url)
                            logger.debug("Download recorded in %s", record_filename)

                            # Check that the downloaded file is really a zip file
                            if not zipfile.is_zipfile(fs_path):
                                logger.warning(
                                    "Downloaded product is not a Zip File. Please check its file type before using it"
                                )
                                new_fs_path = fs_path[: fs_path.index(".zip")]
                                shutil.move(fs_path, new_fs_path)
                                return new_fs_path
                            return self._finalize(fs_path, **kwargs)

                except NotAvailableError as e:
                    if not getattr(self.config, "order_enabled", False):
                        raise NotAvailableError(
                            "Product is not available for download and order is not supported for %s, %s"
                            % (self.provider, e)
                        )
                    # keep the reason, it is reported when retrying or giving up
                    not_available_info = e

            if datetime.now() < product.next_try and datetime.now() < stop_time:
                # total_seconds() handles delays longer than a day, and the
                # clamp avoids a wrapped-around value if next_try has just
                # been passed
                wait_seconds = max(
                    0, int((product.next_try - datetime.now()).total_seconds())
                )
                retry_count += 1
                retry_info = (
                    "[Retry #%s] Waiting %ss until next download try (retry every %s' for %s')"
                    % (retry_count, wait_seconds, wait, timeout)
                )
                logger.debug(not_available_info)
                # Retry-After info from Response header
                # (`stream` is the response of the last download try)
                retry_server_info = stream.headers.get("Retry-After", "")
                if retry_server_info:
                    logger.debug(
                        "[%s response] Retry-After: %s",
                        self.provider,
                        retry_server_info,
                    )
                logger.info(retry_info)
                nb_info.display_html(retry_info)
                sleep(wait_seconds + 1)
            elif datetime.now() >= stop_time and timeout > 0:
                if "storageStatus" not in product.properties:
                    product.properties["storageStatus"] = "N/A status"
                logger.info(not_available_info)
                raise NotAvailableError(
                    "%s is not available (%s) and could not be downloaded, timeout reached"
                    % (product.properties["title"], product.properties["storageStatus"])
                )
            elif datetime.now() >= stop_time:
                # timeout <= 0: a single try was requested
                raise NotAvailableError(not_available_info)
Beispiel #3
0
    def _download_assets(
        self,
        product,
        fs_dir_path,
        record_filename,
        auth=None,
        progress_callback=None,
        **kwargs
    ):
        """Download product assets if they exist

        Each asset having an ``href`` is downloaded below ``fs_dir_path``,
        using the asset url (stripped from ``product.location`` and from the
        url scheme) as relative path. Assets that are already present on disk
        are not written again.

        :param product: The product whose assets are downloaded
        :param fs_dir_path: Destination directory of the product
        :param record_filename: File in which the download is recorded
        :param auth: (optional) Authentication object passed to ``requests``
        :param progress_callback: Progress callback, its attributes are set
                                  and it is called for each downloaded chunk
        :param kwargs: ``dl_url_params`` (query parameters of the download
                       requests) overrides the provider configuration
        :returns: ``fs_dir_path``
        :raises NotAvailableError: if the product has no asset with an ``href``
        :raises AuthenticationError: if the HTTP status code is one of the
                                     provider ``auth_error_code``
        :raises HTTPError: if ``fs_dir_path`` is empty after all the tries
        """
        assets_urls = [
            a["href"] for a in getattr(product, "assets", {}).values() if "href" in a
        ]

        if not assets_urls:
            raise NotAvailableError("No assets available for %s" % product)

        # remove existing incomplete file
        if os.path.isfile(fs_dir_path):
            os.remove(fs_dir_path)
        # create product dest dir
        if not os.path.isdir(fs_dir_path):
            os.makedirs(fs_dir_path)

        # product conf overrides provider conf for "flatten_top_dirs"
        product_conf = getattr(self.config, "products", {}).get(
            product.product_type, {}
        )
        flatten_top_dirs = product_conf.get(
            "flatten_top_dirs", getattr(self.config, "flatten_top_dirs", False)
        )

        # one HEAD request per asset to get the total size to download
        total_size = sum(
            int(requests.head(asset_url, auth=auth).headers.get("Content-length", 0))
            for asset_url in assets_urls
        )
        progress_callback.max_size = total_size
        progress_callback.reset()
        error_messages = set()

        # resolve the request parameters once: popping them inside the loop
        # would apply user-given parameters to the first asset only
        params = kwargs.pop("dl_url_params", None) or getattr(
            self.config, "dl_url_params", {}
        )

        for asset_url in assets_urls:

            with requests.get(
                asset_url,
                stream=True,
                auth=auth,
                params=params,
            ) as stream:
                try:
                    stream.raise_for_status()
                except HTTPError as e:
                    # check if error is identified as auth_error in provider conf
                    auth_errors = getattr(self.config, "auth_error_code", [None])
                    if not isinstance(auth_errors, list):
                        auth_errors = [auth_errors]
                    if e.response.status_code in auth_errors:
                        raise AuthenticationError(
                            "HTTP Error %s returned, %s\nPlease check your credentials for %s"
                            % (
                                e.response.status_code,
                                e.response.text.strip(),
                                self.provider,
                            )
                        )
                    else:
                        logger.warning("Unexpected error: %s", e)
                        logger.warning("Skipping %s", asset_url)
                    # kept to be reported if nothing could be downloaded
                    error_messages.add(str(e))
                else:
                    # a leading "/" would make os.path.join() ignore
                    # fs_dir_path and write outside of the product directory
                    asset_rel_path = (
                        asset_url.replace(product.location, "")
                        .replace("https://", "")
                        .replace("http://", "")
                        .lstrip("/")
                    )
                    asset_abs_path = os.path.join(fs_dir_path, asset_rel_path)
                    asset_abs_path_dir = os.path.dirname(asset_abs_path)
                    if not os.path.isdir(asset_abs_path_dir):
                        os.makedirs(asset_abs_path_dir)

                    if not os.path.isfile(asset_abs_path):
                        with open(asset_abs_path, "wb") as fhandle:
                            for chunk in stream.iter_content(chunk_size=64 * 1024):
                                if chunk:
                                    fhandle.write(chunk)
                                    progress_callback(len(chunk))

        # could not download any file
        if not os.listdir(fs_dir_path):
            raise HTTPError(", ".join(error_messages))

        # flatten directory structure: the first directory containing files
        # becomes the product directory
        if flatten_top_dirs:
            tmp_product_local_path = "%s-tmp" % fs_dir_path
            for d, dirs, files in os.walk(fs_dir_path):
                if files:
                    shutil.copytree(d, tmp_product_local_path)
                    shutil.rmtree(fs_dir_path)
                    os.rename(tmp_product_local_path, fs_dir_path)
                    break

        # save hash/record file
        with open(record_filename, "w") as fh:
            fh.write(product.remote_location)
        logger.debug("Download recorded in %s", record_filename)

        return fs_dir_path
Beispiel #4
0
    def download(self, product, auth=None, progress_callback=None, **kwargs):
        """Download data from USGS catalogues

        The product is saved as ``<title>.tar.bz`` in the configured
        ``outputs_prefix``. A record file named after the md5 hash of the url
        is written in ``<outputs_prefix>/.downloaded`` once the download is
        done, and is used to skip products that were already downloaded.

        :param product: The product to download
        :param auth: (optional) Authentication object passed to ``requests``
        :param progress_callback: (optional) Callable taking the size of the
                                  last chunk and the total size
        :param kwargs: Not used
        :returns: Path of the downloaded file, or this path without its
                  ``.tar.bz`` extension if the file was extracted. ``None``
                  if the product has no download url or if the request failed
                  with an HTTP error other than 404
        :raises NotAvailableError: if the request returned a 404 error
        """
        url = product.remote_location
        if not url:
            logger.debug(
                "Unable to get download url for %s, skipping download",
                product)
            return
        logger.info("Download url: %s", url)

        archive_extension = ".tar.bz"
        filename = product.properties["title"] + archive_extension
        local_file_path = os.path.join(self.config.outputs_prefix, filename)
        download_records = os.path.join(self.config.outputs_prefix,
                                        ".downloaded")
        if not os.path.exists(download_records):
            os.makedirs(download_records)
        url_hash = hashlib.md5(url.encode("utf-8")).hexdigest()
        record_filename = os.path.join(download_records, url_hash)
        if os.path.isfile(record_filename) and os.path.isfile(local_file_path):
            logger.info("Product already downloaded. Retrieve it at %s",
                        local_file_path)
            return local_file_path
        # Remove the record file if local_file_path is absent (e.g. it was deleted
        # while record wasn't)
        elif os.path.isfile(record_filename):
            logger.debug("Record file found (%s) but not the actual file",
                         record_filename)
            logger.debug("Removing record file : %s", record_filename)
            os.remove(record_filename)

        # NOTE(review): verify=False disables the TLS certificate check for
        # this request - confirm that it is still needed
        with requests.get(
                url,
                stream=True,
                auth=auth,
                params=getattr(self.config, "dl_url_params", {}),
                verify=False,
                hooks={
                    "response": lambda r, *args, **kwargs: print("\n", r.url)
                },
        ) as stream:
            # check the status before writing anything, so that the body of
            # an error response is never saved as the product
            try:
                stream.raise_for_status()
            except HTTPError as e:
                if e.response.status_code == 404:
                    raise NotAvailableError(
                        "%s not available, request returned: %s" %
                        (product.properties["title"], e))
                import traceback

                logger.error("Error while getting resource : %s",
                             traceback.format_exc())
                return None

            stream_size = int(stream.headers.get("content-length", 0))
            with open(local_file_path, "wb") as fhandle:
                for chunk in stream.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        fhandle.write(chunk)
                        # the callback is optional
                        if progress_callback is not None:
                            progress_callback(len(chunk), stream_size)

            with open(record_filename, "w") as fh:
                fh.write(url)
            logger.debug("Download recorded in %s", record_filename)

            if self.config.extract and zipfile.is_zipfile(local_file_path):
                logger.info("Extraction activated")
                with zipfile.ZipFile(local_file_path, "r") as zfile:
                    fileinfos = zfile.infolist()
                    with tqdm(
                            fileinfos,
                            unit="file",
                            desc="Extracting files from {}".format(
                                local_file_path),
                    ) as progressbar:
                        for fileinfo in progressbar:
                            zfile.extract(
                                fileinfo,
                                path=self.config.outputs_prefix)
                # only strip the extension that was appended above, even if
                # the same text appears earlier in the path
                return local_file_path[:-len(archive_extension)]
            return local_file_path
Beispiel #5
0
    def download(self, product, auth=None, progress_callback=None, **kwargs):
        """Download data from USGS catalogues

        Logs in through ``api`` with the configured credentials, asks for the
        download options of the product, requests the ones whose
        ``downloadSystem`` is ``dds`` and downloads the first url returned.
        The session is always closed with ``api.logout()`` once logged in.

        :param product: The product to download
        :param auth: (optional) Not used, credentials are read from the
                     plugin configuration
        :param progress_callback: (optional) Progress callback; a new one is
                                  created when omitted
        :param kwargs: Passed to ``_prepare_download`` and ``_finalize``
        :returns: Path returned by ``_prepare_download`` when there is nothing
                  to download, the path of the renamed file if it is not a tar
                  file, or the result of ``_finalize``
        :raises AuthenticationError: if the login failed
        :raises NotAvailableError: if no download url could be found
        :raises HTTPError: if the download request failed
        """

        fs_path, record_filename = self._prepare_download(
            product, outputs_extension=".tar.gz", **kwargs)
        if not fs_path or not record_filename:
            return fs_path

        # progress bar init
        if progress_callback is None:
            progress_callback = get_progress_callback()
        progress_callback.desc = product.properties.get("id", "")
        progress_callback.position = 1

        try:
            api.login(
                self.config.credentials["username"],
                self.config.credentials["password"],
                save=True,
            )
        except USGSError:
            raise AuthenticationError(
                "Please check your USGS credentials.") from None

        # from here on the session is open: log out whatever happens
        try:
            download_options = api.download_options(
                product.properties["productType"], product.properties["id"])

            try:
                product_ids = [
                    p["id"] for p in download_options["data"]
                    if p["downloadSystem"] == "dds"
                ]
            except KeyError as e:
                raise NotAvailableError("%s not found in %s's products" %
                                        (e, product.properties["id"]))

            if not product_ids:
                raise NotAvailableError("No USGS products found for %s" %
                                        product.properties["id"])

            req_urls = []
            for product_id in product_ids:
                download_request = api.download_request(
                    product.properties["productType"],
                    product.properties["id"], product_id)
                try:
                    req_urls.extend([
                        x["url"]
                        for x in download_request["data"]["preparingDownloads"]
                    ])
                except KeyError as e:
                    raise NotAvailableError(
                        "%s not found in %s download_request" %
                        (e, product.properties["id"]))

            if len(req_urls) > 1:
                logger.warning(
                    "%s usgs products found for %s. Only first will be downloaded"
                    % (len(req_urls), product.properties["id"]))
            elif not req_urls:
                raise NotAvailableError(
                    "No usgs request url was found for %s" %
                    product.properties["id"])

            req_url = req_urls[0]
            progress_callback.reset()
            with requests.get(
                    req_url,
                    stream=True,
            ) as stream:
                try:
                    stream.raise_for_status()
                except HTTPError:
                    import traceback as tb

                    logger.error(
                        "Error while getting resource :\n%s",
                        tb.format_exc(),
                    )
                    # nothing was written: do not record the product as
                    # downloaded nor inspect a file that does not exist
                    raise
                else:
                    stream_size = int(stream.headers.get("content-length", 0))
                    progress_callback.max_size = stream_size
                    progress_callback.reset()
                    with open(fs_path, "wb") as fhandle:
                        for chunk in stream.iter_content(chunk_size=64 * 1024):
                            if chunk:
                                fhandle.write(chunk)
                                progress_callback(len(chunk), stream_size)

            with open(record_filename, "w") as fh:
                fh.write(product.properties["downloadLink"])
            logger.debug("Download recorded in %s", record_filename)
        finally:
            api.logout()

        # Check that the downloaded file is really a tar file
        if not tarfile.is_tarfile(fs_path):
            logger.warning(
                "Downloaded product is not a tar File. Please check its file type before using it"
            )
            new_fs_path = fs_path[:fs_path.index(".tar.gz")]
            shutil.move(fs_path, new_fs_path)
            return new_fs_path
        return self._finalize(fs_path, outputs_extension=".tar.gz", **kwargs)
Beispiel #6
0
    def download(self, product, auth=None, progress_callback=None, **kwargs):
        """Download method for S3 REST API.

        The content of the bucket is listed first. If it holds a single node,
        the download is delegated to ``HTTPDownload``. Otherwise each node is
        downloaded in the outputs directory, and the download is recorded in
        its ``.downloaded`` sub-directory.

        :param product: The EO product to download
        :type product: :class:`~eodag.api.product.EOProduct`
        :param auth: (optional) The configuration of a plugin of type Authentication
        :type auth: :class:`~eodag.config.PluginConfig`
        :param progress_callback: (optional) A method or a callable object
                                  which takes a current size and a maximum
                                  size as inputs and handle progress bar
                                  creation and update to give the user a
                                  feedback on the download progress
        :type progress_callback: :class:`~eodag.utils.ProgressCallback` or None
        :param kwargs: ``outputs_prefix`` overrides the configured outputs
                       directory; all of them are passed to ``HTTPDownload``
                       for a single file download
        :return: The absolute path to the downloaded product in the local filesystem
        :rtype: str
        :raises NotAvailableError: if no bucket was found for an offline product
        :raises AuthenticationError: if listing the bucket failed with one of
                                     the provider ``auth_error_code``
        :raises RequestError: if listing the bucket failed with another error
        :raises DownloadError: if the bucket listing could not be parsed
        """
        # get bucket urls
        bucket_name, prefix = self.get_bucket_name_and_prefix(product)

        if (bucket_name is None and "storageStatus" in product.properties
                and product.properties["storageStatus"] == OFFLINE_STATUS):
            raise NotAvailableError(
                "%s is not available for download on %s (status = %s)" % (
                    product.properties["title"],
                    self.provider,
                    product.properties["storageStatus"],
                ))

        bucket_url = urljoin(
            product.downloader.config.base_uri.strip("/") + "/", bucket_name)
        nodes_list_url = bucket_url + "?prefix=" + prefix.strip("/")

        # get nodes/files list contained in the bucket
        logger.debug("Retrieving product content from %s", nodes_list_url)
        bucket_contents = requests.get(nodes_list_url, auth=auth)
        try:
            bucket_contents.raise_for_status()
        except requests.HTTPError as err:
            # check if error is identified as auth_error in provider conf
            auth_errors = getattr(self.config, "auth_error_code", [None])
            if not isinstance(auth_errors, list):
                auth_errors = [auth_errors]
            if err.response.status_code in auth_errors:
                raise AuthenticationError(
                    "HTTP Error %s returned, %s\nPlease check your credentials for %s"
                    % (
                        err.response.status_code,
                        err.response.text.strip(),
                        self.provider,
                    ))
            # other error
            else:
                logger.exception(
                    "Could not get content from %s (provider:%s, plugin:%s)\n%s",
                    nodes_list_url,
                    self.provider,
                    self.__class__.__name__,
                    bucket_contents.text,
                )
                raise RequestError(str(err))
        try:
            xmldoc = minidom.parseString(bucket_contents.text)
        except ExpatError as err:
            logger.exception("Could not parse xml data from %s",
                             bucket_contents)
            raise DownloadError(str(err))
        nodes_xml_list = xmldoc.getElementsByTagName("Contents")

        if len(nodes_xml_list) == 0:
            # NOTE(review): the download is still recorded below although
            # nothing is written in this case - confirm that it is intended
            logger.warning("Could not load any content from %s",
                           nodes_list_url)
        elif len(nodes_xml_list) == 1:
            # single file download
            product.remote_location = urljoin(
                bucket_url.strip("/") + "/", prefix.strip("/"))
            return HTTPDownload(self.provider, self.config).download(
                product=product,
                auth=auth,
                progress_callback=progress_callback,
                **kwargs)

        # destination product path
        # ("ouputs_prefix" is a former misspelling of the key, still accepted)
        outputs_prefix = (kwargs.pop("outputs_prefix", None)
                          or kwargs.pop("ouputs_prefix", None)
                          or self.config.outputs_prefix)
        abs_outputs_prefix = os.path.abspath(outputs_prefix)
        product_local_path = os.path.join(abs_outputs_prefix,
                                          prefix.split("/")[-1])

        # .downloaded cache record directory
        download_records_dir = os.path.join(abs_outputs_prefix, ".downloaded")
        try:
            os.makedirs(download_records_dir)
        except OSError as exc:
            import errno

            if exc.errno != errno.EEXIST:  # Skip error if dir exists
                import traceback as tb

                logger.warning("Unable to create records directory. Got:\n%s",
                               tb.format_exc())
        # check if product has already been downloaded
        url_hash = hashlib.md5(
            product.remote_location.encode("utf-8")).hexdigest()
        record_filename = os.path.join(download_records_dir, url_hash)
        if os.path.isfile(record_filename) and os.path.exists(
                product_local_path):
            return product_local_path
        # Remove the record file if product_local_path is absent (e.g. it was deleted while record wasn't)
        elif os.path.isfile(record_filename):
            logger.debug("Record file found (%s) but not the actual file",
                         record_filename)
            logger.debug("Removing record file : %s", record_filename)
            os.remove(record_filename)

        # total size for progress_callback
        total_size = sum(
            int(node.firstChild.nodeValue)
            for node in xmldoc.getElementsByTagName("Size"))

        # download each node key
        for node_xml in nodes_xml_list:
            node_key = node_xml.getElementsByTagName(
                "Key")[0].firstChild.nodeValue
            # As "Key", "Size" and "ETag" (md5 hash) can also be retrieved from node_xml
            node_url = urljoin(
                bucket_url.strip("/") + "/", node_key.strip("/"))
            # output file location, in the same directory as product_local_path
            # (presumably the first 6 parts of the key are the path leading to
            # the product in the bucket - TODO confirm)
            local_filename = os.path.join(abs_outputs_prefix,
                                          "/".join(node_key.split("/")[6:]))
            local_filename_dir = os.path.dirname(
                os.path.realpath(local_filename))
            if not os.path.isdir(local_filename_dir):
                os.makedirs(local_filename_dir)

            with requests.get(node_url, stream=True, auth=auth) as stream:
                try:
                    stream.raise_for_status()
                except HTTPError:
                    # a failing node is logged and skipped
                    import traceback as tb

                    logger.error("Error while getting resource :\n%s",
                                 tb.format_exc())
                else:
                    with open(local_filename, "wb") as fhandle:
                        for chunk in stream.iter_content(chunk_size=64 * 1024):
                            if chunk:
                                fhandle.write(chunk)
                                # the callback is optional
                                if progress_callback is not None:
                                    progress_callback(len(chunk), total_size)

            # TODO: check md5 hash ?

        with open(record_filename, "w") as fh:
            fh.write(product.remote_location)
        logger.debug("Download recorded in %s", record_filename)

        return product_local_path