Beispiel #1
0
    def validate(self):
        """Check that the resource at ``self.url`` is reachable and record its size.

        If the dispatcher parameters define ``http_url_format_string``, the
        URL is first rewritten through that format string (the original URL
        is substituted, URL-quoted, into the single placeholder).

        The resource is probed with HEAD and, when HEAD does not answer
        200 OK, with a streamed GET. On success ``self.size`` is set from the
        ``Content-Length`` header, or to -1 when the header is missing or not
        a valid integer. Failures are reported by assigning to
        ``self.errors``; request exceptions are not propagated.
        """
        super().validate()
        res = None
        try:
            http_cache = self.job.parameters["dispatcher"].get(
                "http_url_format_string", "")
            if http_cache:
                self.logger.info("Using caching service: '%s'", http_cache)
                try:
                    self.url = urlparse(http_cache %
                                        quote_plus(self.url.geturl()))
                except (TypeError, ValueError) as exc:
                    # TypeError: wrong number or kind of placeholders.
                    # ValueError: malformed placeholder (e.g. a trailing "%"
                    # or an unsupported conversion character), or a result
                    # that urlparse refuses to parse.
                    self.logger.error("Invalid http_url_format_string: '%s'",
                                      exc)
                    self.errors = "Invalid http_url_format_string: '%s'" % str(
                        exc)
                    return

            # Force the non-use of Accept-Encoding: gzip, this will permit to
            # know the final size. User-supplied headers may override it.
            headers = {"Accept-Encoding": ""}
            if self.params and "headers" in self.params:
                headers.update(self.params["headers"])
            self.logger.debug("Validating that %s exists", self.url.geturl())
            res = requests_retry().head(
                self.url.geturl(),
                allow_redirects=True,
                headers=headers,
                timeout=HTTP_DOWNLOAD_TIMEOUT,
            )
            if res.status_code != requests.codes.OK:
                # try using (the slower) get for services with broken redirect support
                self.logger.debug(
                    "Using GET because HEAD is not supported properly")
                res.close()
                # Like for HEAD, we need get a size, so disable gzip
                res = requests_retry().get(
                    self.url.geturl(),
                    allow_redirects=True,
                    stream=True,
                    headers=headers,
                    timeout=HTTP_DOWNLOAD_TIMEOUT,
                )
                if res.status_code != requests.codes.OK:
                    self.errors = "Resource unavailable at '%s' (%d)" % (
                        self.url.geturl(),
                        res.status_code,
                    )
                    return

            try:
                self.size = int(res.headers.get("content-length", -1))
            except ValueError:
                # A non-numeric Content-Length must not abort validation;
                # treat the size as unknown, same as a missing header.
                self.logger.debug(
                    "Ignoring invalid content-length header: '%s'",
                    res.headers.get("content-length"))
                self.size = -1
        except requests.Timeout:
            self.logger.error("Request timed out")
            self.errors = "'%s' timed out" % (self.url.geturl())
        except requests.RequestException as exc:
            self.logger.error("Resource not available")
            self.errors = "Unable to get '%s': %s" % (self.url.geturl(),
                                                      str(exc))
        finally:
            # Covers every exit path, including the early returns above.
            if res is not None:
                res.close()
Beispiel #2
0
 def reader(self):
     """Yield the content of ``self.url`` in chunks.

     Performs a streamed GET (following redirects) and yields successive
     buffers of up to ``HTTP_DOWNLOAD_CHUNK_SIZE`` bytes. Custom headers
     from ``self.params["headers"]`` are forwarded when present.

     Raises:
         InfrastructureError: if the server does not answer 200 OK, or if
             the request fails with a ``requests.RequestException`` (the
             original exception is attached as the cause).
     """
     res = None
     try:
         # FIXME: When requests 3.0 is released, use the enforce_content_length
         # parameter to raise an exception the file is not fully downloaded
         headers = None
         if self.params and "headers" in self.params:
             headers = self.params["headers"]
         res = requests_retry().get(
             self.url.geturl(),
             allow_redirects=True,
             stream=True,
             headers=headers,
             timeout=HTTP_DOWNLOAD_TIMEOUT,
         )
         if res.status_code != requests.codes.OK:
             # This is an Infrastructure error because the validate function
             # checked that the file does exist.
             raise InfrastructureError("Unable to download '%s'" %
                                       (self.url.geturl()))
         yield from res.iter_content(HTTP_DOWNLOAD_CHUNK_SIZE)
     except requests.RequestException as exc:
         # Chain explicitly so the underlying network error is kept as the
         # direct cause in tracebacks.
         raise InfrastructureError("Unable to download '%s': %s" %
                                   (self.url.geturl(), str(exc))) from exc
     finally:
         # Runs on exhaustion, on error, and when the generator is closed
         # early by the consumer.
         if res is not None:
             res.close()