def getExternalFile(self, url, output_path, headers=None):
    """Stream the content at ``url`` into the local file ``output_path``.

    The request is issued with the session returned by
    ``self.getExternalSession`` for the URL's network location, and the
    response body is written to disk chunk by chunk.

    :param url: URL to fetch with an HTTP GET.
    :param output_path: local file path to write; when falsy, no request is
        made and ``None`` is returned.
    :param headers: optional mapping of extra request headers. It is merged
        with ``self.HEADERS`` (entries from ``self.HEADERS`` win on
        conflict). The mapping passed in is left unmodified.
    :return: tuple ``(output_path, response)`` on success, or ``None`` when
        ``output_path`` is falsy.
    :raises RuntimeError: when the response status code is not 200.
    """
    host = urlsplit(url).netloc
    if not output_path:
        return None

    if headers:
        # Merge into a copy so the caller's dictionary is never mutated.
        request_headers = dict(headers)
        request_headers.update(self.HEADERS)
    else:
        request_headers = self.HEADERS

    session = self.getExternalSession(host)
    r = session.get(url, headers=request_headers, stream=True, verify=certifi.where())
    if r.status_code != 200:
        file_error = "File transfer failed: [%s]" % output_path
        url_error = 'HTTP GET Failed for url: %s' % url
        host_error = "Host %s responded:\n\n%s" % (host, r.text)
        raise RuntimeError('%s\n\n%s\n%s' % (file_error, url_error, host_error))

    total = 0
    start = datetime.datetime.now()
    logging.debug("Transferring file %s to %s" % (url, output_path))
    with open(output_path, 'wb') as data_file:
        for chunk in r.iter_content(chunk_size=DEFAULT_CHUNK_SIZE):
            data_file.write(chunk)
            total += len(chunk)
    elapsed = datetime.datetime.now() - start
    summary = get_transfer_summary(total, elapsed)
    logging.info("File [%s] transfer successful. %s" % (output_path, summary))
    return output_path, r
def getExternalUrl(self, url):
    """Expand ``url`` into a fully qualified URL where a server can be inferred.

    * A URL whose path does not start with ``self.store_base`` and that lacks
      a scheme or network location is prefixed with the scheme and network
      location of ``self.catalog.get_server_uri()``.
    * A path-only URL that starts with ``self.store_base`` is prefixed with
      ``self.store.get_server_uri()``.
    * Every other URL is returned unchanged.

    :param url: path-only or fully qualified URL string.
    :return: the resulting URL string.
    """
    parsed = urlsplit(url)

    if not parsed.path.startswith(self.store_base):
        if parsed.scheme and parsed.netloc:
            # Already fully qualified and not an object-store path.
            return url
        catalog_parts = urlsplit(self.catalog.get_server_uri())
        catalog_origin = catalog_parts.scheme + "://" + catalog_parts.netloc
        return ''.join([catalog_origin, url])

    # The path lies under the object-store base.
    is_bare_path = url.startswith(self.store_base)
    origin = parsed.scheme + "://" + parsed.netloc
    if origin == self.store.get_server_uri() or is_bare_path:
        if is_bare_path:
            url = ''.join([self.store.get_server_uri(), url])
    return url
def getHatracStore(self, url):
    """Return the object store that should service ``url``, if any.

    :param url: path-only or fully qualified URL string.
    :return: ``self.store`` when the URL path starts with ``self.store_base``
        and the URL is either path-only or addressed to the server reported
        by ``self.store.get_server_uri()``; ``None`` when the URL path does
        not start with ``self.store_base``.
    :raises DerivaDownloadConfigurationError: when the URL path starts with
        ``self.store_base`` but the URL is addressed to a different server.
    """
    urlparts = urlsplit(url)
    if not urlparts.path.startswith(self.store_base):
        return None
    if url.startswith(self.store_base):
        # Path-only reference: implicitly the configured store.
        return self.store

    server_uri = urlparts.scheme + "://" + urlparts.netloc
    expected_uri = self.store.get_server_uri()
    if server_uri == expected_uri:
        return self.store

    # do we need to deal with the possibility of a fully qualified URL referencing a different hatrac host?
    # The expected URI must be the *result* of get_server_uri(); formatting the
    # bound method itself would put its repr into the message.
    raise DerivaDownloadConfigurationError(
        "Got a reference to a Hatrac server [%s] that is different from the expected Hatrac server: %s" % (
            server_uri, expected_uri))
def headForHeaders(self, url, raise_for_status=False):
    """Issue an HTTP HEAD request for ``url`` and return the response headers.

    The request goes through the object returned by ``self.getHatracStore``
    when that is truthy; otherwise through the session returned by
    ``self.getExternalSession`` for the URL's hostname.

    :param url: URL to query.
    :param raise_for_status: when true, ``raise_for_status()`` is called on
        the response before its headers are returned.
    :return: the ``headers`` attribute of the response.
    """
    client = self.getHatracStore(url)
    if not client:
        # Not an object-store URL: fall back to a per-host external session.
        client = self.getExternalSession(urlsplit(url).hostname)

    response = client.head(url, headers=self.HEADERS)
    if raise_for_status:
        response.raise_for_status()
    return response.headers
def process(self):
    """Resolve the ``target_url`` parameter and look up credentials for its host.

    Reads ``target_url`` from ``self.parameters``, expands it with
    ``self.envars`` via ``str.format`` when ``self.envars`` is truthy, strips
    surrounding spaces, and splits it (default scheme ``https``). The
    lower-cased scheme, the network location and the path with leading and
    trailing slashes removed are stored on ``self.scheme``, ``self.netloc``
    and ``self.path``. ``self.credentials`` is set to the result of
    ``get_credential`` for the scheme-plus-netloc URL, or to an empty dict
    when that result is falsy.

    :return: ``self.outputs``.
    :raises DerivaDownloadConfigurationError: when ``target_url`` is missing
        or empty.
    """
    param_name = "target_url"
    raw_target = self.parameters.get(param_name)
    if not raw_target:
        raise DerivaDownloadConfigurationError(
            "%s is missing required parameter '%s' from %s" %
            (self.__class__.__name__, param_name, PROCESSOR_PARAMS_KEY))

    expanded = raw_target.format(**self.envars) if self.envars else raw_target
    parts = urlsplit(expanded.strip(" "), "https")

    self.scheme = parts.scheme.lower()
    self.netloc = parts.netloc
    self.path = parts.path.strip("/")

    # Credentials are looked up by scheme + network location only.
    host = urlunsplit((self.scheme, parts.netloc, "", "", ""))
    found = get_credential(host)
    if not found:
        logging.info("Unable to locate credential entry for: %s" % host)
    self.credentials = found if found else dict()
    return self.outputs