def test_request_extra_zero_values():
    """
    Test that zero-value ``retries`` and ``backoff`` are not ignored.
    """
    def fake_request(*_, **__):
        fake_resp = Response()
        fake_resp.status_code = HTTPNotFound.code
        return fake_resp

    with mock.patch("requests.Session.request", side_effect=fake_request) as req_mock:
        resp = request_extra("get", "http://whatever", retries=0, allowed_codes=[HTTPOk.code])
        assert resp.status_code == HTTPGatewayTimeout.code, "failing request with no retry should produce timeout"
        assert req_mock.call_count == 1

    sleep_tracker = {"called_count": 0, "called_with": []}

    def fake_sleep(delay):
        sleep_tracker["called_count"] += 1
        sleep_tracker["called_with"].append(delay)

    with mock.patch("weaver.utils.get_settings", return_value={"cache.requests.enable": "false"}):
        with mock.patch("requests.Session.request", side_effect=fake_request) as req_mock:
            with mock.patch("weaver.utils.time.sleep", side_effect=fake_sleep):
                # if backoff is not correctly handled as explicit zero, the default backoff value would be used
                # to calculate the delay between requests which should increase with backoff formula and retry count
                resp = request_extra("get", "http://whatever", backoff=0, retries=3, allowed_codes=[HTTPOk.code])
                assert resp.status_code == HTTPGatewayTimeout.code
                assert req_mock.call_count == 4  # first called directly, then 3 times for each retry
                # since backoff factor multiplies all incrementally increasing delays between requests,
                # proper detection of input backoff=0 makes all sleep calls equal to zero
                assert all(delay == 0 for delay in sleep_tracker["called_with"])
                assert sleep_tracker["called_count"] == 3  # first direct call doesn't have any sleep from retry
def get_processes(request):
    """
    List registered processes (GetCapabilities). Optionally list both local and provider processes.

    :param request: HTTP request; supports the ``detail`` and ``providers`` query parameters.
    :returns: :class:`HTTPOk` response with the process listing (and providers in EMS configuration).
    :raises HTTPServiceUnavailable: if previously deployed processes fail schema validation.
    :raises HTTPBadRequest: if a submitted schema is invalid.
    """
    # 'detail=false' returns only the process IDs instead of the full summaries
    detail = asbool(request.params.get("detail", True))
    try:
        # get local processes and filter according to schema validity
        # (previously deployed process schemas can become invalid because of modified schema definitions
        processes, invalid_processes = get_processes_filtered_by_valid_schemas(request)
        if invalid_processes:
            raise HTTPServiceUnavailable(
                "Previously deployed processes are causing invalid schema integrity errors. "
                "Manual cleanup of following processes is required: {}".format(invalid_processes))
        response_body = {"processes": processes if detail else [get_any_id(p) for p in processes]}

        # if 'EMS' and '?providers=True', also fetch each provider's processes
        settings = get_settings(request)
        if get_weaver_configuration(settings) == WEAVER_CONFIGURATION_EMS:
            queries = parse_request_query(request)
            if "providers" in queries and asbool(queries["providers"][0]) is True:
                prov_url = "{host}/providers".format(host=request.host_url)
                providers_response = request_extra("GET", prov_url, settings=settings,
                                                   headers=request.headers, cookies=request.cookies)
                providers = providers_response.json()
                response_body.update({"providers": providers})
                # for each provider, fetch its own process listing and embed it in the combined response
                for i, provider in enumerate(providers):
                    provider_id = get_any_id(provider)
                    proc_url = "{host}/providers/{prov}/processes".format(host=request.host_url, prov=provider_id)
                    response = request_extra("GET", proc_url, settings=settings,
                                             headers=request.headers, cookies=request.cookies)
                    processes = response.json().get("processes", [])
                    response_body["providers"][i].update({
                        "processes": processes if detail else [get_any_id(p) for p in processes]
                    })
        return HTTPOk(json=response_body)
    except colander.Invalid as ex:
        raise HTTPBadRequest("Invalid schema: [{!s}]".format(ex))
def _request_extra_various(_mime_type):
    """
    Attempts multiple request-retry variants to be as permissive as possible to sporadic/temporary failures.
    """
    _mime_type_url = "{}{}".format(IANA_NAMESPACE_DEFINITION[IANA_NAMESPACE], _mime_type)

    # first attempt: the retrying request helper, tolerating both found/not-found status codes
    try:
        response = request_extra("get", _mime_type_url, retries=3,
                                 allowed_codes=[HTTPOk.code, HTTPNotFound.code])
        if response.status_code == HTTPOk.code:
            return _make_if_ref(IANA_NAMESPACE_DEFINITION, IANA_NAMESPACE, _mime_type)
    except ConnectionError:
        pass

    # fallback attempt: plain urllib call in case the session-based helper failed
    try:
        response = urlopen(_mime_type_url)  # nosec: B310 # is hardcoded HTTP(S)
        if response.code == HTTPOk.code:
            return _make_if_ref(IANA_NAMESPACE_DEFINITION, IANA_NAMESPACE, _mime_type)
    except HTTPError:
        pass
    return None
def test_request_extra_intervals():
    """
    Verifies that ``intervals`` are used for calling the retry operations instead of ``backoff``/``retries``.
    """
    def fake_request(*_, **__):
        fake_resp = Response()
        fake_resp.status_code = HTTPNotFound.code
        return fake_resp

    sleep_tracker = {"called_count": 0, "called_with": []}

    def fake_sleep(delay):
        # only record the unrealistic interval values injected by this test (ignore unrelated sleep calls)
        if delay > 1e5:
            sleep_tracker["called_count"] += 1
            sleep_tracker["called_with"].append(delay)

    with mock.patch("weaver.utils.get_settings", return_value={"cache.requests.enable": "false"}):
        with mock.patch("requests.Session.request", side_effect=fake_request) as req_mock:
            with mock.patch("weaver.utils.time.sleep", side_effect=fake_sleep):
                intervals = [1e6, 3e6, 5e6]  # random values that shouldn't normally be used with sleep() (too big)
                # values will not match if backoff/retries are not automatically corrected by internals parameter
                resp = request_extra("get", "http://whatever", only_server_errors=False, intervals=intervals,
                                     backoff=1000, retries=10)  # backoff/retries must be ignored here
                assert resp.status_code == HTTPGatewayTimeout.code
                assert req_mock.call_count == 4  # first called directly, then 3 times, one for each interval
                # WARNING:
                #   cannot safely use mock counter since everything can increase it
                #   notably debugger/breakpoints that uses more calls to sleep()
                #   instead use our custom counter that employs unrealistic values
                assert sleep_tracker["called_count"] == 3  # first direct call doesn't have any sleep interval
                assert all(called == expect for called, expect in zip(sleep_tracker["called_with"], intervals))
def capabilities(self, url=None):
    # type: (Optional[str]) -> OperationResult
    """
    List all available :term:`Process` on the instance.

    .. seealso::
        :ref:`proc_op_getcap`

    :param url: Instance URL if not already provided during client creation.
    """
    endpoint = self._get_url(url)
    proc_url = f"{endpoint}/processes"
    params = {"detail": False}  # not supported by non-Weaver, but save the work if possible
    response = request_extra("GET", proc_url, params=params, headers=self._headers, settings=self._settings)
    result = self._parse_result(response)
    listing = result.body.get("processes")
    # reduce full summaries to their identifiers when the server returned detailed entries anyway
    if isinstance(listing, list) and all(isinstance(proc, dict) for proc in listing):
        result.body = [get_any_id(proc) for proc in listing]
    return result
def _query_features_paginated(self, params):
    # type: (Dict) -> Iterable[Dict, str]
    """
    Iterate over all features matching the search query, following pagination until exhausted.

    :param params: query parameters
    :returns: generator of ``(feature, response_url)`` tuples.
    """
    start_index = 1  # OpenSearch 'startRecord' indexing begins at 1
    maximum_records = params.get("maximumRecords")
    template_url = self.get_template_url()
    base_url, query_params = self._prepare_query_url(template_url, params)
    while True:
        query_params["startRecord"] = start_index
        response = request_extra("get", base_url, params=query_params,
                                 intervals=list(range(1, 5)),  # retry delays (seconds) between attempts
                                 allowed_codes=[HTTPOk.code],
                                 settings=self.settings)
        if response.status_code != 200:
            break  # best-effort: stop paging on any failed request rather than raising
        json_body = response.json()
        features = json_body.get("features", [])
        for feature in features:
            yield feature, response.url
        n_received_features = len(features)
        n_received_so_far = start_index + n_received_features - 1  # index starts at 1
        # NOTE(review): assumes 'totalResults' is always present in the body — KeyError otherwise; confirm with API
        total_results = json_body["totalResults"]
        if not n_received_features:
            break
        if n_received_so_far >= total_results:
            break
        if maximum_records and n_received_so_far >= maximum_records:
            break
        start_index += n_received_features
def make_request(
    self,
    method,        # type: str
    url,           # type: str
    retry=False,   # type: Union[bool, int]
    cookies=None,  # type: Optional[AnyCookiesContainer]
    headers=None,  # type: Optional[AnyHeadersContainer]
    **kwargs,      # type: Any
):  # type: (...) -> AnyResponseType
    """
    Sends the request with additional parameter handling for the current process definition.
    """
    # boolean 'retry' maps onto a single retry; an integer gives the explicit retry count
    request_retries = 0 if retry is None else int(retry)
    request_cookies = CaseInsensitiveDict(cookies or {})
    request_cookies.update(self.get_auth_cookies())
    request_headers = CaseInsensitiveDict(headers or {})
    request_headers.update(self.headers.copy())
    request_headers.update(self.get_auth_headers())
    return request_extra(method, url=url, settings=self.settings, retries=request_retries,
                         headers=request_headers, cookies=request_cookies, **kwargs)
def _request_extra_various(_mime_type):
    """
    Attempts multiple request-retry variants to be as permissive as possible to sporadic/temporary failures.
    """
    _mime_type_url = "{}{}".format(IANA_NAMESPACE_DEFINITION[IANA_NAMESPACE], _mime_type)

    # first attempt: lightweight HEAD through the retrying helper with a short timeout
    try:
        response = request_extra("head", _mime_type_url, retries=3, timeout=0.5, allow_redirects=True,
                                 allowed_codes=[HTTPOk.code, HTTPNotFound.code])
        if response.status_code == HTTPOk.code:
            return _make_if_ref(IANA_NAMESPACE_DEFINITION, IANA_NAMESPACE, _mime_type)
    except ConnectionError as exc:
        LOGGER.debug("Format request [%s] connection error: [%s]", _mime_type_url, exc)

    # fallback attempt: direct urllib call in case the session-based helper failed
    try:
        response = urlopen(_mime_type_url, timeout=1)  # nosec: B310 # is hardcoded HTTP(S)
        if response.code == HTTPOk.code:
            return _make_if_ref(IANA_NAMESPACE_DEFINITION, IANA_NAMESPACE, _mime_type)
    except HTTPError:
        pass
    return None
def check_wps_status(
    location=None,  # type: Optional[str]
    response=None,  # type: Optional[XML]
    sleep_secs=2,   # type: int
    verify=True,    # type: bool
    settings=None,  # type: Optional[AnySettingsContainer]
):  # type: (...) -> WPSExecution
    """
    Run :func:`owslib.wps.WPSExecution.checkStatus` with additional exception handling.

    :param location: job URL or file path where to look for job status.
    :param response: WPS response document of job status.
    :param sleep_secs: number of seconds to sleep before returning control to the caller.
    :param verify: Flag to enable SSL verification.
    :param settings: Application settings to retrieve any additional request parameters as applicable.
    :return: OWSLib.wps.WPSExecution object.
    """
    def _retry_file():
        # fallback resolution of the XML status using the local file mapped from the status location
        LOGGER.warning("Failed retrieving WPS status-location, attempting with local file.")
        out_path = get_wps_local_status_location(location, settings)
        if not out_path:
            raise HTTPNotFound("Could not find file resource from [{}].".format(location))
        LOGGER.info("Resolved WPS status-location using local file reference.")
        # FIX: use a context manager to avoid leaking the file handle (original 'open(...).read()' never closed it)
        with open(out_path, mode="r", encoding="utf-8") as status_file:
            return status_file.read()

    execution = WPSExecution()
    if response:
        LOGGER.debug("Retrieving WPS status from XML response document...")
        xml = response
    elif location:
        xml_resp = HTTPNotFound()  # non-OK default in case the request itself raises before assignment
        try:
            LOGGER.debug("Attempt to retrieve WPS status-location from URL...")
            xml_resp = request_extra("get", location, verify=verify, settings=settings)
            xml = xml_resp.content
        except Exception as ex:
            LOGGER.debug("Got exception during get status: [%r]", ex)
            xml = _retry_file()
        if xml_resp.status_code == HTTPNotFound.code:
            LOGGER.debug("Got not-found during get status: [%r]", xml)
            xml = _retry_file()
    else:
        raise Exception("Missing status-location URL/file reference or response with XML object.")
    if isinstance(xml, str):
        # OWSLib expects bytes when parsing; drop characters that cannot be encoded
        xml = xml.encode("utf8", errors="ignore")
    execution.checkStatus(response=xml, sleepSecs=sleep_secs)
    if execution.response is None:
        raise Exception("Missing response, cannot check status.")
    if not isinstance(execution.response, lxml.etree._Element):  # noqa
        execution.response = lxml.etree.fromstring(execution.response)
    return execution
def results(self, job_reference, out_dir=None, download=False, url=None):
    # type: (str, Optional[str], bool, Optional[str]) -> OperationResult
    """
    Obtain the results of a successful :term:`Job` execution.

    :param job_reference: Either the full :term:`Job` status URL or only its UUID.
    :param out_dir: Output directory where to store downloaded files if requested (default: CURDIR/JobID/<outputs>).
    :param download: Download any file reference found within results (CAUTION: could transfer lots of data!).
    :param url: Instance URL if not already provided during client creation.
    :returns: Result details and local paths if downloaded.
    """
    job_id, job_url = self._parse_job_ref(job_reference, url)
    status = self.status(job_url)
    if not status.success:
        return OperationResult(False, "Cannot process results from incomplete or failed job.", status.body)
    # use results endpoint instead of outputs to be OGC-API compliant, should be able to target non-Weaver instance
    # with this endpoint, outputs IDs are directly at the root of the body
    result_url = f"{job_url}/results"
    resp = request_extra("GET", result_url, headers=self._headers, settings=self._settings)
    res_out = self._parse_result(resp)
    outputs = res_out.body
    if not res_out.success or not isinstance(res_out.body, dict):
        return OperationResult(False, "Could not retrieve any output results from job.", outputs)
    if not download:
        return OperationResult(True, "Listing job results.", outputs)
    # download file results
    if not any("href" in value for value in outputs.values()):
        return OperationResult(False, "Outputs were found but none are downloadable (only raw values?).", outputs)
    if not out_dir:
        out_dir = os.path.join(os.path.realpath(os.path.curdir), job_id)
    os.makedirs(out_dir, exist_ok=True)
    LOGGER.info("Will store job [%s] output results in [%s]", job_id, out_dir)
    for output, value in outputs.items():
        # normalize a single-output value into a list so both cases share the same download loop,
        # while remembering the original structure to write 'path' back in the right place
        is_list = True
        if not isinstance(value, list):
            value = [value]
            is_list = False
        for i, item in enumerate(value):
            if "href" in item:
                file_path = fetch_file(item["href"], out_dir, link=False)
                if is_list:
                    outputs[output][i]["path"] = file_path
                else:
                    outputs[output]["path"] = file_path
    return OperationResult(True, "Retrieved job results.", outputs)
def monitor(self, job_reference, timeout=None, interval=None, wait_for_status=STATUS_SUCCEEDED, url=None):
    # type: (str, Optional[int], Optional[int], str, Optional[str]) -> OperationResult
    """
    Monitor the execution of a :term:`Job` until completion.

    .. seealso::
        :ref:`proc_op_monitor`

    :param job_reference: Either the full :term:`Job` status URL or only its UUID.
    :param timeout: timeout (seconds) of maximum wait time for monitoring if completion is not reached.
    :param interval: wait interval (seconds) between polling monitor requests.
    :param wait_for_status: monitor until the requested status is reached (default: job failed or succeeded).
    :param url: Instance URL if not already provided during client creation.
    :return: result of the successful or failed job, or timeout of monitoring process.
    """
    job_id, job_url = self._parse_job_ref(job_reference, url)
    remain = timeout = timeout or self.monitor_timeout
    delta = interval or self.monitor_interval
    LOGGER.info("Monitoring job [%s] for %ss at intervals of %ss.", job_id, timeout, delta)
    once = True  # guarantees at least one status check even if the timeout is already exhausted
    body = None
    while remain >= 0 or once:
        resp = request_extra("GET", job_url, headers=self._headers, settings=self._settings)
        if resp.status_code != 200:
            return OperationResult(False, "Could not find job with specified reference.", {"job": job_reference})
        body = resp.json()
        status = body.get("status")
        if status == wait_for_status:
            return OperationResult(True, f"Requested job status reached [{wait_for_status}].", body)
        if status in JOB_STATUS_CATEGORIES[JOB_STATUS_CATEGORY_FINISHED]:
            return OperationResult(False, "Requested job status not reached, but job has finished.", body)
        time.sleep(delta)  # wait between polls; remaining budget decreases by the same amount
        remain -= delta
        once = False
    return OperationResult(False, f"Monitoring timeout reached ({timeout}s). Job did not complete in time.", body)
def get_template_url(self):
    """
    Retrieve the OpenSearch URL template marked with the ``results`` relation from the OSDD document.
    """
    osdd_resp = request_extra("get", self.osdd_url, params=self.params, settings=self.settings)
    osdd_resp.raise_for_status()
    osdd_doc = lxml.etree.fromstring(osdd_resp.content)
    # locate the 'Url' element flagged for result queries, regardless of namespace
    result_urls = osdd_doc.xpath("//*[local-name() = 'Url'][@rel='results']")
    template_node = result_urls[0]  # type: XML
    return template_node.attrib["template"]
def get_user_auth_header(self):
    """
    Obtain the ``Authorization`` header to employ toward the remote ADES using WSO2 token retrieval.

    Requires all of the ``ades.*`` credential settings to request an access token; otherwise a
    warning is emitted and a ``None``-valued ``Authorization`` header is returned.

    :raises HTTPUnauthorized: if the token endpoint does not answer with a JSON content-type.
    """
    # TODO: find a better way to generalize this to Magpie credentials?
    if not asbool(self.settings.get("ades.use_auth_token", True)):
        return {}

    ades_usr = self.settings.get("ades.username", None)
    ades_pwd = self.settings.get("ades.password", None)
    ades_url = self.settings.get("ades.wso2_hostname", None)
    ades_client = self.settings.get("ades.wso2_client_id", None)
    ades_secret = self.settings.get("ades.wso2_client_secret", None)
    access_token = None
    if ades_usr and ades_pwd and ades_url and ades_client and ades_secret:
        ades_body = {
            "grant_type": "password",
            "client_id": ades_client,
            "client_secret": ades_secret,
            "username": ades_usr,
            "password": ades_pwd,
            "scope": "openid",
        }
        ades_headers = {"Content-Type": CONTENT_TYPE_APP_FORM, "Accept": CONTENT_TYPE_APP_JSON}
        ades_access_token_url = "{}/oauth2/token".format(ades_url)
        cred_resp = request_extra("post", ades_access_token_url,
                                  data=ades_body, headers=ades_headers, settings=self.settings)
        cred_resp.raise_for_status()
        # FIX: 'headers.get("Content-Type")' can return None, which would raise TypeError on the
        # membership test instead of the intended HTTPUnauthorized - coalesce to empty string
        if CONTENT_TYPE_APP_JSON not in (cred_resp.headers.get("Content-Type") or ""):
            raise HTTPUnauthorized("Cannot retrieve valid access token using credential or ADES configurations.")
        access_token = cred_resp.json().get("access_token", None)
        if not access_token:
            warnings.warn("Could not retrieve valid access token although response is expected to contain one.",
                          MissingParameterWarning)
    else:
        warnings.warn(
            "Could not retrieve at least one of required login parameters: "
            "[ades.username, ades.password, ades.wso2_hostname, ades.wso2_client_id, ades.wso2_client_secret]",
            MissingParameterWarning)
    return {"Authorization": "Bearer {}".format(access_token) if access_token else None}
def make_request(self, method, url, retry=False, **kwargs):
    # type: (str, str, Union[bool, int], Any) -> AnyResponseType
    """
    Sends the request with additional parameter handling for the current process definition.
    """
    # boolean 'retry' maps onto a single retry; an integer gives the explicit retry count
    retry_count = 0 if retry is None else int(retry)
    return request_extra(method, url=url, settings=self.settings, retries=retry_count,
                         headers=self.headers, cookies=self.cookies, **kwargs)
def _fetch_datatsets_from_alternates_links(self, alternate_links):
    """
    Load dataset link attributes from the Atom alternate representation, if one is available.
    """
    # Try loading from atom alternate link
    for alt_link in alternate_links:
        if alt_link["type"] != "application/atom+xml":
            continue
        resp = request_extra("get", alt_link["href"], settings=self.settings)
        resp.raise_for_status()
        atom_doc = lxml.etree.fromstring(resp.content)
        # collect each entry's link element, regardless of namespace
        entry_links = atom_doc.xpath("//*[local-name() = 'entry']/*[local-name() = 'link']")  # type: List[XML]
        return [entry.attrib for entry in entry_links]
    return []
def dismiss(self, job_reference, url=None):
    """
    Dismiss pending or running :term:`Job`, or clear result artifacts from a completed :term:`Job`.

    :param job_reference: Either the full :term:`Job` status URL or only its UUID.
    :param url: Instance URL if not already provided during client creation.
    :returns: Obtained result from the operation.
    """
    job_id, job_url = self._parse_job_ref(job_reference, url)
    LOGGER.debug("Dismissing job: [%s]", job_id)
    response = request_extra("DELETE", job_url, headers=self._headers, settings=self._settings)
    return self._parse_result(response)
def undeploy(self, process_id, url=None):
    # type: (str, Optional[str]) -> OperationResult
    """
    Undeploy an existing :term:`Process`.

    :param process_id: Identifier of the process to undeploy.
    :param url: Instance URL if not already provided during client creation.
    """
    instance_url = self._get_url(url)
    proc_path = f"{instance_url}/processes/{process_id}"
    response = request_extra("DELETE", proc_path, headers=self._headers, settings=self._settings)
    return self._parse_result(response)
def test_request_extra_allowed_codes():
    """
    Verifies that ``allowed_codes`` only are considered as valid status instead of any non-error HTTP code.
    """
    mocked_codes = {"codes": [HTTPCreated.code, HTTPOk.code, HTTPCreated.code]}  # note: used in reverse order

    def mocked_request(*_, **__):
        fake_resp = Response()
        fake_resp.status_code = mocked_codes["codes"].pop()
        return fake_resp

    with mock.patch("requests.Session.request", side_effect=mocked_request) as mocked:
        resp = request_extra("get", "http://whatever", retries=3, allowed_codes=[HTTPOk.code])
        # first response (201) is not in the allowed codes, second (200) is -> stops after two calls
        assert resp.status_code == HTTPOk.code
        assert mocked.call_count == 2
def status(self, job_reference, url=None):
    """
    Obtain the status of a :term:`Job`.

    .. seealso::
        :ref:`proc_op_status`

    :param job_reference: Either the full :term:`Job` status URL or only its UUID.
    :param url: Instance URL if not already provided during client creation.
    :returns: retrieved status of the job.
    """
    job_id, job_url = self._parse_job_ref(job_reference, url)
    LOGGER.info("Getting job status: [%s]", job_id)
    response = request_extra("GET", job_url, headers=self._headers, settings=self._settings)
    return self._parse_result(response)
def test_frontpage_format(self):
    """
    Validate the frontpage response schema and that every advertised link is reachable with the expected media-type.
    """
    resp = self.testapp.get(sd.api_frontpage_service.path, headers=self.json_headers)
    assert resp.status_code == 200
    body = resp.json
    try:
        sd.FrontpageSchema().deserialize(body)
    except colander.Invalid as ex:
        body = json.dumps(body, indent=2, ensure_ascii=False)
        self.fail("expected valid response format as defined in schema [{!s}] in\n{}".format(ex, body))
    refs = [link["rel"] for link in body["links"]]
    assert len(body["links"]) == len(set(refs)), "Link relationships must all be unique"
    for link in body["links"]:
        path = link["href"]
        rtype = link["type"]
        # any XML variant is acceptable when the advertised type is one of the XML-based media-types
        if rtype in CONTENT_TYPE_ANY_XML:
            rtype = CONTENT_TYPE_ANY_XML
        else:
            rtype = [rtype]
        rel = link["rel"]
        if "localhost" in path:
            resp = self.testapp.get(urlparse(path).path, expect_errors=True)  # allow error for wps without queries
        else:
            # external reference: use the retrying request helper to tolerate transient failures
            resp = request_extra("GET", path, retries=3, retry_after=True, ssl_verify=False, allow_redirects=True)
        code = resp.status_code
        test = "({}) [{}]".format(rel, path)
        assert code in [200, 400], "Reference link expected to be found, got [{}] for {}".format(code, test)
        # strip any media-type parameters (e.g.: charset) before comparison
        ctype = resp.headers.get("Content-Type", "").split(";")[0].strip()
        assert ctype in rtype, "Reference link content does not match [{}]!=[{}] for {}".format(ctype, rtype, test)
def request_callback(request):
    # type: (AnyRequestType) -> Tuple[int, Dict[str, str], str]
    """
    Operation called when the file-server URL is matched against incoming requests that have been mocked.
    """
    if (mock_head and request.method == "HEAD") or (mock_get and request.method == "GET"):
        # remap the mocked HTTP URL onto the local directory to serve file contents directly
        file_url = "file://{}".format(request.url.replace(url, directory, 1))
        resp = request_extra(request.method, file_url, settings=settings)
        if resp.status_code == 200:
            headers = resp.headers
            content = resp.content
            file_path = file_url.replace("file://", "")
            mime_type, encoding = mimetypes.guess_type(file_path)
            headers.update({
                "Server": "mocked_wps_output",
                "Date": str(datetime.datetime.utcnow()),
                "Content-Type": mime_type or CONTENT_TYPE_TEXT_PLAIN,
                "Content-Encoding": encoding or "",
                "Last-Modified": str(datetime.datetime.fromtimestamp(os.stat(file_path).st_mtime))
            })
            if request.method == "HEAD":
                # HEAD responses must not include a body nor advertise its length
                headers.pop("Content-Length", None)
                content = ""
            if request.method == "GET":
                headers.update({
                    "Content-Length": str(headers.get("Content-Length", len(resp.content))),
                })
            headers.update(headers_override or {})
            return resp.status_code, headers, content
    else:
        # method is not one of the mocked ones -> method not allowed
        return 405, {}, ""
    # method was mocked but the local file could not be served -> not found
    return 404, {}, ""
def make_request(self, method, url, retry, status_code_mock=None, **kwargs):
    """
    Send the request for the current process definition, with a single delayed retry on bad-gateway errors.
    """
    resp = request_extra(method, url=url, settings=self.settings, headers=self.headers,
                         cookies=self.cookies, verify=self.verify, **kwargs)
    # TODO: Remove patch for Geomatys unreliable server
    if retry and resp.status_code == HTTPBadGateway.code:
        sleep(10)
        resp = self.make_request(method, url, False, **kwargs)
    if status_code_mock and resp.status_code == HTTPBadGateway.code:
        resp.status_code = status_code_mock
    return resp
def describe(self, process_id, url=None):
    # type: (str, Optional[str]) -> OperationResult
    """
    Describe the specified :term:`Process`.

    .. seealso::
        :ref:`proc_op_describe`

    :param process_id: Identifier of the process to describe.
    :param url: Instance URL if not already provided during client creation.
    """
    instance_url = self._get_url(url)
    proc_path = f"{instance_url}/processes/{process_id}"
    response = request_extra("GET", proc_path, headers=self._headers, settings=self._settings)
    # API response from this request can contain 'description' matching the process description
    # rather than a generic response 'description'. Enforce the provided message to avoid confusion.
    return self._parse_result(response, message="Process description successfully retrieved.")
def check_wps_status(
    location=None,  # type: Optional[str]
    response=None,  # type: Optional[xml_util.XML]
    sleep_secs=2,   # type: int
    verify=True,    # type: bool
    settings=None,  # type: Optional[AnySettingsContainer]
):  # type: (...) -> WPSExecution
    """
    Run :func:`owslib.wps.WPSExecution.checkStatus` with additional exception handling.

    :param location: job URL or file path where to look for job status.
    :param response: WPS response document of job status.
    :param sleep_secs: number of seconds to sleep before returning control to the caller.
    :param verify: flag to enable SSL verification.
    :param settings: application settings to retrieve any additional request parameters as applicable.
    :returns: OWSLib.wps.WPSExecution object.
    """
    def _retry_file():
        # type: () -> str
        # fallback resolution of the XML status using the local file mapped from the status location
        LOGGER.warning("Failed retrieving WPS status-location, attempting with local file.")
        out_path = get_wps_local_status_location(location, settings)
        if not out_path:
            raise HTTPNotFound(f"Could not find file resource from [{location}].")
        LOGGER.info("Resolved WPS status-location using local file reference.")
        with open(out_path, mode="r", encoding="utf-8") as f:
            return f.read()

    execution = WPSExecution()
    if response:
        LOGGER.debug("Retrieving WPS status from XML response document...")
        xml_data = response
    elif location:
        xml_resp = HTTPNotFound()  # non-OK default in case the request itself raises before assignment
        xml_data = None
        try:
            LOGGER.debug("Attempt to retrieve WPS status-location from URL [%s]...", location)
            xml_resp = request_extra("get", location, verify=verify, settings=settings)
            xml_data = xml_resp.content
        except Exception as ex:
            LOGGER.debug("Got exception during get status: [%r]. Will retry with local reference.", ex)
        if xml_resp.status_code != HTTPOk.code:
            LOGGER.debug("WPS XML status not found: [%r]. Retrying with local reference.", xml_data)
            xml_data = _retry_file()
    else:
        raise Exception("Missing status-location URL/file reference or response with XML object.")
    if isinstance(xml_data, str):
        # OWSLib expects bytes when parsing; drop characters that cannot be encoded
        xml_data = xml_data.encode("utf8", errors="ignore")
    execution.checkStatus(response=xml_data, sleepSecs=sleep_secs)
    if execution.response is None:
        raise Exception("Missing response, cannot check status.")
    if not isinstance(execution.response, xml_util.XML):
        execution.response = xml_util.fromstring(execution.response)
    return execution
def test_frontpage_format(self):
    """
    Validate the frontpage response schema and that every advertised link is reachable with the expected media-type.
    """
    resp = self.testapp.get(sd.api_frontpage_service.path, headers=self.json_headers)
    assert resp.status_code == 200
    body = resp.json
    try:
        sd.FrontpageSchema().deserialize(body)
    except colander.Invalid as ex:
        body = json.dumps(body, indent=2, ensure_ascii=False)
        self.fail(f"expected valid response format as defined in schema [{ex!s}] in\n{body}")
    refs = [link["rel"] for link in body["links"]]
    assert len(body["links"]) == len(set(refs)), "Link relationships must all be unique"
    for link in body["links"]:
        path = link["href"]
        rtype = link["type"]
        # any XML variant is acceptable when the advertised type is one of the XML-based media-types
        if rtype in ContentType.ANY_XML:
            rtype = ContentType.ANY_XML
        else:
            rtype = [rtype]
        rel = link["rel"]
        # request endpoint to validate it is accessible
        if "localhost" in path:
            resp = self.testapp.get(urlparse(path).path, expect_errors=True)  # allow error for wps without queries
        else:
            resp = request_extra("GET", path, retries=3, retry_after=True, ssl_verify=False, allow_redirects=True)
        user_agent = get_header("user-agent", resp.request.headers)
        if resp.status_code == 403 and "python" in user_agent:
            # some sites will explicitly block bots, retry with mocked user-agent simulating human user access
            resp = request_extra("GET", path, headers={"User-Agent": "Mozilla"},
                                 retries=3, retry_after=True, ssl_verify=False, allow_redirects=True)
        # validate contents and expected media-type
        code = resp.status_code
        test = f"({rel}) [{path}]"
        assert code in [200, 400], f"Reference link expected to be found, got [{code}] for {test}"
        # FIXME: patch broken content-type from reference websites
        #   (see https://github.com/opengeospatial/NamingAuthority/issues/183)
        ctype_header_links = {
            "http://schemas.opengis.net/wps/": ContentType.APP_XML
        }
        ctype = resp.headers.get("Content-Type", "").split(";")[0].strip()
        if not ctype:
            for ref_link in ctype_header_links:
                if path.startswith(ref_link):
                    ctype = ctype_header_links[ref_link]
                    break
        assert ctype in rtype, f"Reference link content does not match [{ctype}]!=[{rtype}] for {test}"
def execute(self, workflow_inputs, out_dir, expected_outputs):
    """
    Deploy (if needed), execute and monitor the process on the remote ADES, then fetch its outputs locally.

    :param workflow_inputs: CWL job dict of inputs to submit to the remote execution.
    :param out_dir: local directory where CWL expects the fetched output files to be written.
    :param expected_outputs: mapping of output ID to the expected file name under ``out_dir``.
    :raises Exception: if the execute request is not accepted (201) or the remote job does not succeed.
    """
    # TODO: test
    visible = self.is_visible()
    if not visible:  # includes private visibility and non-existing cases
        if visible is None:
            LOGGER.info("Process [%s] access is unauthorized on [%s] - deploying as admin.",
                        self.process, self.url)
        elif visible is False:
            LOGGER.info("Process [%s] is not deployed on [%s] - deploying.", self.process, self.url)
        # TODO: Maybe always redeploy? What about cases of outdated deployed process?
        try:
            self.deploy()
        except Exception as exc:
            # FIXME: support for Spacebel, avoid conflict error incorrectly handled, remove 500 when fixed
            pass_http_error(exc, [HTTPConflict, HTTPInternalServerError])
    LOGGER.info("Process [%s] enforced to public visibility.", self.process)
    try:
        self.set_visibility(visibility=VISIBILITY_PUBLIC)
    # TODO: support for Spacebel, remove when visibility route properly implemented on ADES
    except Exception as exc:
        pass_http_error(exc, HTTPNotFound)
    self.update_status("Preparing execute request for remote ADES.",
                       REMOTE_JOB_PROGRESS_REQ_PREP, status.STATUS_RUNNING)
    LOGGER.debug("Execute process WPS request for [%s]", self.process)

    # build the execute request body from the CWL workflow inputs
    execute_body_inputs = []
    execute_req_id = "id"
    execute_req_input_val_href = "href"
    execute_req_input_val_data = "data"
    for workflow_input_key, workflow_input_value in workflow_inputs.items():
        if isinstance(workflow_input_value, list):
            for workflow_input_value_item in workflow_input_value:
                if isinstance(workflow_input_value_item, dict) and "location" in workflow_input_value_item:
                    execute_body_inputs.append({execute_req_id: workflow_input_key,
                                                execute_req_input_val_href:
                                                    workflow_input_value_item["location"]})
                else:
                    execute_body_inputs.append({execute_req_id: workflow_input_key,
                                                execute_req_input_val_data: workflow_input_value_item})
        else:
            if isinstance(workflow_input_value, dict) and "location" in workflow_input_value:
                execute_body_inputs.append({execute_req_id: workflow_input_key,
                                            execute_req_input_val_href: workflow_input_value["location"]})
            else:
                execute_body_inputs.append({execute_req_id: workflow_input_key,
                                            execute_req_input_val_data: workflow_input_value})

    # remap local/opensearch file references into URLs the remote ADES can access
    for exec_input in execute_body_inputs:
        if execute_req_input_val_href in exec_input and isinstance(exec_input[execute_req_input_val_href], str):
            if exec_input[execute_req_input_val_href].startswith(
                    "{0}://".format(OPENSEARCH_LOCAL_FILE_SCHEME)):
                exec_input[execute_req_input_val_href] = "file{0}".format(
                    exec_input[execute_req_input_val_href][len(OPENSEARCH_LOCAL_FILE_SCHEME):])
            elif exec_input[execute_req_input_val_href].startswith("file://"):
                exec_input[execute_req_input_val_href] = self.host_file(
                    exec_input[execute_req_input_val_href])
                LOGGER.debug("Hosting intermediate input [%s] : [%s]",
                             exec_input[execute_req_id], exec_input[execute_req_input_val_href])

    execute_body_outputs = [{execute_req_id: output,
                             "transmissionMode": EXECUTE_TRANSMISSION_MODE_REFERENCE}
                            for output in expected_outputs]
    self.update_status("Executing job on remote ADES.", REMOTE_JOB_PROGRESS_EXECUTION, status.STATUS_RUNNING)
    execute_body = dict(mode=EXECUTE_MODE_ASYNC,
                        response=EXECUTE_RESPONSE_DOCUMENT,
                        inputs=execute_body_inputs,
                        outputs=execute_body_outputs)
    request_url = self.url + process_jobs_uri.format(process_id=self.process)
    response = self.make_request(method="POST", url=request_url, json=execute_body, retry=True)
    if response.status_code != 201:
        raise Exception("Was expecting a 201 status code from the execute request : {0}".format(request_url))

    # poll the remote job status until it reaches a finished state
    job_status_uri = response.headers["Location"]
    job_status = self.get_job_status(job_status_uri)
    job_status_value = status.map_status(job_status["status"])
    self.update_status("Monitoring job on remote ADES : {0}".format(job_status_uri),
                       REMOTE_JOB_PROGRESS_MONITORING, status.STATUS_RUNNING)
    while job_status_value not in status.JOB_STATUS_CATEGORIES[status.STATUS_CATEGORY_FINISHED]:
        sleep(5)
        job_status = self.get_job_status(job_status_uri)
        job_status_value = status.map_status(job_status["status"])
        LOGGER.debug(get_log_monitor_msg(job_status["jobID"], job_status_value,
                                         job_status.get("percentCompleted", 0),
                                         get_any_message(job_status), job_status.get("statusLocation")))
        self.update_status(get_job_log_msg(status=job_status_value,
                                           message=get_any_message(job_status),
                                           progress=job_status.get("percentCompleted", 0),
                                           duration=job_status.get("duration", None)),  # get if available
                           map_progress(job_status.get("percentCompleted", 0),
                                        REMOTE_JOB_PROGRESS_MONITORING, REMOTE_JOB_PROGRESS_FETCH_OUT),
                           status.STATUS_RUNNING)
    if job_status_value != status.STATUS_SUCCEEDED:
        LOGGER.debug(get_log_monitor_msg(job_status["jobID"], job_status_value,
                                         job_status.get("percentCompleted", 0),
                                         get_any_message(job_status), job_status.get("statusLocation")))
        raise Exception(job_status)

    # download each expected output result next to the CWL output destination
    self.update_status("Fetching job outputs from remote ADES.",
                       REMOTE_JOB_PROGRESS_FETCH_OUT, status.STATUS_RUNNING)
    results = self.get_job_results(job_status["jobID"])
    for result in results:
        if get_any_id(result) in expected_outputs:
            # This is where cwl expect the output file to be written
            # TODO We will probably need to handle multiple output value...
            dst_fn = "/".join([out_dir.rstrip("/"), expected_outputs[get_any_id(result)]])
            # TODO Should we handle other type than File reference?
            resp = request_extra("get", get_any_value(result), allow_redirects=True, settings=self.settings)
            LOGGER.debug("Fetching result output from [%s] to cwl output destination: [%s]",
                         get_any_value(result), dst_fn)
            with open(dst_fn, mode="wb") as dst_fh:
                dst_fh.write(resp.content)
    self.update_status("Execution on remote ADES completed.",
                       REMOTE_JOB_PROGRESS_COMPLETED, status.STATUS_SUCCEEDED)
def estimate_workflow_quote(quote, process):
    # type: (Quote, Process) -> Quote
    """
    Loop :term:`Workflow` sub-:term:`Process` steps to get their respective :term:`Quote`.

    Each step's quote is requested asynchronously (``Prefer: respond-async``) from its resolved
    process location, then polled until completion. Partial failures abort the whole estimation.

    :param quote: Quote to be updated with the aggregated step estimations.
    :param process: Workflow process whose step quotes must be collected.
    :returns: The updated quote.
    :raises QuoteEstimationError: If any step quote could not be obtained.
    """
    settings = get_settings()
    process_url = process.href(settings)
    quote_steps = []
    quote_params = []
    workflow_steps = get_package_workflow_steps(process_url)
    for step in workflow_steps:
        # retrieve quote from provider ADES
        # TODO: data source mapping
        process_step_url = get_process_location(step["reference"])
        process_quote_url = f"{process_step_url}/quotations"
        # FIXME: how to estimate data transfer if remote process (?)
        # FIXME: how to produce intermediate process inputs (?) - remove xfail in functional test once resolved
        # FIXME: must consider fan-out in case of parallel steps
        data = {"inputs": [], "outputs": []}
        resp = request_extra("POST", process_quote_url, json=data, headers={"Prefer": "respond-async"})
        href = resp.headers.get("Location")
        status = QuoteStatus.SUBMITTED
        retry = 0
        abort = 3  # maximum consecutive failed status requests before giving up on this step
        while status != QuoteStatus.COMPLETED and abort > 0:
            wait = wait_secs(retry)
            retry += 1
            resp = request_extra("GET", href)
            if resp.status_code != 200:
                abort -= 1
                wait = 5
            else:
                body = resp.json()
                status = QuoteStatus.get(body.get("status"))
                if status == QuoteStatus.COMPLETED:
                    quote_steps.append(href)
                    quote_params.append(body)
                    break
                if status == QuoteStatus.FAILED or status is None:
                    LOGGER.error("Quote estimation for sub-process [%s] under [%s] failed.",
                                 step["name"], process.id)
                    break
            # BUGFIX: sleep between retries while attempts remain
            # (previous condition 'abort <= 0' slept only when aborting, causing a busy polling loop)
            if abort > 0:
                time.sleep(wait)
    if len(workflow_steps) != len(quote_params):
        raise QuoteEstimationError("Could not obtain intermediate quote estimations for all Workflow steps.")
    # FIXME: what if different currencies are defined (?)
    currency = "CAD"
    params = {
        "price": 0,
        "currency": currency,
        "seconds": 0,
        "steps": quote_steps,
    }
    # aggregate the price and estimated duration reported by each step quote
    for step_params in quote_params:
        params["price"] += step_params["price"]
        params["seconds"] += step_params["estimatedSeconds"]
    quote.update(**params)
    return quote
def execute(self, process_id, inputs=None, monitor=False, timeout=None, interval=None, url=None):
    # type: (str, Optional[Union[str, JSON]], bool, Optional[int], Optional[int], Optional[str]) -> OperationResult
    """
    Execute a :term:`Job` for the specified :term:`Process` with provided inputs.

    When submitting inputs with :term:`OGC API - Processes` schema, top-level ``inputs`` key is expected.
    Under it, either the mapping (key-value) or listing (id,value) representation are accepted.
    If ``inputs`` is not found, the alternative :term:`CWL` will be assumed.

    When submitting inputs with :term:`CWL` *job* schema, plain key-value(s) pairs are expected.
    All values should be provided directly under the key (including arrays), except for ``File`` type
    that must include the ``class`` and ``path`` details.

    .. seealso::
        :ref:`proc_op_execute`

    :param process_id: Identifier of the process to execute.
    :param inputs:
        Literal :term:`JSON` or :term:`YAML` contents of the inputs submitted and inserted into the execution
        body, using either the :term:`OGC API - Processes` or :term:`CWL` format, or a file path/URL referring
        to them.
    :param monitor:
        Automatically perform :term:`Job` execution monitoring until completion or timeout to obtain final
        results. If requested, this operation will become blocking until either the completed status or
        timeout is reached.
    :param timeout: Monitoring timeout (seconds) if requested.
    :param interval: Monitoring interval (seconds) between job status polling requests.
    :param url: Instance URL if not already provided during client creation.
    :returns: results of the operation.
    """
    # inputs may arrive as a list of lists when collected from repeated CLI arguments; flatten first
    if isinstance(inputs, list) and all(isinstance(item, list) for item in inputs):
        inputs = [items for sub in inputs for items in sub]  # flatten 2D->1D list
    values = self._parse_inputs(inputs)
    # '_parse_inputs' returns an 'OperationResult' directly when parsing failed; propagate it as-is
    if isinstance(values, OperationResult):
        return values
    data = {
        # NOTE: since sync is not yet properly implemented in Weaver, simulate with monitoring after if requested
        # FIXME: support 'sync' (https://github.com/crim-ca/weaver/issues/247)
        "mode": EXECUTE_MODE_ASYNC,
        "inputs": values,
        # FIXME: support 'response: raw' (https://github.com/crim-ca/weaver/issues/376)
        "response": EXECUTE_RESPONSE_DOCUMENT,
        # FIXME: allow omitting 'outputs' (https://github.com/crim-ca/weaver/issues/375)
        # FIXME: allow 'transmissionMode: value/reference' selection (https://github.com/crim-ca/weaver/issues/377)
        "outputs": {}
    }
    # FIXME: since (https://github.com/crim-ca/weaver/issues/375) not implemented, auto-populate all the outputs
    base = self._get_url(url)
    # fetch the process description to discover its output identifiers for auto-population above
    result = self.describe(process_id, url=base)
    if not result.success:
        return OperationResult(False, "Could not obtain process description for execution.",
                               body=result.body, headers=result.headers, code=result.code, text=result.text)
    outputs = result.body.get("outputs")
    for output_id in outputs:
        # use 'value' to have all outputs reported in body as 'value/href' rather than 'Link' headers
        data["outputs"][output_id] = {"transmissionMode": EXECUTE_TRANSMISSION_MODE_VALUE}
    LOGGER.info("Executing [%s] with inputs:\n%s", process_id, _json2text(inputs))
    path = f"{base}/processes/{process_id}/execution"  # use OGC-API compliant endpoint (not '/jobs')
    resp = request_extra("POST", path, json=data, headers=self._headers, settings=self._settings)
    result = self._parse_result(resp)
    if not monitor or not result.success:
        return result
    # although Weaver returns "jobID" in the body for convenience,
    # employ the "Location" header to be OGC-API compliant
    job_url = resp.headers.get("Location", "")
    time.sleep(1)  # small delay to ensure process execution had a chance to start before monitoring
    return self.monitor(job_url, timeout=timeout, interval=interval)
def deploy(
    self,
    process_id=None,  # type: Optional[str]
    body=None,  # type: Optional[Union[JSON, str]]
    cwl=None,  # type: Optional[Union[CWL, str]]
    wps=None,  # type: Optional[str]
    token=None,  # type: Optional[str]
    username=None,  # type: Optional[str]
    password=None,  # type: Optional[str]
    undeploy=False,  # type: bool
    url=None,  # type: Optional[str]
):
    # type: (...) -> OperationResult
    """
    Deploy a new :term:`Process` with specified metadata and reference to an :term:`Application Package`.

    The referenced :term:`Application Package` must be one of:
    - :term:`CWL` body, local file or URL in :term:`JSON` or :term:`YAML` format
    - :term:`WPS` process URL with :term:`XML` response
    - :term:`WPS-REST` process URL with :term:`JSON` response
    - :term:`OGC API - Processes` process URL with :term:`JSON` response

    If the reference is resolved to be a :term:`Workflow`, all its underlying :term:`Process` steps must be
    available under the same URL that this client was initialized with.

    .. seealso::
        :ref:`proc_op_deploy`

    :param process_id: Desired process identifier. Can be omitted if already provided in body contents or file.
    :param body:
        Literal :term:`JSON` contents, either using string representation of actual Python objects forming the
        request body, or file path/URL to :term:`YAML` or :term:`JSON` contents of the request body.
        Other parameters (:paramref:`process_id`, :paramref:`cwl`) can override corresponding fields within the
        provided body.
    :param cwl:
        Literal :term:`JSON` or :term:`YAML` contents, either using string representation of actual Python
        objects, or file path/URL with contents of the :term:`CWL` definition of the
        :term:`Application package` to be inserted into the body.
    :param wps: URL to an existing :term:`WPS` process (WPS-1/2 or WPS-REST/OGC-API).
    :param token: Authentication token for accessing private Docker registry if :term:`CWL` refers to such image.
    :param username: Username to form the authentication token to a private Docker registry.
    :param password: Password to form the authentication token to a private Docker registry.
    :param undeploy:
        Perform undeploy step as applicable prior to deployment to avoid conflict with exiting :term:`Process`.
    :param url: Instance URL if not already provided during client creation.
    :returns: results of the operation.
    """
    # validate/resolve the deployment body before anything else; bail out on parsing failure
    parsed = self._parse_deploy_body(body, process_id)
    if not parsed.success:
        return parsed
    # build the request headers, augmenting the client defaults with any Docker registry credentials
    req_headers = copy.deepcopy(self._headers)
    req_headers.update(self._parse_auth_token(token, username, password))
    payload = parsed.body
    # resolve and insert the application package (CWL/WPS reference) into the deployment payload
    parsed = self._parse_deploy_package(payload, cwl, wps, process_id, req_headers)
    if not parsed.success:
        return parsed
    proc_id = parsed.message  # resolved process identifier is reported through the result message
    payload = parsed.body
    base_url = self._get_url(url)
    if undeploy:
        LOGGER.debug("Performing requested undeploy of process: [%s]", proc_id)
        undeploy_result = self.undeploy(process_id=proc_id, url=base_url)
        # 404 is acceptable (process did not exist); any other non-success code aborts the deployment
        if undeploy_result.code not in [200, 404]:
            return OperationResult(False, "Failed requested undeployment prior deployment.",
                                   body=undeploy_result.body, text=undeploy_result.text,
                                   code=undeploy_result.code, headers=undeploy_result.headers)
    response = request_extra("POST", f"{base_url}/processes", json=payload,
                             headers=req_headers, settings=self._settings)
    return self._parse_result(response)
def execute(self, workflow_inputs, out_dir, expected_outputs):
    """
    Execute the remote WPS-1 process with the given workflow inputs and stage its outputs locally.

    Submits an asynchronous execute request to the remote WPS-1 provider, monitors the job until
    completion via its status location, then downloads each expected output into ``out_dir`` at the
    destination path mapped by ``expected_outputs`` (where CWL expects the files to be written).

    :param workflow_inputs: mapping of input identifiers to values (literal, dict with 'location', or lists).
    :param out_dir: local directory where fetched outputs are written.
    :param expected_outputs: mapping of output identifiers to their destination file names under ``out_dir``.
    :raises Exception: on any failure (capabilities/describe retrieval, execution, monitoring, or fetching),
        re-raised with the originating exception class name embedded in the message.
    """
    self.update_status("Preparing execute request for remote WPS1 provider.",
                       REMOTE_JOB_PROGRESS_REQ_PREP, status.STATUS_RUNNING)
    LOGGER.debug("Execute process WPS request for %s", self.process)
    try:
        try:
            wps = WebProcessingService(url=self.provider, headers=self.cookies, verify=self.verify)
            raise_on_xml_exception(wps._capabilities)  # noqa: W0212
        except Exception as ex:
            raise OWSNoApplicableCode("Failed to retrieve WPS capabilities. Error: [{}].".format(str(ex)))
        try:
            process = wps.describeprocess(self.process)
        except Exception as ex:
            raise OWSNoApplicableCode("Failed to retrieve WPS process description. Error: [{}].".format(str(ex)))

        # prepare inputs
        # collect identifiers of inputs declared as complex data to wrap their values later
        complex_inputs = []
        for process_input in process.dataInputs:
            if WPS_COMPLEX_DATA in process_input.dataType:
                complex_inputs.append(process_input.identifier)

        # remove any 'null' input, should employ the 'default' of the remote WPS process
        inputs_provided_keys = filter(lambda i: workflow_inputs[i] != "null", workflow_inputs)

        wps_inputs = []
        for input_key in inputs_provided_keys:
            input_val = workflow_inputs[input_key]
            # in case of array inputs, must repeat (id,value)
            # in case of complex input (File), obtain location, otherwise get data value
            if not isinstance(input_val, list):
                input_val = [input_val]
            input_values = []
            for val in input_val:
                if isinstance(val, dict):
                    val = val["location"]

                # owslib only accepts strings, not numbers directly
                if isinstance(val, (int, float)):
                    val = str(val)

                if val.startswith("file://"):
                    # we need to host file starting with file:// scheme
                    val = self.host_file(val)

                input_values.append(val)

            # need to use ComplexDataInput structure for complex input
            # TODO: BoundingBox not supported
            for input_value in input_values:
                if input_key in complex_inputs:
                    input_value = ComplexDataInput(input_value)
                wps_inputs.append((input_key, input_value))

        # prepare outputs
        # (identifier, as-reference flag) pairs, restricted to outputs the workflow step expects
        outputs = [(o.identifier, o.dataType == WPS_COMPLEX_DATA) for o in process.processOutputs
                   if o.identifier in expected_outputs]

        self.update_status("Executing job on remote WPS1 provider.",
                           REMOTE_JOB_PROGRESS_EXECUTION, status.STATUS_RUNNING)

        mode = EXECUTE_MODE_ASYNC
        execution = wps.execute(self.process, inputs=wps_inputs, output=outputs, mode=mode, lineage=True)
        # owslib stores submission errors on the execution object rather than raising them
        if not execution.process and execution.errors:
            raise execution.errors[0]

        self.update_status("Monitoring job on remote WPS1 provider : [{0}]".format(self.provider),
                           REMOTE_JOB_PROGRESS_MONITORING, status.STATUS_RUNNING)

        # poll the status document; tolerate up to 'max_retries' consecutive read failures
        max_retries = 5
        num_retries = 0
        run_step = 0
        job_id = "<undefined>"
        # 'run_step == 0' forces at least one status check even if execution already reports complete
        while execution.isNotComplete() or run_step == 0:
            if num_retries >= max_retries:
                raise Exception("Could not read status document after {} retries. Giving up.".format(max_retries))
            try:
                # 'wait_secs(run_step)' applies an increasing delay between successive polls
                execution = check_wps_status(location=execution.statusLocation,
                                             verify=self.verify, sleep_secs=wait_secs(run_step))
                # job identifier is derived from the status document location (last path segment, no '.xml')
                job_id = execution.statusLocation.replace(".xml", "").split("/")[-1]
                LOGGER.debug(get_log_monitor_msg(job_id, status.map_status(execution.getStatus()),
                                                 execution.percentCompleted, execution.statusMessage,
                                                 execution.statusLocation))
                self.update_status(get_job_log_msg(status=status.map_status(execution.getStatus()),
                                                   message=execution.statusMessage,
                                                   progress=execution.percentCompleted,
                                                   duration=None),  # get if available
                                   map_progress(execution.percentCompleted,
                                                REMOTE_JOB_PROGRESS_MONITORING, REMOTE_JOB_PROGRESS_FETCH_OUT),
                                   status.STATUS_RUNNING)
            except Exception as exc:
                num_retries += 1
                LOGGER.debug("Exception raised: %r", exc)
                sleep(1)
            else:
                # successful poll resets the consecutive-failure counter
                num_retries = 0
                run_step += 1

        if not execution.isSucceded():
            exec_msg = execution.statusMessage or "Job failed."
            LOGGER.debug(get_log_monitor_msg(job_id, status.map_status(execution.getStatus()),
                                             execution.percentCompleted, exec_msg, execution.statusLocation))
            raise Exception(execution.statusMessage or "Job failed.")

        self.update_status("Fetching job outputs from remote WPS1 provider.",
                           REMOTE_JOB_PROGRESS_FETCH_OUT, status.STATUS_RUNNING)
        results = [ows2json_output(output, process) for output in execution.processOutputs]
        for result in results:
            result_id = get_any_id(result)
            result_val = get_any_value(result)
            if result_id in expected_outputs:
                # This is where cwl expect the output file to be written
                # TODO We will probably need to handle multiple output value...
                dst_fn = "/".join([out_dir.rstrip("/"), expected_outputs[result_id]])

                # TODO Should we handle other type than File reference?
                resp = request_extra("get", result_val, allow_redirects=True, settings=self.settings)
                LOGGER.debug("Fetching result output from [%s] to cwl output destination: [%s]", result_val, dst_fn)
                with open(dst_fn, mode="wb") as dst_fh:
                    dst_fh.write(resp.content)
    except Exception as exc:
        # wrap any failure with the originating exception class name for clearer upstream reporting
        exception_class = "{}.{}".format(type(exc).__module__, type(exc).__name__)
        errors = "{0}: {1!s}".format(exception_class, exc)
        LOGGER.exception(exc)
        raise Exception(errors)

    self.update_status("Execution on remote WPS1 provider completed.",
                       REMOTE_JOB_PROGRESS_COMPLETED, status.STATUS_SUCCEEDED)