def test_fetch_file_local_with_protocol():
    """
    Test function :func:`weaver.utils.fetch_file` when the reference is a pre-fetched local file.

    The same temporary JSON file is fetched twice, first by its plain path and then with a ``file://`` prefix.
    After each fetch, the file is expected under the result directory with the original JSON contents.
    """
    tmp_dir = tempfile.gettempdir()
    with tempfile.NamedTemporaryFile(dir=tmp_dir, mode="w", suffix=".json") as tmp_json:
        tmp_data = {"message": "fetch-file-protocol"}
        tmp_json.write(json.dumps(tmp_data))
        tmp_json.seek(0)
        tmp_name = os.path.split(tmp_json.name)[-1]
        # result directory is named after this test function
        res_dir = os.path.join(tmp_dir, inspect.currentframe().f_code.co_name)
        res_path = os.path.join(res_dir, tmp_name)
        try:
            make_dirs(res_dir, exist_ok=True)
            for protocol in ["", "file://"]:
                tmp_path = protocol + tmp_json.name
                fetch_file(tmp_path, res_dir)
                assert os.path.isfile(res_path), "File [{}] should be accessible under [{}]".format(tmp_path, res_path)
                # read through a context manager so the handle is closed before the next iteration and cleanup
                with open(res_path, mode="r", encoding="utf-8") as res_file:
                    res_data = json.load(res_file)
                assert res_data == tmp_data, "File should be properly copied/referenced from original"
        finally:
            # always remove the result directory, whether assertions passed or not
            shutil.rmtree(res_dir, ignore_errors=True)
def m2n(metalink_reference, index, output_dir):
    # type: (str, int, str) -> None
    """
    Fetch into ``output_dir`` the file referenced by the ``index``-th entry of a Metalink file.

    The Metalink document is first fetched into a temporary directory and parsed as XML.
    The ``metaurl`` of the ``file`` element at position ``index`` is then fetched to ``output_dir``.

    :param metalink_reference: Reference to the Metalink file to fetch.
    :param index: Position of the ``file`` element to select, inserted as-is into the XPath predicate.
    :param output_dir: Existing directory where the referenced file is fetched.
    :raises ValueError: If ``output_dir`` is not an existing directory.
    """
    LOGGER.info(
        "Got arguments: metalink_reference=%s index=%s output_dir=%s",
        metalink_reference,
        index,
        output_dir,
    )
    LOGGER.info("Process '%s' execution starting...", PACKAGE_NAME)
    LOGGER.debug("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir)
    try:
        if not os.path.isdir(output_dir):
            raise ValueError("Output dir [{}] does not exist.".format(output_dir))
        work_prefix = "wps_process_{}_".format(PACKAGE_NAME)
        with TemporaryDirectory(prefix=work_prefix) as work_dir:
            LOGGER.debug("Fetching Metalink file: [%s]", metalink_reference)
            local_metalink = fetch_file(metalink_reference, work_dir, timeout=10, retry=3)
            LOGGER.debug("Reading Metalink file: [%s]", local_metalink)
            metalink_tree = xml_util.parse(local_metalink)
            LOGGER.debug("Parsing Metalink file references.")
            metaurl_query = "string(//metalink/file[{}]/metaurl)".format(str(index))
            nc_file_url = metalink_tree.xpath(metaurl_query)
            LOGGER.debug("Fetching NetCDF reference from Metalink file: [%s]", metalink_reference)
            LOGGER.debug("NetCDF file URL : %s", nc_file_url)
            fetch_file(nc_file_url, output_dir)
    except Exception as exc:
        # only a debug trace here; the error is re-raised so the top process monitor can log it
        LOGGER.debug("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc)
        raise
    LOGGER.info("Process '%s' execution completed.", PACKAGE_NAME)
def j2n(json_reference, output_dir):
    # type: (str, str) -> None
    """
    Fetch into ``output_dir`` every NetCDF file listed by a JSON file.

    The JSON document is fetched into a temporary directory and must contain a plain array in which
    every item is accepted by ``is_netcdf_url``. Each listed URL is then fetched to ``output_dir``.

    :param json_reference: Reference to the JSON file to fetch.
    :param output_dir: Existing directory where the listed files are fetched.
    :raises ValueError: If ``output_dir`` is not an existing directory or the JSON contents are invalid.
    """
    LOGGER.info("Process '%s' execution starting...", PACKAGE_NAME)
    LOGGER.debug("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir)
    try:
        if not os.path.isdir(output_dir):
            raise ValueError(f"Output dir [{output_dir}] does not exist.")
        with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as work_dir:
            LOGGER.debug("Fetching JSON file: [%s]", json_reference)
            local_json = fetch_file(json_reference, work_dir, timeout=10, retry=3)
            LOGGER.debug("Reading JSON file: [%s]", local_json)
            with open(local_json, mode="r", encoding="utf-8") as handle:
                references = json.load(handle)
            # valid only when the document is a list and every item passes the NetCDF URL check
            is_url_list = isinstance(references, list) and all(is_netcdf_url(ref) for ref in references)
            if not is_url_list:
                LOGGER.error("Invalid JSON: [%s]", references)
                raise ValueError("Invalid JSON file format, expected a plain array of NetCDF file URL strings.")
            LOGGER.debug("Parsing JSON file references.")
            for nc_url in references:
                LOGGER.debug("Fetching NetCDF reference from JSON file: [%s]", nc_url)
                fetch_file(nc_url, output_dir, timeout=10, retry=3)
    except Exception as exc:
        # only a debug trace here; the error is re-raised so the top process monitor can log it
        LOGGER.debug("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc)
        raise
    LOGGER.info("Process '%s' execution completed.", PACKAGE_NAME)
def test_fetch_file_remote_with_request():
    """
    Test function :func:`weaver.utils.fetch_file` when the reference is an URL.

    Also validates retries of the failing request.
    The mocked request returns a server error on its first call and the file response on its second call.
    """
    tmp_dir = tempfile.gettempdir()
    with contextlib.ExitStack() as stack:
        tmp_json = stack.enter_context(tempfile.NamedTemporaryFile(dir=tmp_dir, mode="w", suffix=".json"))
        tmp_data = {"message": "fetch-file-request"}
        tmp_json.write(json.dumps(tmp_data))
        tmp_json.seek(0)
        tmp_name = os.path.split(tmp_json.name)[-1]
        tmp_http = "http://weaver.mock" + tmp_json.name
        tmp_retry = 2

        # share in below mocked_request, 'nonlocal' back compatible with Python 2
        tmp = {"retry": tmp_retry, "json": tmp_json, "http": tmp_http}

        def mocked_request(*args, **kwargs):  # noqa: E811
            tmp["retry"] -= 1
            if not tmp["retry"]:
                return mocked_file_response(tmp["json"].name, tmp["http"])
            resp = HTTPInternalServerError()  # internal retry expect at least a 5xx code to retry
            return resp  # will be available on next call (to test retries)

        stack.enter_context(mock.patch("requests.request", side_effect=mocked_request))
        stack.enter_context(mock.patch("requests.sessions.Session.request", side_effect=mocked_request))
        m_request = stack.enter_context(mock.patch("requests.Session.request", side_effect=mocked_request))

        # result directory is named after this test function
        res_dir = os.path.join(tmp_dir, inspect.currentframe().f_code.co_name)
        res_path = os.path.join(res_dir, tmp_name)
        try:
            make_dirs(res_dir, exist_ok=True)
            fetch_file(tmp_http, res_dir, retry=tmp_retry + 1)
            assert os.path.isfile(res_path), "File [{}] should be accessible under [{}]".format(tmp_http, res_path)
            assert m_request.call_count == 2, "Request method should have been called twice because of retries"
            # read through a context manager so the handle is closed before the directory is removed
            with open(res_path, mode="r", encoding="utf-8") as res_file:
                res_data = json.load(res_file)
            assert res_data == tmp_data, "File should be properly generated from HTTP reference"
        finally:
            # always remove the result directory, whether assertions passed or not
            shutil.rmtree(res_dir, ignore_errors=True)
def stage_results(self, results, expected_outputs, out_dir):
    # type: (JobResults, CWL_ExpectedOutputs, str) -> None
    """
    Stage the results of a remote :term:`Job` execution locally, into the specified output directory.

    The implementing remote :term:`Process` definition should call this operation after :meth:`execute`.
    Results whose ID is not in ``expected_outputs`` are ignored.

    .. note::
        The :term:`CWL` runner expects the output file(s) to be written matching the definition in
        ``expected_outputs``, but that definition could be a glob pattern matching multiple files and/or
        nested directories. Specific file names cannot be relied upon for mapping, since a glob can match
        many of them (eg: ``"*.txt"``).
    """
    for result in results:
        res_id = get_any_id(result)
        if res_id not in expected_outputs:
            continue

        # handled as a list to plan ahead for when multiple output values could be supported
        values = get_any_value(result)
        if not isinstance(values, list):
            values = [values]
        cwl_out_dir = out_dir.rstrip("/")

        for value in values:
            file_name = value.rsplit("/", 1)[-1]
            dst_path = f"{cwl_out_dir}/{file_name}"

            # performance improvement:
            #   Skip the download when the file can be resolved as a local resource
            #   (already fetched or same server). CWL expects the file under 'out_dir', so a link is made
            #   there even though the file itself stays in the full job output location
            #   (already staged by step).
            local_path = map_wps_output_location(value, self.settings)
            use_link = bool(local_path)
            if use_link:
                LOGGER.info(
                    "Detected result [%s] from [%s] as local reference to this instance. "
                    "Skipping fetch and using local copy in output destination: [%s]",
                    res_id, value, dst_path
                )
                LOGGER.debug("Mapped result [%s] to local reference: [%s]", value, local_path)
                source = local_path
            else:
                LOGGER.info(
                    "Fetching result [%s] from [%s] to CWL output destination: [%s]",
                    res_id, value, dst_path
                )
                source = value
            fetch_file(source, cwl_out_dir, settings=self.settings, link=use_link)
def _write_outputs(self, url, output_dir, expected_outputs):
    """
    Fetch the output NetCDF URL into a local directory, reporting status before and after.

    :param url: Reference of the output to fetch.
    :param output_dir: Directory passed to ``fetch_file`` as the destination.
    :param expected_outputs: Mapping whose values are checked (case-insensitively) for a ``.nc`` suffix.
    :raises NotImplementedError: If more than one expected output value ends with ``.nc``.
    """
    self.update_status("Downloading outputs.", Percent.COMPUTE_DONE, STATUS_RUNNING)

    # only a single NetCDF output is supported
    netcdf_count = sum(1 for name in expected_outputs.values() if name.lower().endswith(".nc"))
    if netcdf_count > 1:
        raise NotImplementedError("Multiple outputs are not implemented")

    fetch_file(url, output_dir, settings=self.settings)
    self.update_status("Download successful.", Percent.FINISHED, STATUS_SUCCEEDED)
def host_file(self, file_path):
    """
    Hosts an intermediate file between :term:`Workflow` steps for processes that need external or remote access.

    A file already located under the WPS output directory is only mapped to its URL.
    Otherwise, it is linked into a new temporary directory under the WPS output directory, and that
    directory is recorded in ``self.temp_staging``.

    :param file_path: Intermediate file location (local path expected).
    :return: Hosted temporary HTTP file location.
    """
    out_url = get_wps_output_url(self.settings)
    out_dir = get_wps_output_dir(self.settings)

    # drop any 'file://' scheme and resolve links, in case CWL->WPS outputs link was made
    real_path = os.path.realpath(file_path.replace("file://", ""))

    if real_path.startswith(out_dir):
        hosted_href = real_path.replace(out_dir, out_url, 1)
        LOGGER.debug("Hosting file [%s] skipped since already on WPS outputs as [%s]", real_path, hosted_href)
        return hosted_href

    staging_dir = tempfile.mkdtemp(dir=out_dir)
    linked_path = fetch_file(real_path, staging_dir, self.settings, link=True)
    hosted_href = linked_path.replace(out_dir, out_url, 1)
    self.temp_staging.add(staging_dir)
    LOGGER.debug("Hosting file [%s] as [%s] on [%s]", real_path, linked_path, hosted_href)
    return hosted_href
def results(self, job_reference, out_dir=None, download=False, url=None):
    # type: (str, Optional[str], bool, Optional[str]) -> OperationResult
    """
    Obtain the results of a successful :term:`Job` execution.

    When ``download`` is requested, every result item providing an ``href`` is fetched into ``out_dir``
    and the local location is added to that item under the ``path`` key. Both single-item outputs and
    outputs defined as a list of items are considered.

    :param job_reference: Either the full :term:`Job` status URL or only its UUID.
    :param out_dir: Output directory where to store downloaded files if requested (default: CURDIR/JobID/<outputs>).
    :param download: Download any file reference found within results (CAUTION: could transfer lots of data!).
    :param url: Instance URL if not already provided during client creation.
    :returns: Result details and local paths if downloaded.
    """
    job_id, job_url = self._parse_job_ref(job_reference, url)
    status = self.status(job_url)
    if not status.success:
        return OperationResult(False, "Cannot process results from incomplete or failed job.", status.body)

    # use results endpoint instead of outputs to be OGC-API compliant, should be able to target non-Weaver instance
    # with this endpoint, outputs IDs are directly at the root of the body
    result_url = f"{job_url}/results"
    resp = request_extra("GET", result_url, headers=self._headers, settings=self._settings)
    res_out = self._parse_result(resp)
    outputs = res_out.body
    if not res_out.success or not isinstance(res_out.body, dict):
        return OperationResult(False, "Could not retrieve any output results from job.", outputs)
    if not download:
        return OperationResult(True, "Listing job results.", outputs)

    def _as_items(value):
        """Normalize an output value to the list of its items."""
        return value if isinstance(value, list) else [value]

    def _has_href(item):
        """Only mapping items can hold a file reference; raw literal values cannot."""
        return isinstance(item, dict) and "href" in item

    # download file results
    # look within list outputs as well, otherwise outputs made only of arrays of references are rejected
    if not any(_has_href(item) for value in outputs.values() for item in _as_items(value)):
        return OperationResult(False, "Outputs were found but none are downloadable (only raw values?).", outputs)
    if not out_dir:
        out_dir = os.path.join(os.path.realpath(os.path.curdir), job_id)
    os.makedirs(out_dir, exist_ok=True)
    LOGGER.info("Will store job [%s] output results in [%s]", job_id, out_dir)
    for output, value in outputs.items():
        is_list = isinstance(value, list)
        for i, item in enumerate(_as_items(value)):
            if not _has_href(item):
                continue
            file_path = fetch_file(item["href"], out_dir, link=False)
            if is_list:
                outputs[output][i]["path"] = file_path
            else:
                outputs[output]["path"] = file_path
    return OperationResult(True, "Retrieved job results.", outputs)
def test_fetch_file_local_links():
    """
    Test handling of symbolic links by function :func:`weaver.utils.fetch_file` for local files.

    Each case is a tuple of source path (real file or link to it), requested ``link`` option, and whether
    the fetched result is expected to be a symbolic link.
    """
    tmp_dir = tempfile.gettempdir()
    src_dir = os.path.join(tmp_dir, str(uuid.uuid4()))
    dst_dir = os.path.join(tmp_dir, str(uuid.uuid4()))
    try:
        make_dirs(src_dir, exist_ok=True)
        make_dirs(dst_dir, exist_ok=True)
        with tempfile.NamedTemporaryFile(dir=src_dir, mode="w", suffix=".json") as tmp_json:
            tmp_data = {"message": "fetch-file-link"}
            tmp_json.write(json.dumps(tmp_data))
            tmp_json.seek(0)
            tmp_file = tmp_json.name
            tmp_path, tmp_name = os.path.split(tmp_file)
            tmp_link = os.path.join(tmp_path, "link.json")
            os.symlink(tmp_file, tmp_link)
            dst_path = os.path.join(dst_dir, tmp_name)
            for src_path, as_link, result_link in [
                (tmp_file, True, True),
                (tmp_file, False, False),
                (tmp_file, None, False),
                (tmp_link, True, True),
                (tmp_link, False, False),
                (tmp_link, None, True),
            ]:
                # start each case without the result of the previous one
                if os.path.exists(dst_path):
                    os.remove(dst_path)
                fetch_file(src_path, dst_dir, link=as_link)
                assert os.path.isfile(dst_path), (
                    f"File [{tmp_file}] should be accessible under [{dst_path}]. "
                    f"Failed with: {(src_path, as_link, result_link)}"
                )
                if result_link:
                    assert os.path.islink(dst_path), "Result is not a link when it is expected to be one."
                else:
                    assert not os.path.islink(dst_path), "Result is a link when it is expected not to be one."
                # read through a context manager so the handle is closed before the file is removed
                with open(dst_path, mode="r", encoding="utf-8") as dst_file:
                    dst_data = json.load(dst_file)
                assert dst_data == tmp_data, "File should be properly copied/referenced from original"
    except OSError as exc:
        pytest.fail(f"Unexpected error raised during test: [{exc!s}]")
    finally:
        shutil.rmtree(src_dir, ignore_errors=True)
        shutil.rmtree(dst_dir, ignore_errors=True)
def test_fetch_file_remote_s3_bucket():
    """
    Test that :func:`fetch_file` retrieves a file from a mocked S3 bucket reference.

    The fetched file must be located in the output directory under the original file name,
    with the contents that were given to the mocked bucket helper.
    """
    bucket_name = "test-fake-bucket"
    file_name = "test-file.txt"
    file_data = "dummy file"
    with tempfile.TemporaryDirectory() as out_dir:
        bucket_ref = mocked_aws_s3_bucket_test_file(bucket_name, file_name, file_data)
        fetched = fetch_file(bucket_ref, out_dir)
        expected = os.path.join(out_dir, file_name)
        assert fetched == expected
        assert os.path.isfile(fetched)
        with open(fetched, mode="r") as handle:
            contents = handle.read()
            assert contents == file_data
def test_execute_docker_embedded_python_script(self):
    """
    Deploy and execute the ``docker-python-script-report`` example, then validate the produced report.

    The ``quote`` output is requested by reference, fetched into a temporary directory, and its text
    must match the order total computed from the submitted ``amount`` and ``cost`` inputs.
    """
    process_id = "test-docker-python-script"
    cwl_path = os.path.join(WEAVER_ROOT_DIR, "docs/examples/docker-python-script-report.cwl")
    package = load_file(cwl_path)
    deploy_body = {
        "processDescription": {"process": {"id": process_id}},
        "executionUnit": [{"unit": package}],
        "deploymentProfileName": "http://www.opengis.net/profiles/eoc/dockerizedApplication",
    }
    self.deploy_process(deploy_body)

    cost = 2.45
    amount = 3
    exec_path = f"/processes/{process_id}/execution"
    exec_body = {
        "mode": EXECUTE_MODE_ASYNC,
        "response": EXECUTE_RESPONSE_DOCUMENT,
        "inputs": [
            {"id": "amount", "value": amount},
            {"id": "cost", "value": cost},
        ],
        "outputs": [
            {"id": "quote", "transmissionMode": EXECUTE_TRANSMISSION_MODE_REFERENCE},
        ],
    }

    with contextlib.ExitStack() as stack:
        for exec_mock in mocked_execute_process():
            stack.enter_context(exec_mock)
        resp = mocked_sub_requests(self.app, "POST", exec_path, json=exec_body,
                                   headers=self.json_headers, only_local=True)
        job_status_url = resp.headers["Location"]
        results = self.monitor_job(job_status_url)

        quote_href = results["quote"]["href"]
        assert quote_href.startswith("http")

        # the WPS output mock is only entered once the job results are available, before fetching the report
        stack.enter_context(mocked_wps_output(self.settings))
        out_dir = stack.enter_context(tempfile.TemporaryDirectory())
        report_path = fetch_file(results["quote"]["href"], out_dir, self.settings)
        report_text = load_file(report_path, text=True)
        assert report_text == f"Order Total: {amount * cost:0.2f}$\n"
def stage_results(self, results, expected_outputs, out_dir):
    # type: (JobResults, CWL_ExpectedOutputs, str) -> None
    """
    Stage the results of a remote :term:`Job` execution locally, into the specified output directory.

    The implementing remote :term:`Process` definition should call this operation after :meth:`execute`.
    Results whose ID is not in ``expected_outputs`` are ignored.

    .. note::
        The :term:`CWL` runner expects the output file(s) to be written matching the definition in
        ``expected_outputs``, but that definition could be a glob pattern matching multiple files and/or
        nested directories. Specific file names cannot be relied upon for mapping, since a glob can match
        many of them (eg: ``"*.txt"``).

    .. seealso::
        Function :func:`weaver.processes.convert.any2cwl_io` defines a generic glob pattern using the output ID
        and the expected file extension based on the Content-Type format. Since a remote :term:`WPS`
        :term:`Process` doesn't necessarily produce file names with the output ID as expected to find them
        (could be anything), staging must patch locations to let the :term:`CWL` runtime resolve the files
        according to glob definitions.

    .. warning::
        Only remote :term:`Provider` implementations (which auto-generate a pseudo :term:`CWL` to map
        components) that produce outputs with inconsistent file names as described above should set attribute
        :attr:`WpsProcessInterface.stage_output_id_nested` accordingly. For a :term:`Process` that directly
        provides an actual :term:`CWL` :term:`Application Package` definition (e.g.: Docker application),
        auto-mapping of glob patterns should be avoided, as the :term:`CWL` is expected to contain the real
        mapping to be respected for correct execution and retrieval of outputs from the application.
    """
    for result in results:
        res_id = get_any_id(result)
        if res_id not in expected_outputs:
            continue

        # handled as a list to plan ahead for when multiple output values could be supported
        values = get_any_value(result)
        if not isinstance(values, list):
            values = [values]

        # outputs are optionally staged under a sub-directory named by the output ID
        cwl_out_dir = out_dir.rstrip("/")
        if self.stage_output_id_nested:
            cwl_out_dir = cwl_out_dir + "/" + res_id
        os.makedirs(cwl_out_dir, mode=0o700, exist_ok=True)

        for value in values:
            file_name = value.rsplit("/", 1)[-1]
            dst_path = f"{cwl_out_dir}/{file_name}"

            # performance improvement:
            #   Skip the download when the file can be resolved as a local resource
            #   (already fetched or same server). CWL expects the file under 'out_dir', so a link is made
            #   there even though the file itself stays in the full job output location
            #   (already staged by step).
            local_path = map_wps_output_location(value, self.settings)
            use_link = bool(local_path)
            if use_link:
                LOGGER.info(
                    "Detected result [%s] from [%s] as local reference to this instance. "
                    "Skipping fetch and using local copy in output destination: [%s]",
                    res_id, value, dst_path
                )
                LOGGER.debug("Mapped result [%s] to local reference: [%s]", value, local_path)
                source = local_path
            else:
                LOGGER.info(
                    "Fetching result [%s] from [%s] to CWL output destination: [%s]",
                    res_id, value, dst_path
                )
                source = value
            fetch_file(source, cwl_out_dir, settings=self.settings, link=use_link)
def test_fetch_file_http_content_disposition_filename():
    """
    Test the file name selected by :func:`fetch_file` according to the HTTP ``Content-Disposition`` header.

    The mocked endpoint URL ends with a pseudo-random name instead of the real file name.
    Each case pairs the expected resulting file name with the response headers that should produce it.
    """
    tmp_dir = tempfile.gettempdir()
    with contextlib.ExitStack() as stack:
        tmp_json = stack.enter_context(tempfile.NamedTemporaryFile(dir=tmp_dir, mode="w", suffix=".json"))  # noqa
        tmp_data = {"message": "fetch-file-request"}
        tmp_text = json.dumps(tmp_data)
        tmp_json.write(tmp_text)
        tmp_json.seek(0)

        tmp_random = "123456"
        tmp_normal = "spécial.json"
        tmp_escape = quote(tmp_normal)  # % characters
        tmp_name = os.path.split(tmp_json.name)[-1]
        tmp_http = f"http://weaver.mock/{tmp_random}"  # pseudo endpoint where file name is not directly visible

        def mock_response(__request, test_headers):
            # build the response headers from the headers received as argument (not from an outer variable),
            # and work on a copy so the test case definition is left untouched for error reporting
            response_headers = dict(test_headers)
            response_headers.update({
                "Content-Type": ContentType.APP_JSON,
                "Content-Length": str(len(tmp_text))
            })
            return 200, response_headers, tmp_text

        res_dir = os.path.join(tmp_dir, str(uuid.uuid4()))
        req_mock = stack.enter_context(responses.RequestsMock())
        try:
            make_dirs(res_dir, exist_ok=True)
            for target, headers in [
                (tmp_name, {
                    "Content-Disposition": f"attachment; filename=\"{tmp_name}\";filename*=UTF-8''{tmp_name}"
                }),
                (tmp_name, {  # unusual spacing/order does not matter
                    "Content-Disposition": f" filename*=UTF-8''{tmp_name};   filename=\"{tmp_name}\";attachment;"
                }),
                (tmp_name, {
                    "Content-Disposition": f"attachment; filename=\"{tmp_name}\""
                }),
                (tmp_name, {
                    "Content-Disposition": f"attachment; filename={tmp_name}"
                }),
                (tmp_normal, {
                    "Content-Disposition": f"attachment; filename=\"{tmp_normal}\";filename*=UTF-8''{tmp_escape}"
                }),
                (tmp_normal, {  # disallowed escape character in 'filename', but 'filename*' is valid and used first
                    "Content-Disposition": f"attachment; filename=\"{tmp_escape}\";filename*=UTF-8''{tmp_normal}"
                }),
                (tmp_random, {  # disallowed escape character in 'filename', reject since no alternative
                    "Content-Disposition": f"attachment; filename=\"{tmp_escape}\""
                }),
                (tmp_random, {  # empty header
                    "Content-Disposition": ""
                }),
                (tmp_random, {  # missing header
                }),
                (tmp_random, {  # missing filename
                    "Content-Disposition": "attachment"
                }),
                (tmp_random, {  # invalid filename
                    "Content-Disposition": "attachment; filename*=UTF-8''exec%20'echo%20test'"
                }),
                (tmp_random, {  # invalid encoding
                    "Content-Disposition": "attachment; filename*=random''%47%4F%4F%44.json"
                }),
                ("GOOD.json", {  # valid encoding and allowed characters after escape
                    "Content-Disposition": "attachment; filename*=UTF-8''%47%4F%4F%44.json"
                })
            ]:
                req_mock.remove("GET", tmp_http)  # reset previous iter
                # bind the current case headers as default argument to avoid late-binding of the loop variable
                req_mock.add_callback(
                    "GET", tmp_http,
                    callback=lambda req, case_headers=headers: mock_response(req, case_headers)
                )
                try:
                    res_path = fetch_file(tmp_http, res_dir)
                except Exception as exc:
                    raise AssertionError(
                        f"Unexpected exception when testing with: [{headers}]. Exception: [{exc}]"
                    ) from exc
                assert res_path == os.path.join(res_dir, target), f"Not expected name when testing with: [{headers}]"
                assert os.path.isfile(res_path), f"File [{tmp_http}] should be accessible under [{res_path}]"
                # read through a context manager so the handle is closed before the next case and cleanup
                with open(res_path, mode="r", encoding="utf-8") as res_file:
                    res_data = json.load(res_file)
                assert res_data == tmp_data, "File should be properly generated from HTTP reference"
        finally:
            # always remove the result directory, whether assertions passed or not
            shutil.rmtree(res_dir, ignore_errors=True)
def mocked_file_request(file_reference, file_outdir, **kwargs):
    """
    Fetch a file, first removing ``MOCK_HTTP_REF`` from references that start with it.

    Any other reference (including an empty one) is passed unchanged to ``fetch_file``.

    :param file_reference: Reference of the file to fetch.
    :param file_outdir: Output directory forwarded to ``fetch_file``.
    :param kwargs: Additional keyword arguments forwarded to ``fetch_file``.
    :return: Result of the ``fetch_file`` call.
    """
    is_mock_ref = bool(file_reference) and file_reference.startswith(MOCK_HTTP_REF)
    if is_mock_ref:
        file_reference = file_reference.replace(MOCK_HTTP_REF, "")
    return fetch_file(file_reference, file_outdir, **kwargs)