Python fetch_file Examples, weaver.utils.fetch_file Python Examples

Example #1

0

Show file

File: test_utils.py Project: 00mjk/weaver

def test_fetch_file_local_with_protocol():
    """
    Test function :func:`weaver.utils.fetch_file` when the reference is a pre-fetched local file.

    The same temporary JSON file is fetched twice: once referenced by its plain path and once with an
    explicit ``file://`` prefix. Both references must produce a file with identical JSON contents under
    the result directory.
    """
    tmp_dir = tempfile.gettempdir()
    with tempfile.NamedTemporaryFile(dir=tmp_dir, mode="w", suffix=".json") as tmp_json:
        tmp_data = {"message": "fetch-file-protocol"}
        tmp_json.write(json.dumps(tmp_data))
        tmp_json.seek(0)  # rewind; on a write-mode handle this also flushes the buffered content
        tmp_name = os.path.split(tmp_json.name)[-1]
        # result directory is named after this test function to avoid clashing with other tests
        res_dir = os.path.join(tmp_dir, inspect.currentframe().f_code.co_name)
        res_path = os.path.join(res_dir, tmp_name)
        try:
            make_dirs(res_dir, exist_ok=True)
            for protocol in ["", "file://"]:
                tmp_path = protocol + tmp_json.name
                fetch_file(tmp_path, res_dir)
                assert os.path.isfile(res_path), (
                    "File [{}] should be accessible under [{}]".format(tmp_path, res_path)
                )
                # read through a context manager so the handle is closed before the next iteration/cleanup
                with open(res_path) as res_file:
                    res_data = json.load(res_file)
                assert res_data == tmp_data, "File should be properly copied/referenced from original"
        finally:
            # always remove the result directory, even when an assertion fails
            shutil.rmtree(res_dir, ignore_errors=True)

Example #2

0

Show file

def m2n(metalink_reference, index, output_dir):
    # type: (str, int, str) -> None
    """
    Fetch into ``output_dir`` the NetCDF file referenced by one entry of a Metalink document.

    :param metalink_reference: Location of the Metalink file, fetched first into a temporary directory.
    :param index: Value inserted into the ``file[...]`` predicate of the XPath selecting the entry.
    :param output_dir: Existing directory where the NetCDF file is fetched.
    :raises ValueError: If ``output_dir`` is not an existing directory.
    """
    LOGGER.info(
        "Got arguments: metalink_reference=%s index=%s output_dir=%s", metalink_reference, index, output_dir
    )
    LOGGER.info("Process '%s' execution starting...", PACKAGE_NAME)
    LOGGER.debug("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir)
    try:
        if not os.path.isdir(output_dir):
            raise ValueError("Output dir [{}] does not exist.".format(output_dir))
        work_prefix = "wps_process_{}_".format(PACKAGE_NAME)
        with TemporaryDirectory(prefix=work_prefix) as work_dir:
            LOGGER.debug("Fetching Metalink file: [%s]", metalink_reference)
            metalink_path = fetch_file(metalink_reference, work_dir, timeout=10, retry=3)
            LOGGER.debug("Reading Metalink file: [%s]", metalink_path)
            metalink_tree = xml_util.parse(metalink_path)
            LOGGER.debug("Parsing Metalink file references.")
            metaurl_query = "string(//metalink/file[{!s}]/metaurl)".format(index)
            netcdf_url = metalink_tree.xpath(metaurl_query)
            LOGGER.debug("Fetching NetCDF reference from Metalink file: [%s]", metalink_reference)
            LOGGER.debug("NetCDF file URL : %s", netcdf_url)
            fetch_file(netcdf_url, output_dir)
    except Exception as err:
        # Only trace at debug level here and re-raise: the actual error will be logged by the top process monitor.
        LOGGER.debug("Process '%s' raised an exception: [%s]", PACKAGE_NAME, err)
        raise
    else:
        LOGGER.info("Process '%s' execution completed.", PACKAGE_NAME)

Example #3

0

Show file

File: jsonarray2netcdf.py Project: crim-ca/weaver

def j2n(json_reference, output_dir):
    # type: (str, str) -> None
    """
    Fetch into ``output_dir`` every NetCDF file listed by a JSON array document.

    :param json_reference: Location of the JSON file, fetched first into a temporary directory.
    :param output_dir: Existing directory where the NetCDF files are fetched.
    :raises ValueError:
        If ``output_dir`` is not an existing directory, or if the JSON content is not a list for which
        every item passes ``is_netcdf_url``.
    """
    LOGGER.info("Process '%s' execution starting...", PACKAGE_NAME)
    LOGGER.debug("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir)
    try:
        if not os.path.isdir(output_dir):
            raise ValueError(f"Output dir [{output_dir}] does not exist.")
        with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as work_dir:
            LOGGER.debug("Fetching JSON file: [%s]", json_reference)
            json_path = fetch_file(json_reference, work_dir, timeout=10, retry=3)
            LOGGER.debug("Reading JSON file: [%s]", json_path)
            with open(json_path, mode="r", encoding="utf-8") as json_file:
                netcdf_urls = json.load(json_file)
            is_url_array = isinstance(netcdf_urls, list) and all(is_netcdf_url(ref) for ref in netcdf_urls)
            if not is_url_array:
                LOGGER.error("Invalid JSON: [%s]", netcdf_urls)
                raise ValueError("Invalid JSON file format, expected a plain array of NetCDF file URL strings.")
            LOGGER.debug("Parsing JSON file references.")
            for netcdf_url in netcdf_urls:
                LOGGER.debug("Fetching NetCDF reference from JSON file: [%s]", netcdf_url)
                fetch_file(netcdf_url, output_dir, timeout=10, retry=3)
    except Exception as err:
        # Only trace at debug level here and re-raise: the actual error will be logged by the top process monitor.
        LOGGER.debug("Process '%s' raised an exception: [%s]", PACKAGE_NAME, err)
        raise
    else:
        LOGGER.info("Process '%s' execution completed.", PACKAGE_NAME)

Example #4

0

Show file

File: test_utils.py Project: 00mjk/weaver

def test_fetch_file_remote_with_request():
    """
    Test function :func:`weaver.utils.fetch_file` when the reference is an URL.
    Also validates retries of the failing request.

    The mocked request handler answers with an HTTP 500 error until its retry counter reaches zero,
    and only then serves the temporary JSON file. The fetch is therefore expected to succeed on the
    second request.
    """
    tmp_dir = tempfile.gettempdir()
    with contextlib.ExitStack() as stack:
        tmp_json = stack.enter_context(
            tempfile.NamedTemporaryFile(dir=tmp_dir, mode="w", suffix=".json"))
        tmp_data = {"message": "fetch-file-request"}
        tmp_json.write(json.dumps(tmp_data))
        tmp_json.seek(0)  # rewind; on a write-mode handle this also flushes the buffered content
        tmp_name = os.path.split(tmp_json.name)[-1]
        tmp_http = "http://weaver.mock" + tmp_json.name
        tmp_retry = 2

        # share in below mocked_request, 'nonlocal' back compatible with Python 2
        tmp = {"retry": tmp_retry, "json": tmp_json, "http": tmp_http}

        def mocked_request(*args, **kwargs):  # noqa: E811
            tmp["retry"] -= 1
            if not tmp["retry"]:
                # counter exhausted: serve the real file contents
                return mocked_file_response(tmp["json"].name, tmp["http"])
            # internal retry expects at least a 5xx code to retry,
            # the file will be available on next call (to test retries)
            return HTTPInternalServerError()

        stack.enter_context(
            mock.patch("requests.request", side_effect=mocked_request))
        stack.enter_context(
            mock.patch("requests.sessions.Session.request",
                       side_effect=mocked_request))
        m_request = stack.enter_context(
            mock.patch("requests.Session.request", side_effect=mocked_request))

        # result directory is named after this test function to avoid clashing with other tests
        res_dir = os.path.join(tmp_dir, inspect.currentframe().f_code.co_name)
        res_path = os.path.join(res_dir, tmp_name)
        try:
            make_dirs(res_dir, exist_ok=True)
            fetch_file(tmp_http, res_dir, retry=tmp_retry + 1)
            assert os.path.isfile(res_path), (
                "File [{}] should be accessible under [{}]".format(tmp_http, res_path)
            )
            assert m_request.call_count == 2, "Request method should have been called twice because of retries"
            # read through a context manager so the handle is closed before the directory is removed
            with open(res_path) as res_file:
                res_data = json.load(res_file)
            assert res_data == tmp_data, "File should be properly generated from HTTP reference"
        finally:
            # always remove the result directory, even when an assertion fails
            shutil.rmtree(res_dir, ignore_errors=True)

Example #5

0

Show file

File: wps_process_base.py Project: crim-ca/weaver

    def stage_results(self, results, expected_outputs, out_dir):
        # type: (JobResults, CWL_ExpectedOutputs, str) -> None
        """
        Stages the results of the remote :term:`Job` execution locally, under the given output directory.

        The implementing remote :term:`Process` definition is expected to call this after :meth:`execute`.
        Results whose ID is not listed in ``expected_outputs`` are ignored.

        .. note::
            File(s) must be written where the :term:`CWL` runner looks for them per ``expected_outputs``.
            Because that definition can be a glob pattern (eg: ``"*.txt"``) matching multiple files and/or
            nested directories, specific file names cannot be relied upon for the mapping.
        """
        for result in results:
            res_id = get_any_id(result)
            if res_id not in expected_outputs:
                continue

            # normalize to a list, planning ahead for when multiple output values could be supported
            values = get_any_value(result)
            values = values if isinstance(values, list) else [values]
            cwl_out_dir = out_dir.rstrip("/")
            for value in values:
                dst_path = cwl_out_dir + "/" + value.rsplit("/", 1)[-1]
                # Performance improvement:
                #   When the reference resolves to a local resource (already fetched or same server), skip
                #   the download. CWL still expects to find the file under 'out_dir', so a link is created
                #   there while the file itself remains in the full job output location (staged by step).
                local_path = map_wps_output_location(value, self.settings)
                if local_path:
                    LOGGER.info(
                        "Detected result [%s] from [%s] as local reference to this instance. "
                        "Skipping fetch and using local copy in output destination: [%s]",
                        res_id, value, dst_path)
                    LOGGER.debug("Mapped result [%s] to local reference: [%s]", value, local_path)
                else:
                    LOGGER.info(
                        "Fetching result [%s] from [%s] to CWL output destination: [%s]",
                        res_id, value, dst_path)
                fetch_file(local_path or value, cwl_out_dir, settings=self.settings, link=bool(local_path))

Example #6

0

Show file

    def _write_outputs(self, url, output_dir, expected_outputs):
        """
        Fetch the output referenced by ``url`` into ``output_dir``, reporting status before and after.

        :raises NotImplementedError: If more than one expected output value ends with ``.nc``.
        """
        self.update_status("Downloading outputs.", Percent.COMPUTE_DONE, STATUS_RUNNING)

        netcdf_count = sum(1 for name in expected_outputs.values() if name.lower().endswith(".nc"))
        if netcdf_count > 1:
            raise NotImplementedError("Multiple outputs are not implemented")

        fetch_file(url, output_dir, settings=self.settings)

        self.update_status("Download successful.", Percent.FINISHED, STATUS_SUCCEEDED)

Example #7

0

Show file

    def host_file(self, file_path):
        """
        Exposes an intermediate :term:`Workflow` step file for processes needing external or remote access.

        A file already located under the WPS output directory is only mapped to its URL. Any other file is
        linked into a new temporary directory under the WPS output directory, and that directory is added
        to ``self.temp_staging``.

        :param file_path: Intermediate file location (local path expected).
        :return: Hosted temporary HTTP file location.
        """
        wps_out_url = get_wps_output_url(self.settings)
        wps_out_dir = get_wps_output_dir(self.settings)
        # drop the scheme in case a CWL->WPS outputs link was made, then resolve to the real location
        local_path = os.path.realpath(file_path.replace("file://", ""))

        if local_path.startswith(wps_out_dir):
            hosted_href = local_path.replace(wps_out_dir, wps_out_url, 1)
            LOGGER.debug("Hosting file [%s] skipped since already on WPS outputs as [%s]", local_path, hosted_href)
            return hosted_href

        staging_dir = tempfile.mkdtemp(dir=wps_out_dir)
        hosted_link = fetch_file(local_path, staging_dir, self.settings, link=True)
        hosted_href = hosted_link.replace(wps_out_dir, wps_out_url, 1)
        self.temp_staging.add(staging_dir)
        LOGGER.debug("Hosting file [%s] as [%s] on [%s]", local_path, hosted_link, hosted_href)
        return hosted_href

Example #8

0

Show file

    def results(self, job_reference, out_dir=None, download=False, url=None):
        # type: (str, Optional[str], bool, Optional[str]) -> OperationResult
        """
        Obtain the results of a successful :term:`Job` execution.

        When ``download`` is requested, every output item providing an ``href`` is fetched into ``out_dir``
        and the resulting local location is stored under the ``path`` key of that same item. Outputs can be
        either a single definition or a list of definitions; both forms are considered when looking for
        downloadable references.

        :param job_reference: Either the full :term:`Job` status URL or only its UUID.
        :param out_dir: Output directory where to store downloaded files if requested (default: CURDIR/JobID/<outputs>).
        :param download: Download any file reference found within results (CAUTION: could transfer lots of data!).
        :param url: Instance URL if not already provided during client creation.
        :returns: Result details and local paths if downloaded.
        """
        def _as_items(value):
            # an output is either a single definition or a list of them, always handle it as a list
            return value if isinstance(value, list) else [value]

        def _is_file_ref(item):
            # only mappings can carry an 'href', other items (raw values) are never downloadable
            return isinstance(item, dict) and "href" in item

        job_id, job_url = self._parse_job_ref(job_reference, url)
        status = self.status(job_url)
        if not status.success:
            return OperationResult(
                False, "Cannot process results from incomplete or failed job.",
                status.body)
        # use results endpoint instead of outputs to be OGC-API compliant, should be able to target non-Weaver instance
        # with this endpoint, outputs IDs are directly at the root of the body
        result_url = f"{job_url}/results"
        resp = request_extra("GET",
                             result_url,
                             headers=self._headers,
                             settings=self._settings)
        res_out = self._parse_result(resp)
        outputs = res_out.body
        if not res_out.success or not isinstance(res_out.body, dict):
            return OperationResult(
                False, "Could not retrieve any output results from job.",
                outputs)
        if not download:
            return OperationResult(True, "Listing job results.", outputs)

        # download file results
        # look inside list outputs as well, otherwise an array of file references would be reported as
        # non-downloadable even though the download loop below supports it
        if not any(_is_file_ref(item) for value in outputs.values() for item in _as_items(value)):
            return OperationResult(
                False,
                "Outputs were found but none are downloadable (only raw values?).",
                outputs)
        if not out_dir:
            out_dir = os.path.join(os.path.realpath(os.path.curdir), job_id)
        os.makedirs(out_dir, exist_ok=True)
        LOGGER.info("Will store job [%s] output results in [%s]", job_id,
                    out_dir)
        for value in outputs.values():
            for item in _as_items(value):
                if _is_file_ref(item):
                    # 'item' is the very object held by 'outputs', so the path is reported in the returned body
                    item["path"] = fetch_file(item["href"], out_dir, link=False)
        return OperationResult(True, "Retrieved job results.", outputs)

Example #9

0

Show file

File: test_utils.py Project: crim-ca/weaver

def test_fetch_file_local_links():
    """
    Test handling of symbolic links by function :func:`weaver.utils.fetch_file` for local files.

    Each case is a tuple of the source path (a real file or a symbolic link to it), the ``link`` argument
    passed to the function, and whether the file produced in the destination directory is expected to be
    a symbolic link.
    """
    tmp_dir = tempfile.gettempdir()
    src_dir = os.path.join(tmp_dir, str(uuid.uuid4()))
    dst_dir = os.path.join(tmp_dir, str(uuid.uuid4()))
    try:
        make_dirs(src_dir, exist_ok=True)
        make_dirs(dst_dir, exist_ok=True)
        with tempfile.NamedTemporaryFile(dir=src_dir, mode="w", suffix=".json") as tmp_json:
            tmp_data = {"message": "fetch-file-link"}
            tmp_json.write(json.dumps(tmp_data))
            tmp_json.seek(0)  # rewind; on a write-mode handle this also flushes the buffered content
            tmp_file = tmp_json.name
            tmp_path, tmp_name = os.path.split(tmp_file)
            tmp_link = os.path.join(tmp_path, "link.json")
            os.symlink(tmp_file, tmp_link)
            dst_path = os.path.join(dst_dir, tmp_name)
            for src_path, as_link, result_link in [
                (tmp_file, True, True),
                (tmp_file, False, False),
                (tmp_file, None, False),
                (tmp_link, True, True),
                (tmp_link, False, False),
                (tmp_link, None, True),
            ]:
                # start every case from a clean destination
                if os.path.exists(dst_path):
                    os.remove(dst_path)
                fetch_file(src_path, dst_dir, link=as_link)
                assert os.path.isfile(dst_path), (
                    f"File [{tmp_file}] should be accessible under [{dst_path}]. "
                    f"Failed with: {(src_path, as_link, result_link)}"
                )
                if result_link:
                    assert os.path.islink(dst_path), "Result is not a link when it is expected to be one."
                else:
                    assert not os.path.islink(dst_path), "Result is a link when it is expected not to be one."
                # read through a context manager so no handle stays open when the file is removed next iteration
                with open(dst_path) as dst_file:
                    dst_data = json.load(dst_file)
                assert dst_data == tmp_data, "File should be properly copied/referenced from original"
    except OSError as exc:
        pytest.fail(f"Unexpected error raised during test: [{exc!s}]")
    finally:
        shutil.rmtree(src_dir, ignore_errors=True)
        shutil.rmtree(dst_dir, ignore_errors=True)

Example #10

0

Show file

File: test_utils.py Project: crim-ca/weaver

def test_fetch_file_remote_s3_bucket():
    """
    Test function :func:`weaver.utils.fetch_file` with a reference produced by the mocked S3 bucket helper.
    """
    file_name = "test-file.txt"
    file_data = "dummy file"
    bucket_name = "test-fake-bucket"
    with tempfile.TemporaryDirectory() as out_dir:
        bucket_ref = mocked_aws_s3_bucket_test_file(bucket_name, file_name, file_data)
        fetched_path = fetch_file(bucket_ref, out_dir)
        expected_path = os.path.join(out_dir, file_name)
        assert fetched_path == expected_path
        assert os.path.isfile(fetched_path)
        with open(fetched_path, mode="r") as fetched_file:
            assert fetched_file.read() == file_data

Example #11

0

Show file

    def test_execute_docker_embedded_python_script(self):
        """
        Deploy the process defined by ``docs/examples/docker-python-script-report.cwl`` and execute it.

        The ``quote`` output is requested by reference. Its link is then fetched to validate that the
        report contents correspond to the submitted ``amount`` and ``cost`` inputs.
        """
        test_proc = "test-docker-python-script"
        cwl_path = os.path.join(WEAVER_ROOT_DIR, "docs/examples/docker-python-script-report.cwl")
        cwl = load_file(cwl_path)
        deploy_body = {
            "processDescription": {"process": {"id": test_proc}},
            "executionUnit": [{"unit": cwl}],
            "deploymentProfileName": "http://www.opengis.net/profiles/eoc/dockerizedApplication",
        }
        self.deploy_process(deploy_body)

        with contextlib.ExitStack() as stack:
            for mock_exec in mocked_execute_process():
                stack.enter_context(mock_exec)

            cost = 2.45
            amount = 3
            exec_path = f"/processes/{test_proc}/execution"
            exec_inputs = [
                {"id": "amount", "value": amount},
                {"id": "cost", "value": cost},
            ]
            exec_outputs = [
                {"id": "quote", "transmissionMode": EXECUTE_TRANSMISSION_MODE_REFERENCE},
            ]
            exec_body = {
                "mode": EXECUTE_MODE_ASYNC,
                "response": EXECUTE_RESPONSE_DOCUMENT,
                "inputs": exec_inputs,
                "outputs": exec_outputs,
            }
            resp = mocked_sub_requests(
                self.app, "POST", exec_path, json=exec_body, headers=self.json_headers, only_local=True
            )
            results = self.monitor_job(resp.headers["Location"])

            assert results["quote"]["href"].startswith("http")
            # the WPS output mock is only entered once the job has completed, right before fetching the report
            stack.enter_context(mocked_wps_output(self.settings))
            report_dir = stack.enter_context(tempfile.TemporaryDirectory())
            report_file = fetch_file(results["quote"]["href"], report_dir, self.settings)
            report_data = load_file(report_file, text=True)
            assert report_data == f"Order Total: {amount * cost:0.2f}$\n"

Example #12

0

Show file

    def stage_results(self, results, expected_outputs, out_dir):
        # type: (JobResults, CWL_ExpectedOutputs, str) -> None
        """
        Stages the results of the remote :term:`Job` execution locally, under the given output directory.

        The implementing remote :term:`Process` definition is expected to call this after :meth:`execute`.
        Results whose ID is not listed in ``expected_outputs`` are ignored.

        .. note::
            File(s) must be written where the :term:`CWL` runner looks for them per ``expected_outputs``.
            Because that definition can be a glob pattern (eg: ``"*.txt"``) matching multiple files and/or
            nested directories, specific file names cannot be relied upon for the mapping.

        .. seealso::
            Function :func:`weaver.processes.convert.any2cwl_io` defines a generic glob pattern using the output ID
            and expected file extension based on Content-Type format. Since the remote :term:`WPS` :term:`Process`
            doesn't necessarily produce file names with the output ID as expected to find them (could be anything),
            staging must patch locations to let :term:`CWL` runtime resolve the files according to glob definitions.

        .. warning::
            Only remote :term:`Provider` implementations (which auto-generate a pseudo :term:`CWL` to map components)
            that produce outputs with inconsistent file names as described above should set attribute
            :attr:`WpsProcessInterface.stage_output_id_nested` accordingly. For :term:`Process` that directly provide
            an actual :term:`CWL` :term:`Application Package` definition (e.g.: Docker application), auto-mapping
            of glob patterns should be avoided, as it is expected that the :term:`CWL` contains real mapping to be
            respected for correct execution and retrieval of outputs from the application.
        """
        for result in results:
            res_id = get_any_id(result)
            if res_id not in expected_outputs:
                continue

            # normalize to a list, planning ahead for when multiple output values could be supported
            values = get_any_value(result)
            if not isinstance(values, list):
                values = [values]

            # with nested staging, each output gets its own sub-directory named by the output ID
            nested = self.stage_output_id_nested
            base_dir = out_dir.rstrip("/")
            cwl_out_dir = "/".join([base_dir, res_id]) if nested else base_dir
            os.makedirs(cwl_out_dir, mode=0o700, exist_ok=True)

            for value in values:
                dst_path = "/".join([cwl_out_dir, value.rsplit("/", 1)[-1]])
                # Performance improvement:
                #   When the reference resolves to a local resource (already fetched or same server), skip
                #   the download. CWL still expects to find the file under 'out_dir', so a link is created
                #   there while the file itself remains in the full job output location (staged by step).
                local_path = map_wps_output_location(value, self.settings)
                is_local = bool(local_path)
                if is_local:
                    LOGGER.info(
                        "Detected result [%s] from [%s] as local reference to this instance. "
                        "Skipping fetch and using local copy in output destination: [%s]",
                        res_id, value, dst_path)
                    LOGGER.debug("Mapped result [%s] to local reference: [%s]", value, local_path)
                else:
                    LOGGER.info(
                        "Fetching result [%s] from [%s] to CWL output destination: [%s]",
                        res_id, value, dst_path)
                src_path = local_path if is_local else value
                fetch_file(src_path, cwl_out_dir, settings=self.settings, link=is_local)

Example #13

0

Show file

File: test_utils.py Project: crim-ca/weaver

def test_fetch_file_http_content_disposition_filename():
    """
    Test the name of the file produced by :func:`weaver.utils.fetch_file` according to HTTP response headers.

    Each case provides the file name expected under the result directory and the response headers served
    by the mocked endpoint. The last URL fragment (``tmp_random``) is the expected name for the cases
    annotated as rejected or missing.
    """
    tmp_dir = tempfile.gettempdir()
    with contextlib.ExitStack() as stack:
        tmp_json = stack.enter_context(tempfile.NamedTemporaryFile(dir=tmp_dir, mode="w", suffix=".json"))  # noqa
        tmp_data = {"message": "fetch-file-request"}
        tmp_text = json.dumps(tmp_data)
        tmp_json.write(tmp_text)
        tmp_json.seek(0)

        tmp_random = "123456"
        tmp_normal = "spécial.json"
        tmp_escape = quote(tmp_normal)  # % characters
        tmp_name = os.path.split(tmp_json.name)[-1]
        tmp_http = f"http://weaver.mock/{tmp_random}"  # pseudo endpoint where file name is not directly visible

        def mock_response(__request, test_headers):
            # work on a copy to leave the test case definition untouched (it is reported on failure),
            # and return what was received as parameter rather than a variable of the enclosing scope
            resp_headers = dict(test_headers)
            resp_headers.update({
                "Content-Type": ContentType.APP_JSON,
                "Content-Length": str(len(tmp_text))
            })
            return 200, resp_headers, tmp_text

        res_dir = os.path.join(tmp_dir, str(uuid.uuid4()))
        req_mock = stack.enter_context(responses.RequestsMock())
        try:
            make_dirs(res_dir, exist_ok=True)
            for target, headers in [
                (tmp_name, {
                    "Content-Disposition": f"attachment; filename=\"{tmp_name}\";filename*=UTF-8''{tmp_name}"
                }),
                (tmp_name, {  # unusual spacing/order does not matter
                    "Content-Disposition": f" filename*=UTF-8''{tmp_name};   filename=\"{tmp_name}\";attachment;"
                }),
                (tmp_name, {
                    "Content-Disposition": f"attachment; filename=\"{tmp_name}\""
                }),
                (tmp_name, {
                    "Content-Disposition": f"attachment; filename={tmp_name}"
                }),
                (tmp_normal, {
                    "Content-Disposition": f"attachment; filename=\"{tmp_normal}\";filename*=UTF-8''{tmp_escape}"
                }),
                (tmp_normal, {  # disallowed escape character in 'filename', but 'filename*' is valid and used first
                    "Content-Disposition": f"attachment; filename=\"{tmp_escape}\";filename*=UTF-8''{tmp_normal}"
                }),
                (tmp_random, {  # disallowed escape character in 'filename', reject since no alternative
                    "Content-Disposition": f"attachment; filename=\"{tmp_escape}\""
                }),
                (tmp_random, {  # empty header
                    "Content-Disposition": ""
                }),
                (tmp_random, {  # missing header
                }),
                (tmp_random, {  # missing filename
                    "Content-Disposition": "attachment"
                }),
                (tmp_random, {  # invalid filename
                    "Content-Disposition": "attachment; filename*=UTF-8''exec%20'echo%20test'"
                }),
                (tmp_random, {  # invalid encoding
                    "Content-Disposition": "attachment; filename*=random''%47%4F%4F%44.json"
                }),
                ("GOOD.json", {  # valid encoding and allowed characters after escape
                    "Content-Disposition": "attachment; filename*=UTF-8''%47%4F%4F%44.json"
                })
            ]:
                req_mock.remove("GET", tmp_http)  # reset previous iter
                # bind the headers of this iteration as default argument to avoid late-binding of the loop variable
                req_mock.add_callback(
                    "GET", tmp_http,
                    callback=lambda req, _headers=headers: mock_response(req, _headers)
                )
                try:
                    res_path = fetch_file(tmp_http, res_dir)
                except Exception as exc:
                    # keep the original error chained to ease debugging of the failing case
                    raise AssertionError(
                        f"Unexpected exception when testing with: [{headers}]. Exception: [{exc}]"
                    ) from exc
                assert res_path == os.path.join(res_dir, target), f"Not expected name when testing with: [{headers}]"
                assert os.path.isfile(res_path), f"File [{tmp_http}] should be accessible under [{res_path}]"
                # read through a context manager so no handle stays open across iterations
                with open(res_path) as res_file:
                    res_data = json.load(res_file)
                assert res_data == tmp_data, "File should be properly generated from HTTP reference"
        finally:
            # always remove the result directory, even when an assertion fails
            shutil.rmtree(res_dir, ignore_errors=True)

Example #14

0

Show file

 def mocked_file_request(file_reference, file_outdir, **kwargs):
     """
     Fetch a file after removing ``MOCK_HTTP_REF`` from a reference that starts with it.

     Any other reference (including an empty or undefined one) is passed as-is to ``fetch_file``,
     along with the extra keyword arguments.
     """
     is_mock_ref = bool(file_reference) and file_reference.startswith(MOCK_HTTP_REF)
     if is_mock_ref:
         file_reference = file_reference.replace(MOCK_HTTP_REF, "")
     return fetch_file(file_reference, file_outdir, **kwargs)