Ejemplo n.º 1
0
    def render(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        input_table: ArrowTable,
        params: Params,
        tab: Tab,
        fetch_result: Optional[FetchResult],
        output_filename: str,
    ) -> RenderResult:
        """Invoke the module's `render_thrift()` in a child process.

        `basedir` must lie inside the chroot: the request sent to the module
        carries `basedir` re-expressed relative to the chroot root. The file
        `basedir / output_filename` is made writable for the duration of the
        call, and `chroot_context.clear_unowned_edits()` always runs
        afterwards, whether or not the call succeeded.

        Return the module's result, converted from Thrift.

        Raise ModuleError if the module has a bug. In particular, raise
        ModuleExitedError if the module reports an output filename other than
        `output_filename`, or if its result fails validation.
        """
        chroot_root = chroot_context.chroot.root
        module_basedir = Path("/") / basedir.relative_to(chroot_root)
        thrift_request = ttypes.RenderRequest(
            str(module_basedir),
            arrow_arrow_table_to_thrift(input_table),
            arrow_params_to_thrift(params),
            arrow_tab_to_thrift(tab),
            (
                arrow_fetch_result_to_thrift(fetch_result)
                if fetch_result is not None
                else None
            ),
            output_filename,
        )

        try:
            with chroot_context.writable_file(basedir / output_filename):
                thrift_result = self._run_in_child(
                    chroot_dir=chroot_root,
                    # TODO disallow networking
                    network_config=pyspawner.NetworkConfig(),
                    compiled_module=compiled_module,
                    timeout=self.render_timeout,
                    result=ttypes.RenderResult(),
                    function="render_thrift",
                    args=[thrift_request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        # An empty filename is accepted; any other name must be the one we
        # asked for.
        reported_filename = thrift_result.table.filename
        if reported_filename and reported_filename != output_filename:
            raise ModuleExitedError(
                compiled_module.module_slug,
                0,
                "Module wrote to wrong output file",
            )

        try:
            # thrift_render_result_to_arrow() verifies all filenames passed by
            # the module are in the directory the module has access to. It
            # assumes the Arrow file (if there is one) is untrusted, so it can
            # raise ValidateError
            return thrift_render_result_to_arrow(thrift_result, basedir)
        except ValidateError as err:
            raise ModuleExitedError(
                compiled_module.module_slug,
                0,
                "Module produced invalid data: %s" % str(err),
            )
Ejemplo n.º 2
0
    def test_execute_migrate_params_module_error_gives_default_params(
            self, fake_load_module):
        """If migrate_params() raises, render() is called with default params."""
        workflow = Workflow.create_and_init()
        first_tab = workflow.tabs.first()
        initial_delta = workflow.last_delta
        module_spec = {
            "id_name": "mod",
            "name": "Mod",
            "category": "Clean",
            "parameters": [
                {"type": "string", "id_name": "x", "default": "def"},
            ],
        }
        ModuleVersion.create_or_replace_from_spec(module_spec)
        first_tab.wf_modules.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=initial_delta.id,
            module_id_name="mod",
        )

        def fake_render(*args, params, **kwargs):
            # "def" is the default declared for "x" in the module spec above.
            expected_params = Params({"x": "def"})
            self.assertEqual(params, expected_params)
            return RenderResult(arrow_table({"A": [1]}))

        fake_module = fake_load_module.return_value
        # make migrate_params() raise an error.
        fake_module.migrate_params.side_effect = ModuleExitedError(-9, "")
        fake_module.render.side_effect = fake_render

        self._execute(workflow)

        fake_module.render.assert_called()
Ejemplo n.º 3
0
def _extract_from_chroot(chroot: Path, path: Path) -> None:
    """
    Replace `path` with a hard link to the module's copy inside `chroot`.

    A module that writes `/tmp/out.arrow` is really writing
    `/chroot-dir/tmp/out.arrow`, because it runs inside its chroot.

    (`path` is created before the chroot is, and the chroot is built from
    hard links -- so the module may have written straight through to
    `/tmp/out.arrow`. We do not rely on that: a module may just as well
    create a brand-new file instead of opening the existing one.)

    Either way, after this call `/tmp/out.arrow` is a hard link to
    `/chroot-dir/tmp/out.arrow`, which "copies" the data cheaply.

    The caller is responsible for restoring the file's permissions and
    attributes.

    Raise ModuleExitedError if the module tried to inject a symlink.
    """
    module_file = chroot / path.relative_to("/")

    # SECURITY: never read a symlink written by the module. It could point
    # at, say, "/etc/passwd", and then we would end up reading that.
    wrote_symlink = module_file.is_symlink()
    if wrote_symlink:
        raise ModuleExitedError(0, "SECURITY: module output a symlink")

    # os.link() refuses to overwrite, so remove the destination first.
    os.unlink(path)
    os.link(module_file, path)
Ejemplo n.º 4
0
 def test_exited_stack_trace(self):
     """Expect "exit code N: <last line of the log>" for a logged stack trace."""
     stderr_log = """\n  File "/app/cjwkernel/errors.py", line 1, in <module>\n    import signals\nModuleNotFoundError: No module named 'signals'\n"""
     message = format_for_user_debugging(ModuleExitedError(1, stderr_log))
     self.assertEqual(
         message,
         "exit code 1: ModuleNotFoundError: No module named 'signals'",
     )
Ejemplo n.º 5
0
    def fetch(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        params: Dict[str, Any],
        secrets: Dict[str, Any],
        last_fetch_result: Optional[FetchResult],
        input_parquet_filename: Optional[str],
        output_filename: str,
    ) -> FetchResult:
        """Invoke the module's `fetch_thrift()` in a child process.

        `basedir` must lie inside the chroot: the request sent to the module
        carries `basedir` re-expressed relative to the chroot root. The file
        `basedir / output_filename` is made writable for the duration of the
        call, and `chroot_context.clear_unowned_edits()` always runs
        afterwards, whether or not the call succeeded.

        Return the module's result, converted from Thrift.

        Raise ModuleError if the module has a bug. In particular, raise
        ModuleExitedError if the module reports an output filename other than
        `output_filename`.
        """
        chroot_root = chroot_context.chroot.root
        module_basedir = Path("/") / basedir.relative_to(chroot_root)
        thrift_request = ttypes.FetchRequest(
            basedir=str(module_basedir),
            params=pydict_to_thrift_json_object(params),
            secrets=pydict_to_thrift_json_object(secrets),
            last_fetch_result=(
                arrow_fetch_result_to_thrift(last_fetch_result)
                if last_fetch_result is not None
                else None
            ),
            input_table_parquet_filename=input_parquet_filename,
            output_filename=output_filename,
        )

        try:
            with chroot_context.writable_file(basedir / output_filename):
                thrift_result = self._run_in_child(
                    chroot_dir=chroot_root,
                    network_config=pyspawner.NetworkConfig(),
                    compiled_module=compiled_module,
                    timeout=self.fetch_timeout,
                    result=ttypes.FetchResult(),
                    function="fetch_thrift",
                    args=[thrift_request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        # An empty filename is accepted; any other name must be the one we
        # asked for.
        reported_filename = thrift_result.filename
        if reported_filename and reported_filename != output_filename:
            raise ModuleExitedError(
                compiled_module.module_slug,
                0,
                "Module wrote to wrong output file",
            )

        # TODO validate result isn't too large. If result is dataframe it makes
        # sense to truncate; but fetch results aren't necessarily data frames.
        # It's up to the module to enforce this logic ... but we need to set a
        # maximum file size.
        return thrift_fetch_result_to_arrow(thrift_result, basedir)
Ejemplo n.º 6
0
 def test_migrated_params_is_error(self):
     """A ModuleExitedError passed as an argument is reported as a bug.

     The error sits in the sixth positional slot of fetch_or_wrap_error();
     going by this test's name, that slot holds the migrated params.
     """
     with self.assertLogs("fetcher.fetch", level=logging.ERROR):
         module_zipfile = create_module_zipfile("mod")
         migrate_error = ModuleExitedError(
             "mod", 1, "Traceback:\n\n\nRuntimeError: bad")
         call_args = (
             self.ctx,
             self.chroot_context,
             self.basedir,
             "mod",
             module_zipfile,
             migrate_error,
             {},
             None,
             None,
             self.output_path,
         )
         result = fetch.fetch_or_wrap_error(*call_args)

     expected = self._bug_err("exit code 1: RuntimeError: bad")
     self.assertEqual(result, expected)
Ejemplo n.º 7
0
 def test_fetch_module_error(self, load_module):
     """A ModuleExitedError raised by the module's fetch() is reported as a bug."""
     fake_module = load_module.return_value
     fake_module.migrate_params.return_value = {}
     fake_module.fetch.side_effect = ModuleExitedError(1, "bad")

     with self.assertLogs(level=logging.ERROR):
         call_args = (
             self.ctx,
             self.basedir,
             WfModule(),
             MockModuleVersion(),
             {},
             None,
             None,
             self.output_path,
         )
         result = fetch.fetch_or_wrap_error(*call_args)

     expected = self._bug_err("exit code 1: bad")
     self.assertEqual(result, expected)
Ejemplo n.º 8
0
 def test_load_module_compile_error(self, load_module):
     """A ModuleExitedError from load_module leaves the output file empty.

     The returned bug error is expected to end with " (during load)".
     """
     load_module.side_effect = ModuleExitedError(1, "log")

     with self.assertLogs(level=logging.ERROR):
         call_args = (
             self.ctx,
             self.basedir,
             WfModule(),
             MockModuleVersion("bad"),
             {},
             None,
             None,
             self.output_path,
         )
         result = fetch.fetch_or_wrap_error(*call_args)

     output_size = self.output_path.stat().st_size
     self.assertEqual(output_size, 0)
     expected = self._bug_err("exit code 1: log (during load)")
     self.assertEqual(result, expected)
Ejemplo n.º 9
0
 def test_fetch_module_error(self):
     """A ModuleExitedError raised by kernel.fetch() is reported as a bug."""
     fetch_error = ModuleExitedError("mod", 1, "RuntimeError: bad")
     self.kernel.fetch.side_effect = fetch_error

     with self.assertLogs(level=logging.ERROR):
         call_args = (
             self.ctx,
             self.chroot_context,
             self.basedir,
             "mod",
             create_module_zipfile("mod"),
             {},
             {},
             None,
             None,
             self.output_path,
         )
         result = fetch.fetch_or_wrap_error(*call_args)

     expected = self._bug_err("exit code 1: RuntimeError: bad")
     self.assertEqual(result, expected)
Ejemplo n.º 10
0
    def render(
        self,
        compiled_module: CompiledModule,
        basedir: Path,
        input_table: ArrowTable,
        params: Params,
        tab: Tab,
        fetch_result: Optional[FetchResult],
        output_filename: str,
    ) -> RenderResult:
        """Invoke the module's `render_thrift()` in a chrooted child process.

        `basedir` is provided to the chroot and `basedir / output_filename`
        is listed as the path to extract from it.

        Return the module's result, converted from Thrift. If the result has
        a table, it is validated against its metadata before being returned.

        Raise ModuleExitedError if the module reports an output filename
        other than `output_filename`.
        """
        thrift_request = ttypes.RenderRequest(
            str(basedir),
            input_table.to_thrift(),
            params.to_thrift(),
            tab.to_thrift(),
            fetch_result.to_thrift() if fetch_result is not None else None,
            output_filename,
        )
        output_path = basedir / output_filename

        with _chroot_dir_context(
            provide_paths=[basedir], extract_paths=[output_path]
        ) as chroot:
            thrift_result = self._run_in_child(
                chroot=chroot,
                # TODO nix networking
                chroot_paths=(
                    [basedir] + DATA_PATHS + PARQUET_PATHS + NETWORKING_PATHS
                ),
                compiled_module=compiled_module,
                timeout=self.render_timeout,
                result=ttypes.RenderResult(),
                function="render_thrift",
                args=[thrift_request],
            )
            # This check runs while the chroot context is still open, so a
            # failure propagates through the context manager's exit.
            reported_filename = thrift_result.table.filename
            if reported_filename and reported_filename != output_filename:
                raise ModuleExitedError(0, "Module wrote to wrong output file")

        # RenderResult.from_thrift() verifies all filenames passed by the
        # module are in the directory the module has access to.
        render_result = RenderResult.from_thrift(thrift_result, basedir)
        output_table = render_result.table
        if output_table.table is not None:
            validate(output_table.table, output_table.metadata)
        return render_result
Ejemplo n.º 11
0
 def fetch(
     self,
     compiled_module: CompiledModule,
     basedir: Path,
     params: Params,
     secrets: Dict[str, Any],
     last_fetch_result: Optional[FetchResult],
     input_parquet_filename: str,
     output_filename: str,
 ) -> FetchResult:
     """Invoke the module's `fetch_thrift()` in a chrooted child process.

     `basedir` is provided to the chroot and `basedir / output_filename`
     is listed as the path to extract from it.

     Return the module's result, converted from Thrift.

     Raise ModuleExitedError if the module reports an output filename
     other than `output_filename`.
     """
     thrift_request = ttypes.FetchRequest(
         str(basedir),
         params.to_thrift(),
         RawParams(secrets).to_thrift(),
         (
             last_fetch_result.to_thrift()
             if last_fetch_result is not None
             else None
         ),
         input_parquet_filename,
         output_filename,
     )
     output_path = basedir / output_filename

     with _chroot_dir_context(
         provide_paths=[basedir], extract_paths=[output_path]
     ) as chroot:
         thrift_result = self._run_in_child(
             chroot=chroot,
             chroot_paths=(
                 [basedir] + DATA_PATHS + PARQUET_PATHS + NETWORKING_PATHS
             ),
             compiled_module=compiled_module,
             timeout=self.fetch_timeout,
             result=ttypes.FetchResult(),
             function="fetch_thrift",
             args=[thrift_request],
         )
         # This check runs while the chroot context is still open, so a
         # failure propagates through the context manager's exit.
         reported_filename = thrift_result.filename
         if reported_filename and reported_filename != output_filename:
             raise ModuleExitedError(0, "Module wrote to wrong output file")

     # TODO validate result isn't too large. If result is dataframe it makes
     # sense to truncate; but fetch results aren't necessarily data frames.
     # It's up to the module to enforce this logic ... but we need to set a
     # maximum file size.
     return FetchResult.from_thrift(thrift_result, basedir)
Ejemplo n.º 12
0
 def test_exited_sigsys(self):
     """Exit code -31 with an empty log is expected to format as "SIGSYS"."""
     # SIGSYS usually means "seccomp killed you"
     error = ModuleExitedError(-31, "")
     message = format_for_user_debugging(error)
     self.assertEqual(message, "SIGSYS")
Ejemplo n.º 13
0
 def test_exited_sigkill(self):
     """Exit code -9 with an empty log is expected to format as "SIGKILL"."""
     error = ModuleExitedError(-9, "")
     message = format_for_user_debugging(error)
     self.assertEqual(message, "SIGKILL")
Ejemplo n.º 14
0
    def _run_in_child(
        self,
        *,
        chroot: Path,
        chroot_paths: List[Path],
        compiled_module: CompiledModule,
        timeout: float,
        result: Any,
        function: str,
        args: List[Any],
    ) -> Any:
        """
        Fork a child process to run `function` with `args`.

        `args` must be Thrift data types. `result` must also be a Thrift type --
        its `.read()` function will be called, which may produce an error if
        the child process has a bug. (EOFError is very likely.)

        The child's stdout is parsed as the Thrift-encoded result; its stderr
        is collected as log text.

        Parameters:
            chroot: passed to the forkserver as the child's `chroot_dir`.
            chroot_paths: each path `p` is passed to the forkserver as the
                pair `(p, p)` in `chroot_provide_paths`.
            compiled_module: module to run. Its `module_slug` is used as the
                child's process name.
            timeout: seconds, counted from the start of this call, after
                which the child is killed.
            result: empty Thrift object to fill in from the child's stdout.
            function: name of the function the child should run.
            args: arguments for `function`.

        Return `result`, after `result.read()` has filled it in.

        Raise ModuleExitedError if the child process did not behave as expected:
        it exited with a non-zero code or was killed by a signal, its output
        ended early (EOFError while reading), or it wrote bytes beyond the
        end of the Thrift result. The error carries the exit code and the
        child's stderr text.

        Raise ModuleTimeoutError if it did not exit after a delay -- or if it
        closed its file descriptors long before it exited.

        Raise RuntimeError if the wait() status is neither "exited" nor
        "signaled".
        """
        # Absolute deadline. Computed up front because the `timeout` name is
        # rebound inside the loop below.
        limit_time = time.time() + timeout

        module_process = self._forkserver.spawn_module(
            process_name=compiled_module.module_slug,
            chroot_dir=chroot,
            chroot_provide_paths=[(p, p) for p in chroot_paths],
            args=[compiled_module, function, args],
        )

        # stdout is Thrift package; stderr is logs
        output_reader = ChildReader(module_process.stdout.fileno(),
                                    OUTPUT_BUFFER_MAX_BYTES)
        log_reader = ChildReader(module_process.stderr.fileno(),
                                 LOG_BUFFER_MAX_BYTES)
        # Read until the child closes its stdout and stderr
        with selectors.DefaultSelector() as selector:
            selector.register(output_reader.fileno, selectors.EVENT_READ)
            selector.register(log_reader.fileno, selectors.EVENT_READ)

            timed_out = False
            # get_map() is empty (falsy) once both readers are unregistered.
            while selector.get_map():
                remaining = limit_time - time.time()
                if remaining <= 0:
                    # Past the deadline: kill once, then keep draining the fds.
                    if not timed_out:
                        timed_out = True
                        module_process.kill(
                        )  # untrusted code could ignore SIGTERM
                    timeout = None  # wait as long as it takes for everything to die
                    # Fall through. After SIGKILL the child will close each fd,
                    # sending EOF to us. That means the selector _must_ return.
                else:
                    timeout = remaining  # wait until we reach our timeout

                events = selector.select(timeout=timeout)
                ready = frozenset(key.fd for key, _ in events)
                for reader in (output_reader, log_reader):
                    if reader.fileno in ready:
                        reader.ingest()
                        if reader.eof:
                            # EOF: stop watching this fd so the loop can end.
                            selector.unregister(reader.fileno)

        # The child closed its fds, so it should die soon. If it doesn't, that's
        # a bug -- so kill -9 it!
        #
        # os.wait() has no timeout option, and asyncio messes with signals so
        # we won't use those. Spin until the process dies, and force-kill if we
        # spin too long.
        for _ in range(DEAD_PROCESS_N_WAITS):
            pid, exit_status = module_process.wait(os.WNOHANG)
            if pid != 0:  # pid==0 means process is still running
                break
            time.sleep(DEAD_PROCESS_WAIT_POLL_INTERVAL)
        else:
            # for/else: reached only when the loop above never hit `break`.
            # we waited and waited. No luck. Dead module. Kill it.
            timed_out = True
            module_process.kill()
            _, exit_status = module_process.wait(0)
        # Normalize the wait() status: a normal exit gives its exit code; death
        # by signal gives the negated signal number.
        if os.WIFEXITED(exit_status):
            exit_code = os.WEXITSTATUS(exit_status)
        elif os.WIFSIGNALED(exit_status):
            exit_code = -os.WTERMSIG(exit_status)
        else:
            raise RuntimeError("Unhandled wait() status: %r" % exit_status)

        # A timeout takes precedence over whatever exit code the kill produced.
        if timed_out:
            raise ModuleTimeoutError

        if exit_code != 0:
            raise ModuleExitedError(exit_code, log_reader.to_str())

        # Parse the bytes collected from the child's stdout as one Thrift
        # binary-protocol message.
        transport = thrift.transport.TTransport.TMemoryBuffer(
            output_reader.buffer)
        protocol = thrift.protocol.TBinaryProtocol.TBinaryProtocol(transport)
        try:
            result.read(protocol)
        except EOFError:  # TODO handle other errors Thrift may throw
            raise ModuleExitedError(exit_code, log_reader.to_str()) from None

        # We should be at the end of the output now. If we aren't, that means
        # the child wrote too much.
        if transport.read(1) != b"":
            raise ModuleExitedError(exit_code, log_reader.to_str())

        # Success: still surface anything the child wrote to stderr.
        if log_reader.buffer:
            logger.info("Output from module process: %s", log_reader.to_str())

        return result
Ejemplo n.º 15
0
def invoke_render(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    input_filename: Optional[str],
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[FetchResult],
    tab_outputs: Dict[str, TabOutput],
    uploaded_files: Dict[str, UploadedFile],
    output_filename: str,
) -> LoadedRenderResult:
    """Run the module's `render` through the kernel and load what it wrote.

    The kernel writes to `basedir / output_filename`. A zero-byte output
    file is loaded as an empty table with no columns; anything else is
    loaded as an untrusted Arrow file.

    Raise `ModuleError` on error. (This is usually the module author's fault.)
    An output file that fails validation is raised as `ModuleExitedError`.

    Log any ModuleError. Also log success. A "begin" line is logged before
    the kernel call and a summary line (status and elapsed milliseconds) is
    logged afterwards, whatever the outcome.

    This synchronous method can be slow for complex modules or large
    datasets. Consider calling it from an executor.
    """
    started_at = time.time()
    log_prefix = "%s:render() (%0.1fMB input)"
    log_args = (
        module_zipfile.path.name,
        (
            0
            if input_filename is None
            else (basedir / input_filename).stat().st_size / 1024 / 1024
        ),
    )
    logger.info(log_prefix + " begin", *log_args)

    # Overwritten on every path we know about; "???" survives only if an
    # exception other than ModuleError escapes.
    status = "???"
    try:
        result = cjwstate.modules.kernel.render(
            module_zipfile.compile_code_without_executing(),
            chroot_context=chroot_context,
            basedir=basedir,
            input_filename=input_filename,
            params=params,
            tab_name=tab_name,
            fetch_result=fetch_result,
            tab_outputs=tab_outputs,
            uploaded_files=uploaded_files,
            output_filename=output_filename,
        )

        output_path = basedir / output_filename
        n_bytes = output_path.stat().st_size
        if n_bytes == 0:
            table, columns = pa.table({}), []
            status = "(no output)"
        else:
            try:
                table, columns = load_untrusted_arrow_file_with_columns(output_path)
            except ValidateError as err:
                raise ModuleExitedError(
                    module_zipfile.path.name,
                    0,
                    "Module wrote invalid data: %s" % str(err),
                )
            else:
                status = "(%drows, %dcols, %0.1fMB)" % (
                    table.num_rows,
                    table.num_columns,
                    n_bytes / 1024 / 1024,
                )

        return LoadedRenderResult(
            path=output_path,
            table=table,
            columns=columns,
            errors=result.errors,
            json=result.json,
        )
    except ModuleError as err:
        logger.exception("Exception in %s:render", module_zipfile.path.name)
        status = type(err).__name__
        raise
    finally:
        elapsed_ms = int((time.time() - started_at) * 1000)
        logger.info(
            log_prefix + " => %s in %dms",
            *log_args,
            status,
            elapsed_ms,
        )
Ejemplo n.º 16
0
 def render(*args, fetch_result, **kwargs):
     """Stand-in for render() that always raises ModuleExitedError(-9, "")."""
     exit_code, log = -9, ""
     raise ModuleExitedError(exit_code, log)