Example #1
0
    def render(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        input_table: ArrowTable,
        params: Params,
        tab: Tab,
        fetch_result: Optional[FetchResult],
        output_filename: str,
    ) -> RenderResult:
        """Run the module's `render_thrift()` function and return its result.

        The request is built from the arguments (converted to their Thrift
        equivalents) and handed to `self._run_in_child()`. The module is given
        `basedir` as an absolute path relative to the chroot root, and the
        call is wrapped in `chroot_context.writable_file()` for
        `basedir / output_filename`. `chroot_context.clear_unowned_edits()` is
        always invoked afterwards, whether or not the call succeeded.

        `basedir` must be located inside the chroot root: `Path.relative_to()`
        raises `ValueError` otherwise.

        Raise ModuleError if the module has a bug. In particular, raise
        ModuleExitedError if the module reports a table filename other than
        `output_filename`, or if its result fails validation (in which case
        the `ValidateError` is attached as the exception's cause).
        """
        chroot_dir = chroot_context.chroot.root
        # Translate the host-side path into the path the module will be given.
        basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
        request = ttypes.RenderRequest(
            str(basedir_seen_by_module),
            arrow_arrow_table_to_thrift(input_table),
            arrow_params_to_thrift(params),
            arrow_tab_to_thrift(tab),
            (None if fetch_result is None else
             arrow_fetch_result_to_thrift(fetch_result)),
            output_filename,
        )
        try:
            with chroot_context.writable_file(basedir / output_filename):
                result = self._run_in_child(
                    chroot_dir=chroot_dir,
                    network_config=pyspawner.NetworkConfig(
                    ),  # TODO disallow networking
                    compiled_module=compiled_module,
                    timeout=self.render_timeout,
                    result=ttypes.RenderResult(),
                    function="render_thrift",
                    args=[request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        # An empty filename is accepted; any other name must be the one we
        # asked the module to write.
        if result.table.filename and result.table.filename != output_filename:
            raise ModuleExitedError(compiled_module.module_slug, 0,
                                    "Module wrote to wrong output file")

        try:
            # thrift_render_result_to_arrow() verifies all filenames passed by
            # the module are in the directory the module has access to. It
            # assumes the Arrow file (if there is one) is untrusted, so it can
            # raise ValidateError
            render_result = thrift_render_result_to_arrow(result, basedir)
        except ValidateError as err:
            # Chain explicitly so the traceback shows the validation failure
            # as the direct cause rather than as an incidental context.
            raise ModuleExitedError(
                compiled_module.module_slug,
                0,
                "Module produced invalid data: %s" % str(err),
            ) from err
        return render_result
Example #2
0
    def render(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        input_filename: str,
        params: Dict[str, Any],
        tab_name: str,
        fetch_result: Optional[FetchResult],
        tab_outputs: Dict[str, TabOutput],
        uploaded_files: Dict[str, UploadedFile],
        output_filename: str,
    ) -> RenderResult:
        """Run the module's `render_thrift()` function and return its result.

        The request is built from the arguments (converted to their Thrift
        equivalents) and handed to `self._run_in_child()`. The module is given
        `basedir` as an absolute path relative to the chroot root, and the
        call is wrapped in `chroot_context.writable_file()` for
        `basedir / output_filename`. `chroot_context.clear_unowned_edits()` is
        always invoked afterwards, whether or not the call succeeded.

        `tab_outputs` and `uploaded_files` are mappings: each value is
        converted to Thrift and the keys are passed through unchanged.
        NOTE(review): the `str` key type of `tab_outputs` is assumed to match
        `uploaded_files` -- confirm against the caller.

        `basedir` must be located inside the chroot root: `Path.relative_to()`
        raises `ValueError` otherwise.

        Raise ModuleError if the module has a bug.
        """
        chroot_dir = chroot_context.chroot.root
        # Translate the host-side path into the path the module will be given.
        basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
        request = ttypes.RenderRequest(
            basedir=str(basedir_seen_by_module),
            params=pydict_to_thrift_json_object(params),
            tab_name=tab_name,
            tab_outputs={
                k: arrow_tab_output_to_thrift(v)
                for k, v in tab_outputs.items()
            },
            uploaded_files={
                k: arrow_uploaded_file_to_thrift(v)
                for k, v in uploaded_files.items()
            },
            fetch_result=(None if fetch_result is None else
                          arrow_fetch_result_to_thrift(fetch_result)),
            output_filename=output_filename,
            input_filename=input_filename,
        )
        # Only these two module slugs get a NetworkConfig; every other module
        # is run with `network_config=None`.
        if compiled_module.module_slug in {"pythoncode", "ACS2016"}:
            # TODO disallow networking; make network_config always None
            network_config = pyspawner.NetworkConfig()
        else:
            network_config = None
        try:
            with chroot_context.writable_file(basedir / output_filename):
                result = self._run_in_child(
                    chroot_dir=chroot_dir,
                    network_config=network_config,
                    compiled_module=compiled_module,
                    timeout=self.render_timeout,
                    result=ttypes.RenderResult(),
                    function="render_thrift",
                    args=[request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        return thrift_render_result_to_arrow(result)
Example #3
0
    def fetch(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        params: Dict[str, Any],
        secrets: Dict[str, Any],
        last_fetch_result: Optional[FetchResult],
        input_parquet_filename: Optional[str],
        output_filename: str,
    ) -> FetchResult:
        """Invoke the module's `fetch_thrift()` function and return its result.

        The arguments are converted to Thrift, bundled into a `FetchRequest`
        and handed to `self._run_in_child()`. While the call runs,
        `basedir / output_filename` is wrapped in
        `chroot_context.writable_file()`; afterwards
        `chroot_context.clear_unowned_edits()` is always invoked.

        Raise ModuleError if the module has a bug. Raise ModuleExitedError if
        the module reports a filename other than `output_filename`.
        """
        root = chroot_context.chroot.root
        # Path handed to the module: `basedir` re-rooted at "/".
        module_basedir = Path("/") / basedir.relative_to(root)

        thrift_params = pydict_to_thrift_json_object(params)
        thrift_secrets = pydict_to_thrift_json_object(secrets)
        if last_fetch_result is None:
            thrift_last_result = None
        else:
            thrift_last_result = arrow_fetch_result_to_thrift(last_fetch_result)

        fetch_request = ttypes.FetchRequest(
            basedir=str(module_basedir),
            params=thrift_params,
            secrets=thrift_secrets,
            last_fetch_result=thrift_last_result,
            input_table_parquet_filename=input_parquet_filename,
            output_filename=output_filename,
        )

        try:
            with chroot_context.writable_file(basedir / output_filename):
                thrift_result = self._run_in_child(
                    chroot_dir=root,
                    network_config=pyspawner.NetworkConfig(),
                    compiled_module=compiled_module,
                    timeout=self.fetch_timeout,
                    result=ttypes.FetchResult(),
                    function="fetch_thrift",
                    args=[fetch_request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        # An empty filename is accepted; any other name must be the one we
        # asked the module to write.
        reported_filename = thrift_result.filename
        if reported_filename and reported_filename != output_filename:
            raise ModuleExitedError(
                compiled_module.module_slug,
                0,
                "Module wrote to wrong output file",
            )

        # TODO validate result isn't too large. If result is dataframe it makes
        # sense to truncate; but fetch results aren't necessarily data frames.
        # It's up to the module to enforce this logic ... but we need to set a
        # maximum file size.
        return thrift_fetch_result_to_arrow(thrift_result, basedir)
Example #4
0
async def execute_tab_flow(
    chroot_context: ChrootContext,
    workflow: Workflow,
    flow: TabFlow,
    tab_results: Dict[Tab, Optional[RenderResult]],
    output_path: Path,
) -> RenderResult:
    """Ensure `flow.tab.live_steps` all cache fresh render results.

    `tab_results.keys()` must be ordered as the Workflow's tabs are.

    Raise `UnneededExecution` if something changes underneath us such that we
    can't guarantee all render results will be fresh. (The remaining execution
    is "unneeded" because we assume another render has been queued.)

    WEBSOCKET NOTES: each step is executed in turn. After each execution,
    we notify clients of its new columns and status.

    Args:
        chroot_context: used to create the temporary buffer file, and passed
            through to `execute_step()`.
        workflow: passed to the render-cache loader and to `execute_step()`;
            its `id` appears in log messages.
        flow: supplies `steps`, `first_stale_index`, `tab` and `tab_slug`.
        tab_results: passed through unchanged to `execute_step()`.
        output_path: file that is meant to hold the final step's output (see
            the page-flipping comment below). The temporary buffer file is
            created in its parent directory.

    Returns:
        The result of the last step in `flow.steps`: either the value
        returned by `execute_step()` or, if no step had to be executed, the
        value loaded from the render cache. When there are no steps at all,
        an empty `RenderResult()`.
    """
    logger.debug(
        "Rendering Tab(%d, %s - %s)", workflow.id, flow.tab_slug, flow.tab.name
    )

    basedir = output_path.parent

    # Execute one module at a time.
    #
    # We don't hold any lock throughout the loop: the loop can take a long
    # time; it might be run multiple times simultaneously (even on
    # different computers); and `await` doesn't work with locks.
    #
    # We pass data between two Arrow files, kinda like double-buffering. The
    # two are `output_path` and `buffer_path`. This requires fewer temporary
    # files, so it's less of a hassle to clean up.
    with chroot_context.tempfile_context(
        dir=basedir, prefix="render-buffer", suffix=".arrow"
    ) as buffer_path:
        # We will render from `buffer_path` to `output_path` and from
        # `output_path` to `buffer_path`, alternating, so that the final output
        # is in `output_path` and we only use a single tempfile. (Think "page
        # flipping" in graphics.) Illustrated:
        #
        # [cache] -> A -> B -> C: A and C use `output_path`.
        # [cache] -> A -> B: cache and B use `output_path`.
        #
        # The same iterator is shared by the backward scan and the forward
        # execution loop below, so every `next()` call matters: the backward
        # scan assigns paths starting from the last step, and the forward loop
        # continues the alternation from wherever the scan stopped.
        step_output_paths = cycle([output_path, buffer_path])

        # Find the first stale step, going backwards. Build a to-do list (in
        # reverse).
        #
        # When render() exits, the render cache should be fresh for all steps.
        # "fresh" means `step.cached_render_result` returns non-None and
        # reading does not result in a `CorruptCacheError`. BUT it's really
        # expensive to check for `CorruptCacheError` all the time; so as an
        # optimization, we only check for `CorruptCacheError` when it prevents
        # us from loading a step's _input_. [2019-10-10, adamhooper] This rule
        # was created so that renderer can recover from `CorruptCacheError`
        # (instead of crashing completely). `CorruptCacheError` is still a
        # serious problem that needs human intervention.
        #
        # A _correct_ approach would be to read every step from the cache.
        #
        # Set `step_index` (first step that needs rendering) and `last_result`
        # (the input to `flow.steps[step_index]`)
        known_stale = flow.first_stale_index
        for step_index in range(len(flow.steps) - 1, -1, -1):
            # Consume one path per step visited, even for steps we skip, so
            # the alternation stays aligned with step positions.
            step_output_path = next(step_output_paths)
            if known_stale is not None and step_index >= known_stale:
                # We know this step needs to be rendered, from our
                # last_relevant_delta_id math.
                continue  # loop, decrementing `step_index`
            else:
                # This step _shouldn't_ need to be rendered. Load its output.
                # If we get CorruptCacheError, recover by backtracking another
                # step.
                step = flow.steps[step_index].step
                try:
                    # raise CorruptCacheError, UnneededExecution
                    last_result = await _load_step_output_from_rendercache(
                        workflow, step, step_output_path
                    )
                    # `last_result` will be the input into steps[step_index]
                    step_index += 1
                    break
                except CorruptCacheError:
                    # Log with traceback, then fall through to the next (earlier)
                    # step. `step_index` was not incremented, because the
                    # exception was raised before that statement ran.
                    logger.exception(
                        "Backtracking to recover from corrupt cache in wf-%d/wfm-%d",
                        workflow.id,
                        step.id,
                    )
                    # loop
        else:
            # Reached only when the loop finished without `break`: no cached
            # output could be loaded (or there are no steps).
            #
            # "Step minus-1" -- we need an input into flow.steps[0]
            #
            # fiddle with cycle's state -- `last_result` has no backing file;
            # but if it did, it would be `next(step_output_paths)`.
            next(step_output_paths)
            last_result = RenderResult()
            step_index = 0  # needed when there are no steps at all

        # Execute the remaining steps in order. `zip()` stops when the step
        # slice is exhausted; the cycle itself is endless.
        for step, step_output_path in zip(flow.steps[step_index:], step_output_paths):
            step_output_path.write_bytes(b"")  # don't leak data from two steps ago
            next_result = await execute_step(
                chroot_context=chroot_context,
                workflow=workflow,
                step=step.step,
                module_zipfile=step.module_zipfile,
                params=step.params,
                tab=flow.tab,
                input_result=last_result,
                tab_results=tab_results,
                output_path=step_output_path,
            )
            # Each step's output becomes the next step's input.
            last_result = next_result

        return last_result