def render(
    self,
    compiled_module: CompiledModule,
    chroot_context: ChrootContext,
    basedir: Path,
    input_table: ArrowTable,
    params: Params,
    tab: Tab,
    fetch_result: Optional[FetchResult],
    output_filename: str,
) -> RenderResult:
    """Run the module's `render_thrift()` function and return its result.

    Raise ModuleError if the module has a bug.
    """
    chroot_dir = chroot_context.chroot.root
    basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
    request = ttypes.RenderRequest(
        str(basedir_seen_by_module),
        arrow_arrow_table_to_thrift(input_table),
        arrow_params_to_thrift(params),
        arrow_tab_to_thrift(tab),
        (
            None
            if fetch_result is None
            else arrow_fetch_result_to_thrift(fetch_result)
        ),
        output_filename,
    )
    try:
        with chroot_context.writable_file(basedir / output_filename):
            result = self._run_in_child(
                chroot_dir=chroot_dir,
                network_config=pyspawner.NetworkConfig(),  # TODO disallow networking
                compiled_module=compiled_module,
                timeout=self.render_timeout,
                result=ttypes.RenderResult(),
                function="render_thrift",
                args=[request],
            )
    finally:
        chroot_context.clear_unowned_edits()

    if result.table.filename and result.table.filename != output_filename:
        raise ModuleExitedError(
            compiled_module.module_slug, 0, "Module wrote to wrong output file"
        )

    try:
        # thrift_render_result_to_arrow() verifies all filenames passed by
        # the module are in the directory the module has access to. It
        # assumes the Arrow file (if there is one) is untrusted, so it can
        # raise ValidateError
        render_result = thrift_render_result_to_arrow(result, basedir)
    except ValidateError as err:
        raise ModuleExitedError(
            compiled_module.module_slug,
            0,
            "Module produced invalid data: %s" % str(err),
        )

    return render_result
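
# A minimal, standalone illustration (not part of the renderer) of the
# chroot-path translation used in render() above; directory names here are
# invented. E.g. chroot_dir=/var/chroots/render-1 and
# basedir=/var/chroots/render-1/tmp/job-2 yield /tmp/job-2, which is the
# path the sandboxed module sees inside the chroot.
def _basedir_as_seen_by_module(chroot_dir: Path, basedir: Path) -> Path:
    return Path("/") / basedir.relative_to(chroot_dir)
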
def render(
    self,
    compiled_module: CompiledModule,
    chroot_context: ChrootContext,
    basedir: Path,
    input_filename: str,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[FetchResult],
    tab_outputs: Dict[str, TabOutput],
    uploaded_files: Dict[str, UploadedFile],
    output_filename: str,
) -> RenderResult:
    """Run the module's `render_thrift()` function and return its result.

    Raise ModuleError if the module has a bug.
    """
    chroot_dir = chroot_context.chroot.root
    basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
    request = ttypes.RenderRequest(
        basedir=str(basedir_seen_by_module),
        params=pydict_to_thrift_json_object(params),
        tab_name=tab_name,
        tab_outputs={
            k: arrow_tab_output_to_thrift(v) for k, v in tab_outputs.items()
        },
        uploaded_files={
            k: arrow_uploaded_file_to_thrift(v) for k, v in uploaded_files.items()
        },
        fetch_result=(
            None
            if fetch_result is None
            else arrow_fetch_result_to_thrift(fetch_result)
        ),
        output_filename=output_filename,
        input_filename=input_filename,
    )
    if compiled_module.module_slug in {"pythoncode", "ACS2016"}:
        # TODO disallow networking; make network_config always None
        network_config = pyspawner.NetworkConfig()
    else:
        network_config = None
    try:
        with chroot_context.writable_file(basedir / output_filename):
            result = self._run_in_child(
                chroot_dir=chroot_dir,
                network_config=network_config,
                compiled_module=compiled_module,
                timeout=self.render_timeout,
                result=ttypes.RenderResult(),
                function="render_thrift",
                args=[request],
            )
    finally:
        chroot_context.clear_unowned_edits()

    return thrift_render_result_to_arrow(result)
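
# Hypothetical helper (illustration only; render() above inlines this logic):
# the slug-based network policy pulled into a standalone function, so the
# TODO about always disabling networking has a single place to change. The
# set of slugs mirrors the branch above.
_SLUGS_STILL_ALLOWED_NETWORK = frozenset({"pythoncode", "ACS2016"})


def _network_config_for_module(module_slug: str) -> Optional[pyspawner.NetworkConfig]:
    """Return a NetworkConfig only for legacy modules that still need one."""
    if module_slug in _SLUGS_STILL_ALLOWED_NETWORK:
        return pyspawner.NetworkConfig()  # TODO disallow networking here, too
    return None
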
def fetch(
    self,
    compiled_module: CompiledModule,
    chroot_context: ChrootContext,
    basedir: Path,
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    input_parquet_filename: Optional[str],
    output_filename: str,
) -> FetchResult:
    """Run the module's `fetch_thrift()` function and return its result.

    Raise ModuleError if the module has a bug.
    """
    chroot_dir = chroot_context.chroot.root
    basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
    request = ttypes.FetchRequest(
        basedir=str(basedir_seen_by_module),
        params=pydict_to_thrift_json_object(params),
        secrets=pydict_to_thrift_json_object(secrets),
        last_fetch_result=(
            None
            if last_fetch_result is None
            else arrow_fetch_result_to_thrift(last_fetch_result)
        ),
        input_table_parquet_filename=input_parquet_filename,
        output_filename=output_filename,
    )
    try:
        with chroot_context.writable_file(basedir / output_filename):
            result = self._run_in_child(
                chroot_dir=chroot_dir,
                network_config=pyspawner.NetworkConfig(),
                compiled_module=compiled_module,
                timeout=self.fetch_timeout,
                result=ttypes.FetchResult(),
                function="fetch_thrift",
                args=[request],
            )
    finally:
        chroot_context.clear_unowned_edits()

    if result.filename and result.filename != output_filename:
        raise ModuleExitedError(
            compiled_module.module_slug, 0, "Module wrote to wrong output file"
        )

    # TODO validate result isn't too large. If the result is a dataframe it
    # makes sense to truncate; but fetch results aren't necessarily data
    # frames. It's up to the module to enforce this logic ... but we need to
    # set a maximum file size.
    return thrift_fetch_result_to_arrow(result, basedir)
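
# Hypothetical helper sketching the missing size check from the TODO in
# fetch() above. The 1 GiB limit is an assumption for illustration, not a
# project constant; fetch() does not call this yet.
MAX_FETCH_RESULT_BYTES = 1024 ** 3


def _raise_if_fetch_result_too_large(path: Path, module_slug: str) -> None:
    """Raise ModuleExitedError if the fetched file exceeds the assumed cap."""
    size = path.stat().st_size
    if size > MAX_FETCH_RESULT_BYTES:
        raise ModuleExitedError(
            module_slug,
            0,
            "Fetch result is %d bytes; the limit is %d bytes"
            % (size, MAX_FETCH_RESULT_BYTES),
        )
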
async def execute_tab_flow(
    chroot_context: ChrootContext,
    workflow: Workflow,
    flow: TabFlow,
    tab_results: Dict[Tab, Optional[RenderResult]],
    output_path: Path,
) -> RenderResult:
    """Ensure `flow.tab.live_steps` all cache fresh render results.

    `tab_results.keys()` must be ordered as the Workflow's tabs are.

    Raise `UnneededExecution` if something changes underneath us such that we
    can't guarantee all render results will be fresh. (The remaining execution
    is "unneeded" because we assume another render has been queued.)

    WEBSOCKET NOTES: each step is executed in turn. After each execution, we
    notify clients of its new columns and status.
    """
    logger.debug(
        "Rendering Tab(%d, %s - %s)", workflow.id, flow.tab_slug, flow.tab.name
    )
    basedir = output_path.parent

    # Execute one module at a time.
    #
    # We don't hold any lock throughout the loop: the loop can take a long
    # time; it might be run multiple times simultaneously (even on different
    # computers); and `await` doesn't work with locks.
    #
    # We pass data between two Arrow files, kinda like double-buffering. The
    # two are `output_path` and `buffer_path`. This requires fewer temporary
    # files, so it's less of a hassle to clean up.
    with chroot_context.tempfile_context(
        dir=basedir, prefix="render-buffer", suffix=".arrow"
    ) as buffer_path:
        # We will render from `buffer_path` to `output_path` and from
        # `output_path` to `buffer_path`, alternating, so that the final
        # output is in `output_path` and we only use a single tempfile.
        # (Think "page flipping" in graphics.) Illustrated:
        #
        # [cache] -> A -> B -> C: A and C use `output_path`.
        # [cache] -> A -> B: cache and B use `output_path`.
        step_output_paths = cycle([output_path, buffer_path])

        # Find the first stale step, going backwards. Build a to-do list (in
        # reverse).
        #
        # When render() exits, the render cache should be fresh for all
        # steps. "fresh" means `step.cached_render_result` returns non-None
        # and reading does not result in a `CorruptCacheError`. BUT it's
        # really expensive to check for `CorruptCacheError` all the time; so
        # as an optimization, we only check for `CorruptCacheError` when it
        # prevents us from loading a step's _input_. [2019-10-10, adamhooper]
        # This rule was created so that renderer can recover from
        # `CorruptCacheError` (instead of crashing completely).
        # `CorruptCacheError` is still a serious problem that needs human
        # intervention.
        #
        # A _correct_ approach would be to read every step from the cache.
        #
        # Set `step_index` (first step that needs rendering) and
        # `last_result` (the input to `flow.steps[step_index]`).
        known_stale = flow.first_stale_index
        for step_index in range(len(flow.steps) - 1, -1, -1):
            step_output_path = next(step_output_paths)
            if known_stale is not None and step_index >= known_stale:
                # We know this step needs to be rendered, from our
                # last_relevant_delta_id math.
                continue  # loop, decrementing `step_index`
            else:
                # This step _shouldn't_ need to be rendered. Load its output.
                # If we get CorruptCacheError, recover by backtracking
                # another step.
                step = flow.steps[step_index].step
                try:
                    # raise CorruptCacheError, UnneededExecution
                    last_result = await _load_step_output_from_rendercache(
                        workflow, step, step_output_path
                    )
                    # `last_result` will be the input into steps[step_index]
                    step_index += 1
                    break
                except CorruptCacheError:
                    logger.exception(
                        "Backtracking to recover from corrupt cache in wf-%d/wfm-%d",
                        workflow.id,
                        step.id,
                    )
                    # loop
        else:
            # "Step minus-1" -- we need an input into flow.steps[0]
            #
            # fiddle with cycle's state -- `last_result` has no backing file;
            # but if it did, it would be `next(step_output_paths)`.
            next(step_output_paths)
            last_result = RenderResult()
            step_index = 0  # needed when there are no steps at all

        for step, step_output_path in zip(
            flow.steps[step_index:], step_output_paths
        ):
            step_output_path.write_bytes(b"")  # don't leak data from two steps ago
            next_result = await execute_step(
                chroot_context=chroot_context,
                workflow=workflow,
                step=step.step,
                module_zipfile=step.module_zipfile,
                params=step.params,
                tab=flow.tab,
                input_result=last_result,
                tab_results=tab_results,
                output_path=step_output_path,
            )
            last_result = next_result

        return last_result
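
# Standalone illustration (plain itertools, no Workbench objects) of the
# "page flipping" in execute_tab_flow() above, for the all-stale case: the
# backwards scan consumes one path per step, the for-else consumes one more
# for the "step minus-1" input, and the remaining cycle assigns the last
# step to output_path -- matching "[cache] -> A -> B -> C: A and C use
# `output_path`".
def _illustrate_page_flipping(n_steps: int) -> list:
    paths = cycle(["output_path", "buffer_path"])
    for _ in range(n_steps):  # backwards scan: every step is stale
        next(paths)
    next(paths)  # for-else: placeholder input for flow.steps[0]
    return [next(paths) for _ in range(n_steps)]


# _illustrate_page_flipping(3) == ["output_path", "buffer_path", "output_path"]
# _illustrate_page_flipping(2) == ["buffer_path", "output_path"]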