def load_cached_render_result(crr: CachedRenderResult, path: Path) -> LoadedRenderResult:
    """Create a LoadedRenderResult as it was passed to `cache_render_result()`.

    Write a zero-byte file to `path` if `crr` has no columns.

    The returned LoadedRenderResult is backed by `path`, an mmapped file on
    disk. The whole operation doesn't require much physical RAM.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

    * The cached Parquet file is corrupt
    * The cached Parquet file is missing
    * `crr` is stale -- the cached result is for a different delta. This could
      be detected by a `Workflow.cooperative_lock()`, too, should the caller
      want to distinguish this error from the others.
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        path.write_bytes(b"")
        return LoadedRenderResult(
            path=path, table=pa.table({}), columns=[], errors=crr.errors, json=crr.json
        )
    else:
        # raises CorruptCacheError
        with downloaded_parquet_file(crr) as parquet_path:
            try:
                # raises ArrowIOError
                table = read_parquet_as_arrow(parquet_path, crr.table_metadata.columns)
            except pa.ArrowIOError as err:
                raise CorruptCacheError from err

        # We don't expect errors writing to disk; this shouldn't consume RAM
        with pa.ipc.RecordBatchFileWriter(path, table.schema) as writer:
            writer.write_table(table)

        # Now read the table back from the file, so that `path` and `table`
        # are equivalent. Don't validate the file: we know what it contains.
        with pa.ipc.open_file(path) as reader:
            table = reader.read_all()

        return LoadedRenderResult(
            path=path,
            table=table,
            columns=crr.table_metadata.columns,
            errors=crr.errors,
            json=crr.json,
        )
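# A minimal usage sketch, not part of the original module: a caller that
# treats CorruptCacheError as a cache miss, per the docstring above.
# `_example_load_or_none` is a hypothetical name; everything else it calls
# is defined in this file.
def _example_load_or_none(
    crr: CachedRenderResult, path: Path
) -> Optional[LoadedRenderResult]:
    try:
        return load_cached_render_result(crr, path)
    except CorruptCacheError:
        # Corrupt or missing Parquet, or a stale `crr`: treat the result as
        # "not cached" and let the caller schedule a re-render.
        return None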
def test_metadata_does_not_require_file_read(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Timestamp()),
        Column("C", ColumnType.Text()),
        Column("D", ColumnType.Date("month")),
    ]
    with arrow_table_context(
        make_column("A", [1], format="{:,.2f}"),
        make_column("B", [datetime.datetime(2021, 4, 13)]),
        make_column("C", ["c"]),
        make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
    ) as (path, table):
        result = LoadedRenderResult(
            path=path, table=table, columns=columns, errors=[], json={}
        )
        cache_render_result(self.workflow, self.step, 1, result)

    # Delete from disk entirely, to prove we did not read.
    s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_step = Step.objects.get(id=self.step.id)
    cached_result = fresh_step.cached_render_result

    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
def test_invalid_parquet_is_corrupt_cache_error(self):
    with arrow_table_context(make_column("A", ["x"])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Text())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)

    crr = self.step.cached_render_result
    s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")

    with self.assertRaises(CorruptCacheError):
        with open_cached_render_result(crr):
            pass
def test_read_cached_render_result_slice_as_text_timestamp(self):
    with arrow_table_context(
        make_column("A", [2134213412341232967, None], pa.timestamp("ns"))
    ) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Timestamp())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)

    crr = self.step.cached_render_result
    self.assertEqual(
        read_cached_render_result_slice_as_text(crr, "csv", range(2), range(3)),
        "A\n2037-08-18T13:03:32.341232967Z\n",
    )
def test_clear(self):
    with arrow_table_context(make_column("A", [1])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Number(format="{:,}"))],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)

    parquet_key = crr_parquet_key(self.step.cached_render_result)
    clear_cached_render_result_for_step(self.step)

    db_step = Step.objects.get(id=self.step.id)
    self.assertIsNone(db_step.cached_render_result)

    self.assertFalse(s3.exists(BUCKET, parquet_key))
def test_cache_render_result(self):
    with arrow_table_context(make_column("A", [1])) as (table_path, table):
        result = LoadedRenderResult(
            path=table_path,
            table=table,
            columns=[Column("A", ColumnType.Number(format="{:,}"))],
            errors=[
                RenderError(
                    I18nMessage("e1", {"text": "hi"}, None),
                    [
                        QuickFix(
                            I18nMessage("q1", {"var": 2}, None),
                            QuickFixAction.PrependStep("filter", {"a": "x"}),
                        )
                    ],
                ),
                RenderError(I18nMessage("e2", {}, None), []),
            ],
            json={"foo": "bar"},
        )
        cache_render_result(self.workflow, self.step, 1, result)

        cached = self.step.cached_render_result
        self.assertEqual(cached.step_id, self.step.id)
        self.assertEqual(cached.delta_id, 1)
        self.assertEqual(
            crr_parquet_key(cached),
            f"wf-{self.workflow.id}/wfm-{self.step.id}/delta-1.dat",
        )

        # Reading fresh from the DB should give the same thing
        db_step = Step.objects.get(id=self.step.id)
        from_db = db_step.cached_render_result
        self.assertEqual(from_db, cached)

        with open_cached_render_result(from_db) as result2:
            assert_arrow_table_equals(
                result2.table, make_table(make_column("A", [1], format="{:,}"))
            )
            self.assertEqual(
                result2.columns, [Column("A", ColumnType.Number(format="{:,}"))]
            )
def write_to_rendercache(
    workflow: Workflow,
    step: Step,
    delta_id: int,
    table: pa.Table,
    errors: List[RenderError] = [],
    json: Dict[str, Any] = {},
) -> None:
    with arrow_table_context(table) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=read_columns(table, full=False),
            errors=errors,
            json=json,
        )

        # Use the caller-provided delta ID: no assertion
        old_last_relevant_delta_id = step.last_relevant_delta_id
        step.last_relevant_delta_id = delta_id
        try:
            cache_render_result(workflow, step, delta_id, result)
        finally:
            step.last_relevant_delta_id = old_last_relevant_delta_id
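# A hedged usage sketch of the helper above -- not an original test. It
# assumes the same fixtures (`self.workflow`, `self.step`) and factories
# (`make_table`, `make_column`, `assert_arrow_table_equals`) used by the
# tests in this section; the test name is hypothetical.
def test_example_write_then_read_back(self):
    write_to_rendercache(
        self.workflow, self.step, 1, make_table(make_column("A", [1]))
    )
    crr = self.step.cached_render_result
    with open_cached_render_result(crr) as loaded:
        # The round trip should preserve the table written above.
        assert_arrow_table_equals(loaded.table, make_table(make_column("A", [1])))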
async def _render_step(
    chroot_context: ChrootContext,
    workflow: Workflow,
    step: Step,
    module_zipfile: Optional[ModuleZipfile],
    raw_params: Dict[str, Any],
    tab_name: str,
    input_path: Path,
    input_table_columns: List[Column],
    tab_results: Dict[Tab, Optional[StepResult]],
    output_path: Path,
) -> LoadedRenderResult:
    """Prepare and call `step`'s `render()`; return a LoadedRenderResult.

    The actual render runs in a background thread so the event loop can
    process other events.
    """
    basedir = output_path.parent

    if step.order > 0 and not input_table_columns:
        return LoadedRenderResult.unreachable(output_path)

    if module_zipfile is None:
        return LoadedRenderResult.from_errors(
            output_path,
            errors=[
                RenderError(
                    trans(
                        "py.renderer.execute.step.noModule",
                        default="Please delete this step: an administrator uninstalled its code.",
                    )
                )
            ],
        )

    # exit_stack: stuff that gets deleted when the render is done
    with contextlib.ExitStack() as exit_stack:
        try:
            # raise UnneededExecution, TabCycleError, TabOutputUnreachableError,
            # NoLoadedDataError, PromptingError
            fetch_result, params, tab_outputs, uploaded_files = await _execute_step_pre(
                basedir=basedir,
                exit_stack=exit_stack,
                workflow=workflow,
                step=step,
                module_zipfile=module_zipfile,
                raw_params=raw_params,
                input_path=input_path,
                input_table_columns=input_table_columns,
                tab_results=tab_results,
            )
        except NoLoadedDataError:
            return LoadedRenderResult.from_errors(
                output_path,
                errors=[
                    RenderError(
                        trans(
                            "py.renderer.execute.step.NoLoadedDataError",
                            default="Please Add Data before this step.",
                        )
                    )
                ],
            )
        except TabCycleError:
            return LoadedRenderResult.from_errors(
                output_path,
                errors=[
                    RenderError(
                        trans(
                            "py.renderer.execute.step.TabCycleError",
                            default="The chosen tab depends on this one. Please choose another tab.",
                        )
                    )
                ],
            )
        except TabOutputUnreachableError:
            return LoadedRenderResult.from_errors(
                output_path,
                errors=[
                    RenderError(
                        trans(
                            "py.renderer.execute.step.TabOutputUnreachableError",
                            default="The chosen tab has no output. Please select another one.",
                        )
                    )
                ],
            )
        except PromptingError as err:
            return LoadedRenderResult.from_errors(
                output_path, errors=err.as_render_errors()
            )

        # Render may take a while. run_in_executor() pushes that slowdown to a
        # thread and keeps our event loop responsive.
        loop = asyncio.get_event_loop()
        try:
            return await loop.run_in_executor(
                None,
                partial(
                    invoke_render,
                    module_zipfile,
                    chroot_context=chroot_context,
                    basedir=basedir,
                    input_filename=input_path.name,
                    params=params,
                    tab_name=tab_name,
                    tab_outputs=tab_outputs,
                    uploaded_files=uploaded_files,
                    fetch_result=fetch_result,
                    output_filename=output_path.name,
                ),
            )
        except ModuleError as err:
            output_path.write_bytes(b"")  # SECURITY: wipe any partial output
            return LoadedRenderResult.from_errors(
                output_path,
                errors=[
                    RenderError(
                        trans(
                            "py.renderer.execute.step.user_visible_bug_during_render",
                            default=(
                                "Something unexpected happened. We have been notified and are "
                                "working to fix it. If this persists, contact us. Error code: {message}"
                            ),
                            arguments={"message": format_for_user_debugging(err)},
                        )
                    )
                ],
            )
def invoke_render(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    input_filename: Optional[str],
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[FetchResult],
    tab_outputs: Dict[str, TabOutput],
    uploaded_files: Dict[str, UploadedFile],
    output_filename: str,
) -> LoadedRenderResult:
    """Use the kernel to run the module's `render()` function.

    Raise `ModuleError` on error. (This is usually the module author's fault.)

    Log any ModuleError. Also log success.

    This synchronous method can be slow for complex modules or large datasets.
    Consider calling it from an executor.
    """
    time1 = time.time()
    begin_status_format = "%s:render() (%0.1fMB input)"
    begin_status_args = (
        module_zipfile.path.name,
        (
            (basedir / input_filename).stat().st_size / 1024 / 1024
            if input_filename is not None
            else 0
        ),
    )
    logger.info(begin_status_format + " begin", *begin_status_args)

    status = "???"
    try:
        result = cjwstate.modules.kernel.render(
            module_zipfile.compile_code_without_executing(),
            chroot_context=chroot_context,
            basedir=basedir,
            input_filename=input_filename,
            params=params,
            tab_name=tab_name,
            fetch_result=fetch_result,
            tab_outputs=tab_outputs,
            uploaded_files=uploaded_files,
            output_filename=output_filename,
        )

        output_path = basedir / output_filename
        st_size = output_path.stat().st_size
        if st_size == 0:
            table = pa.table({})
            columns = []
            status = "(no output)"
        else:
            try:
                table, columns = load_untrusted_arrow_file_with_columns(output_path)
                status = "(%drows, %dcols, %0.1fMB)" % (
                    table.num_rows,
                    table.num_columns,
                    st_size / 1024 / 1024,
                )
            except ValidateError as err:
                raise ModuleExitedError(
                    module_zipfile.path.name,
                    0,
                    "Module wrote invalid data: %s" % str(err),
                )

        return LoadedRenderResult(
            path=output_path,
            table=table,
            columns=columns,
            errors=result.errors,
            json=result.json,
        )
    except ModuleError as err:
        logger.exception("Exception in %s:render", module_zipfile.path.name)
        status = type(err).__name__
        raise
    finally:
        time2 = time.time()
        logger.info(
            begin_status_format + " => %s in %dms",
            *begin_status_args,
            status,
            int((time2 - time1) * 1000),
        )
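# Illustrative only -- the values below are invented. Given the format
# strings above, a successful render logs a pair of lines shaped like:
#   mymodule.abc123.zip:render() (1.5MB input) begin
#   mymodule.abc123.zip:render() (1.5MB input) => (100rows, 3cols, 0.2MB) in 450ms
# and a module crash replaces the parenthesized status with the ModuleError
# subclass name (e.g. "ModuleExitedError").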