def test_fetch_result_from_thrift_disallow_non_file(self):
    """A FetchResult naming a directory (not a regular file) is rejected."""
    with tempfile.TemporaryDirectory(dir=str(self.basedir)) as tmpsubdir:
        subdir_name = Path(tmpsubdir).name
        thrift_result = ttypes.FetchResult(subdir_name, [])
        with self.assertRaisesRegex(ValueError, "be a regular file"):
            types.thrift_fetch_result_to_arrow(thrift_result, self.basedir)
def render_thrift(request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """
    Render using Thrift data types.

    This function will convert to `cjwkernel.types` (opening Arrow tables
    in the process), call `render_arrow()`, and then convert the result back
    to Thrift. This uses very little RAM.

    Module authors may overwrite this function to avoid reading or writing
    the data table entirely -- for instance, a "change number format" module
    may not need to read any data, so it could operate on the Thrift layer.
    Most modules _do_ look at table data, so they should not overwrite this
    function.
    """
    basedir = Path(request.basedir)
    arrow_table = thrift_arrow_table_to_arrow(
        request.input_table, basedir, trusted=True
    )
    params_dict = thrift_params_to_arrow(request.params, basedir).params
    fetch_result = (
        None
        if request.fetch_result is None
        else thrift_fetch_result_to_arrow(request.fetch_result, basedir)
    )
    arrow_result: types.RenderResult = __render_by_signature(
        table=arrow_table,
        params=params_dict,
        tab_name=request.tab.name,
        fetch_result=fetch_result,
        output_path=basedir / request.output_filename,
    )
    return arrow_render_result_to_thrift(arrow_result)
def call_fetch(fetch: Callable, request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Call `fetch()` and validate the result.

    Module code may contain errors. This function and `fetch()` should
    strive to raise developer-friendly errors in the case of bugs --
    including unexpected input.
    """
    # thrift => pandas
    basedir = Path(request.basedir)
    params: Dict[str, Any] = thrift_json_object_to_pydict(request.params)
    secrets: Dict[str, Any] = thrift_json_object_to_pydict(request.secrets)

    if request.input_table_parquet_filename is not None:
        input_table_parquet_path = basedir / request.input_table_parquet_filename
    else:
        input_table_parquet_path = None

    if request.last_fetch_result is not None:
        last_fetch_result = thrift_fetch_result_to_arrow(
            request.last_fetch_result, basedir
        )
    else:
        last_fetch_result = None

    result = fetch(
        params=params,
        secrets=secrets,
        last_fetch_result=last_fetch_result,
        input_table_parquet_path=input_table_parquet_path,
        output_path=basedir / request.output_filename,
    )
    return arrow_fetch_result_to_thrift(result)
def _test_fetch(
    self,
    fetch_fn,
    *,
    params=None,
    secrets=None,
    last_fetch_result=None,
    input_table_parquet_path=None,
    output_filename=None,
):
    """Patch `module.fetch` with `fetch_fn`, run `module.fetch_thrift()`,
    and return the result converted back to Arrow types.

    All keyword arguments are optional; `params`/`secrets` default to empty
    dicts, and a throwaway output filename is created when none is given.
    """
    # Fix: the original used mutable default arguments (params={},
    # secrets={}), which are shared across calls -- use None sentinels.
    if params is None:
        params = {}
    if secrets is None:
        secrets = {}
    with ExitStack() as ctx:
        ctx.enter_context(patch.object(module, "fetch", fetch_fn))
        if output_filename is None:
            # Make a temporary output filename -- this will make `fetch()`
            # complete, but callers won't be able to see the data it
            # outputs because we'll delete the file too soon.
            output_filename = ctx.enter_context(
                tempfile_context(dir=self.basedir)
            ).name
        thrift_result = module.fetch_thrift(
            ttypes.FetchRequest(
                basedir=str(self.basedir),
                params=arrow_params_to_thrift(Params(params)),
                secrets=arrow_raw_params_to_thrift(RawParams(secrets)),
                last_fetch_result=(
                    arrow_fetch_result_to_thrift(last_fetch_result)
                    if last_fetch_result is not None
                    else None
                ),
                input_table_parquet_filename=(
                    input_table_parquet_path.name
                    if input_table_parquet_path is not None
                    else None
                ),
                output_filename=output_filename,
            )
        )
        # Convert (and validate) while the temporary output file still exists.
        return thrift_fetch_result_to_arrow(thrift_result, self.basedir)
def call_render(render: Callable, request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Run the module's `render()`, write its Arrow table to the requested
    output file, and return errors/JSON as a Thrift RenderResult."""
    basedir = Path(request.basedir)
    table = load_trusted_arrow_file(basedir / request.input_filename)
    params = thrift_json_object_to_pydict(request.params)

    tab_outputs = {}
    for key, thrift_tab in request.tab_outputs.items():
        tab_outputs[key] = TabOutput(
            tab_name=thrift_tab.tab_name,
            table=load_trusted_arrow_file(basedir / thrift_tab.table_filename),
        )

    uploaded_files = {}
    for key, thrift_file in request.uploaded_files.items():
        uploaded_files[key] = UploadedFile(
            name=thrift_file.name,
            path=(basedir / thrift_file.filename),
            uploaded_at=datetime.datetime.utcfromtimestamp(
                thrift_file.uploaded_at_timestampus / 1000000.0
            ),
        )

    if request.fetch_result is not None:
        fetch_result = thrift_fetch_result_to_arrow(request.fetch_result, basedir)
    else:
        fetch_result = None

    raw_result = render(
        table,
        params,
        settings=settings,
        tab_name=request.tab_name,
        tab_outputs=tab_outputs,
        uploaded_files=uploaded_files,
        fetch_result=fetch_result,
    )
    if not isinstance(raw_result, ArrowRenderResult):
        # Crash. The module author wrote a buggy module.
        raise ValueError(
            "render_arrow_v1() must return a cjwmodule.arrow.types.ArrowRenderResult"
        )

    with pa.ipc.RecordBatchFileWriter(
        basedir / request.output_filename, schema=raw_result.table.schema
    ) as writer:
        writer.write_table(raw_result.table)

    return ttypes.RenderResult(
        errors=[arrow_render_error_to_thrift(e) for e in raw_result.errors],
        json=pydict_to_thrift_json_object(raw_result.json),
    )
def fetch_thrift(request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Convert Thrift arguments to Arrow types, run `fetch_arrow()`, and
    convert the result back to Thrift."""
    basedir = Path(request.basedir)
    params = thrift_params_to_arrow(request.params, basedir).params
    secrets = thrift_raw_params_to_arrow(request.secrets).params

    if request.last_fetch_result is None:
        last_fetch_result = None
    else:
        last_fetch_result = thrift_fetch_result_to_arrow(
            request.last_fetch_result, basedir
        )

    if request.input_table_parquet_filename is None:
        input_table_parquet_path = None
    else:
        input_table_parquet_path = basedir / request.input_table_parquet_filename

    arrow_result = fetch_arrow(
        params,
        secrets,
        last_fetch_result,
        input_table_parquet_path,
        basedir / request.output_filename,
    )
    return arrow_fetch_result_to_thrift(arrow_result)
def test_fetch_result_from_thrift_happy_path(self):
    """A Thrift FetchResult naming a real file converts cleanly."""
    with tempfile.NamedTemporaryFile(dir=str(self.basedir)) as tf:
        filename = Path(tf.name).name
        thrift_errors = [ttypes.RenderError(ttypes.I18nMessage("hi", {}), [])]
        actual = types.thrift_fetch_result_to_arrow(
            ttypes.FetchResult(filename, thrift_errors), self.basedir
        )
        expected = types.FetchResult(
            Path(tf.name), [types.RenderError(types.I18nMessage("hi"))]
        )
        self.assertEqual(actual, expected)
def call_render(
    module_spec: ModuleSpec, render: Callable, request: ttypes.RenderRequest
) -> ttypes.RenderResult:
    """Run the module's `render()` against `request` and coerce its result
    (None or a list of errors) into a Thrift RenderResult."""
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table, columns = load_trusted_arrow_file_with_columns(input_path)
    uploaded_files = {
        key: thrift_uploaded_file_to_arrow(value)
        for key, value in request.uploaded_files.items()
    }
    params = _prepare_params(
        module_spec,
        thrift_json_object_to_pydict(request.params),
        basedir=basedir,
        uploaded_files=uploaded_files,
    )
    fetch_result = (
        None
        if request.fetch_result is None
        else thrift_fetch_result_to_arrow(request.fetch_result, basedir)
    )
    output_path = basedir / request.output_filename

    raw_result = render(
        table,
        params,
        output_path,
        columns=columns,
        settings=settings,
        tab_name=request.tab_name,
        fetch_result=fetch_result,
    )

    # coerce result
    #
    # TODO omit all this code and rely on Workbench's validation. To do this:
    #
    # 1. Change all modules to return RenderResult
    # 2. Nix this coersion code
    _DEPRECATED_overwrite_to_fix_arrow_table_schema(
        output_path, fallback_schema=table.schema
    )
    if raw_result is None:
        errors = []
    elif isinstance(raw_result, list):
        errors = coerce_RenderError_list(raw_result)
    else:
        raise ValueError("Unhandled raw_result")

    return ttypes.RenderResult(
        errors=[arrow_render_error_to_thrift(e) for e in errors],
        json={},  # this framework never produces JSON
    )
def fetch(
    self,
    compiled_module: CompiledModule,
    chroot_context: ChrootContext,
    basedir: Path,
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    input_parquet_filename: Optional[str],
    output_filename: str,
) -> FetchResult:
    """Run the module's `fetch_thrift()` function and return its result.

    Raise ModuleError if the module has a bug.
    """
    # The child runs chrooted at `chroot_dir`, so rewrite `basedir` to the
    # absolute path the module will see inside the chroot.
    chroot_dir = chroot_context.chroot.root
    basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
    # Build the Thrift request, converting params/secrets/last result from
    # Arrow-side types to their Thrift equivalents.
    request = ttypes.FetchRequest(
        basedir=str(basedir_seen_by_module),
        params=pydict_to_thrift_json_object(params),
        secrets=pydict_to_thrift_json_object(secrets),
        last_fetch_result=(
            None
            if last_fetch_result is None
            else arrow_fetch_result_to_thrift(last_fetch_result)
        ),
        input_table_parquet_filename=input_parquet_filename,
        output_filename=output_filename,
    )
    try:
        # Make only the output file writable for the duration of the call.
        with chroot_context.writable_file(basedir / output_filename):
            result = self._run_in_child(
                chroot_dir=chroot_dir,
                # NOTE(review): default NetworkConfig() -- presumably grants
                # the fetch child network access; confirm against pyspawner.
                network_config=pyspawner.NetworkConfig(),
                compiled_module=compiled_module,
                timeout=self.fetch_timeout,
                result=ttypes.FetchResult(),
                function="fetch_thrift",
                args=[request],
            )
    finally:
        # Roll back any filesystem edits the child made outside its grant,
        # even if _run_in_child raised.
        chroot_context.clear_unowned_edits()
    # The module must write to the file we told it to write to (or leave
    # filename empty); anything else is a module bug.
    if result.filename and result.filename != output_filename:
        raise ModuleExitedError(
            compiled_module.module_slug, 0, "Module wrote to wrong output file"
        )
    # TODO validate result isn't too large. If result is dataframe it makes
    # sense to truncate; but fetch results aren't necessarily data frames.
    # It's up to the module to enforce this logic ... but we need to set a
    # maximum file size.
    return thrift_fetch_result_to_arrow(result, basedir)
def test_fetch_result_from_thrift_disallow_non_files(self):
    """A filename that does not exist in basedir is rejected."""
    missing_result = ttypes.FetchResult("missing", [])
    with self.assertRaisesRegex(ValueError, "must exist"):
        types.thrift_fetch_result_to_arrow(missing_result, self.basedir)
def test_fetch_result_from_thrift_disallow_hidden_files(self):
    """A dot-prefixed (hidden) filename is rejected."""
    hidden_result = ttypes.FetchResult(".secrets", [])
    with self.assertRaisesRegex(ValueError, "must not be hidden"):
        types.thrift_fetch_result_to_arrow(hidden_result, Path(__file__).parent)
def test_fetch_result_from_thrift_disallow_directories(self):
    """A filename containing directory components is rejected."""
    traversal_result = ttypes.FetchResult("/etc/passwd", [])
    with self.assertRaisesRegex(ValueError, "must not contain directories"):
        types.thrift_fetch_result_to_arrow(traversal_result, Path(__file__).parent)
def test_fetch_result_from_thrift_disallow_directories(self):
    """A filename containing directory components is rejected.

    NOTE(review): another test method in this SOURCE shares this exact
    name (asserting a different message). If both end up in the same
    TestCase class, one silently shadows the other -- confirm they
    belong to different classes/files.
    """
    traversal_result = ttypes.FetchResult("/etc/passwd", [])
    with self.assertRaisesRegex(ValueError, "must not include directory names"):
        types.thrift_fetch_result_to_arrow(traversal_result, self.basedir)