def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress: rendering with a fetch result
    # is expected to hand that fetch result's table and errors back.
    with ExitStack() as stack:
        input_arrow_table = stack.enter_context(
            arrow_table_context({"A": [1]}, dir=self.basedir)
        )
        parquet_handle = stack.enter_context(
            parquet_file({"A": [2]}, dir=self.basedir)
        )
        # Only the bare filename goes into the request; the base directory
        # is passed separately as the request's first field.
        parquet_filename = Path(parquet_handle.name).name
        scratch_output = stack.enter_context(tempfile_context(dir=self.basedir))
        out_filename = scratch_output.name

        basedir_str = str(self.basedir)
        thrift_input_table = input_arrow_table.to_thrift()
        thrift_params = Params({}).to_thrift()
        thrift_tab = ttypes.Tab("tab-1", "Tab 1")
        thrift_warning = RenderError(I18nMessage.TODO_i18n("A warning")).to_thrift()
        thrift_fetch_result = ttypes.FetchResult(parquet_filename, [thrift_warning])
        request = ttypes.RenderRequest(
            basedir_str,
            thrift_input_table,
            thrift_params,
            thrift_tab,
            thrift_fetch_result,
            out_filename,
        )

        thrift_result = module.render_thrift(request)
        actual = RenderResult.from_thrift(thrift_result, self.basedir)

        expected = RenderResult(
            arrow_table({"A": [2]}),
            [RenderError(I18nMessage.TODO_i18n("A warning"))],
        )
        assert_render_result_equals(actual, expected)
def test_fetch_result_from_thrift_disallow_non_file(self):
    # A directory inside basedir is not a regular file, so converting a
    # FetchResult that names it must raise ValueError.
    with tempfile.TemporaryDirectory(
        dir=str(self.basedir)
    ) as subdir_path, self.assertRaisesRegex(ValueError, "be a regular file"):
        thrift_value = ttypes.FetchResult(Path(subdir_path).name, [])
        types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)
def test_fetch_result_from_thrift_happy_path(self):
    # A FetchResult naming an existing file in basedir converts to the arrow
    # type, with the filename resolved to the file's path and errors converted.
    with tempfile.NamedTemporaryFile(dir=str(self.basedir)) as tf:
        thrift_error = ttypes.RenderError(ttypes.I18nMessage("hi", {}), [])
        thrift_value = ttypes.FetchResult(Path(tf.name).name, [thrift_error])

        actual = types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)

        expected = types.FetchResult(
            Path(tf.name),
            [types.RenderError(types.I18nMessage("hi"))],
        )
        self.assertEqual(actual, expected)
def fetch(
    self,
    compiled_module: CompiledModule,
    chroot_context: ChrootContext,
    basedir: Path,
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    input_parquet_filename: Optional[str],
    output_filename: str,
) -> FetchResult:
    """Invoke the module's `fetch_thrift()` function and return what it produced.

    Raise ModuleError if the module has a bug.
    """
    chroot_dir = chroot_context.chroot.root
    # The request carries `basedir` re-rooted at "/" relative to the chroot.
    basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)

    thrift_basedir = str(basedir_seen_by_module)
    thrift_params = pydict_to_thrift_json_object(params)
    thrift_secrets = pydict_to_thrift_json_object(secrets)
    if last_fetch_result is None:
        thrift_last_fetch_result = None
    else:
        thrift_last_fetch_result = arrow_fetch_result_to_thrift(last_fetch_result)

    request = ttypes.FetchRequest(
        basedir=thrift_basedir,
        params=thrift_params,
        secrets=thrift_secrets,
        last_fetch_result=thrift_last_fetch_result,
        input_table_parquet_filename=input_parquet_filename,
        output_filename=output_filename,
    )

    output_path = basedir / output_filename
    try:
        with chroot_context.writable_file(output_path):
            child_result = self._run_in_child(
                chroot_dir=chroot_dir,
                network_config=pyspawner.NetworkConfig(),
                compiled_module=compiled_module,
                timeout=self.fetch_timeout,
                result=ttypes.FetchResult(),
                function="fetch_thrift",
                args=[request],
            )
    finally:
        # Runs whether or not the child call succeeded.
        chroot_context.clear_unowned_edits()

    reported_filename = child_result.filename
    if reported_filename and reported_filename != output_filename:
        raise ModuleExitedError(
            compiled_module.module_slug, 0, "Module wrote to wrong output file"
        )

    # TODO validate result isn't too large. If result is dataframe it makes
    # sense to truncate; but fetch results aren't necessarily data frames.
    # It's up to the module to enforce this logic ... but we need to set a
    # maximum file size.
    return thrift_fetch_result_to_arrow(child_result, basedir)
def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress: rendering with a fetch result
    # is expected to hand that fetch result's table and errors back.
    with ExitStack() as stack:
        input_arrow_table = stack.enter_context(
            arrow_table_context({"A": [1]}, dir=self.basedir)
        )
        parquet_handle = stack.enter_context(
            parquet_file({"A": [2]}, dir=self.basedir)
        )
        # Only the bare filename goes into the request; the base directory
        # is passed separately as the request's first field.
        parquet_filename = Path(parquet_handle.name).name
        scratch_output = stack.enter_context(tempfile_context(dir=self.basedir))
        out_filename = scratch_output.name

        basedir_str = str(self.basedir)
        thrift_input_table = arrow_arrow_table_to_thrift(input_arrow_table)
        thrift_params = {}  # params
        thrift_tab = ttypes.Tab("tab-1", "Tab 1")
        warning_message = ttypes.I18nMessage(
            "TODO_i18n",
            {"text": ttypes.I18nArgument(string_value="A warning")},
        )
        thrift_warning = ttypes.RenderError(warning_message, [])
        thrift_fetch_result = ttypes.FetchResult(parquet_filename, [thrift_warning])
        request = ttypes.RenderRequest(
            basedir_str,
            thrift_input_table,
            thrift_params,
            thrift_tab,
            thrift_fetch_result,
            out_filename,
        )

        thrift_result = module.render_thrift(request)
        actual = thrift_render_result_to_arrow(thrift_result, self.basedir)

        expected = RenderResult(
            arrow_table({"A": [2]}),
            [RenderError(I18nMessage.TODO_i18n("A warning"))],
        )
        assert_render_result_equals(actual, expected)
def fetch(
    self,
    compiled_module: CompiledModule,
    basedir: Path,
    params: Params,
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    input_parquet_filename: str,
    output_filename: str,
) -> FetchResult:
    """Call the module's `fetch_thrift` via `_run_in_child` and convert its result.

    Raise ModuleExitedError if the result names a file other than
    `output_filename`.
    """
    thrift_basedir = str(basedir)
    thrift_params = params.to_thrift()
    thrift_secrets = RawParams(secrets).to_thrift()
    if last_fetch_result is None:
        thrift_last_fetch_result = None
    else:
        thrift_last_fetch_result = last_fetch_result.to_thrift()

    request = ttypes.FetchRequest(
        thrift_basedir,
        thrift_params,
        thrift_secrets,
        thrift_last_fetch_result,
        input_parquet_filename,
        output_filename,
    )

    output_path = basedir / output_filename
    with _chroot_dir_context(
        provide_paths=[basedir], extract_paths=[output_path]
    ) as chroot:
        child_result = self._run_in_child(
            chroot=chroot,
            chroot_paths=[basedir] + DATA_PATHS + PARQUET_PATHS + NETWORKING_PATHS,
            compiled_module=compiled_module,
            timeout=self.fetch_timeout,
            result=ttypes.FetchResult(),
            function="fetch_thrift",
            args=[request],
        )

    reported_filename = child_result.filename
    if reported_filename and reported_filename != output_filename:
        raise ModuleExitedError(0, "Module wrote to wrong output file")

    # TODO validate result isn't too large. If result is dataframe it makes
    # sense to truncate; but fetch results aren't necessarily data frames.
    # It's up to the module to enforce this logic ... but we need to set a
    # maximum file size.
    return FetchResult.from_thrift(child_result, basedir)
def test_fetch_result_from_thrift_disallow_non_files(self):
    # Converting a FetchResult whose filename is "missing" must raise
    # ValueError with a "must exist" message.
    with self.assertRaisesRegex(ValueError, "must exist"):
        thrift_value = ttypes.FetchResult("missing", [])
        types.FetchResult.from_thrift(thrift_value, self.basedir)
def test_fetch_result_from_thrift_disallow_hidden_files(self):
    # A dot-prefixed filename must be rejected with a "must not be hidden"
    # ValueError.
    with self.assertRaisesRegex(ValueError, "must not be hidden"):
        thrift_value = ttypes.FetchResult(".secrets", [])
        this_dir = Path(__file__).parent
        types.FetchResult.from_thrift(thrift_value, this_dir)
def test_fetch_result_from_thrift_disallow_directories(self):
    # A filename with path components must be rejected with a
    # "must not contain directories" ValueError.
    with self.assertRaisesRegex(ValueError, "must not contain directories"):
        thrift_value = ttypes.FetchResult("/etc/passwd", [])
        this_dir = Path(__file__).parent
        types.FetchResult.from_thrift(thrift_value, this_dir)
def call_fetch(fetch: Callable, request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Invoke `fetch()` and check what it returns.

    Module code may contain errors. This function and `fetch()` should strive
    to raise developer-friendly errors in the case of bugs -- including
    unexpected input.
    """
    # thrift => pandas
    basedir = Path(request.basedir)
    params = thrift_json_object_to_pydict(request.params)
    output_path = basedir / request.output_filename

    argspec = inspect.getfullargspec(fetch)
    accepts_any_kwarg = bool(argspec.varkw)  # True when fetch takes **kwargs
    keyword_only_names = argspec.kwonlyargs

    def wants(name):
        # An optional argument is supplied when fetch() declares it as a
        # keyword-only argument, or accepts arbitrary **kwargs.
        return accepts_any_kwarg or name in keyword_only_names

    async def get_input_dataframe():
        input_filename = request.input_table_parquet_filename
        if input_filename is None:
            return None
        return _parquet_to_pandas(basedir / input_filename)

    # Insertion order is kept stable: a **kwargs function can observe it.
    kwargs = {}
    if wants("secrets"):
        kwargs["secrets"] = thrift_json_object_to_pydict(request.secrets)
    if wants("settings"):
        kwargs["settings"] = settings
    if wants("get_input_dataframe"):
        kwargs["get_input_dataframe"] = get_input_dataframe
    if wants("output_path"):
        kwargs["output_path"] = output_path

    outcome = fetch(params, **kwargs)
    if asyncio.iscoroutine(outcome):
        outcome = asyncio.run(outcome)

    is_path_with_errors = (
        isinstance(outcome, tuple)
        and len(outcome) == 2
        and isinstance(outcome[0], Path)
    )
    if is_path_with_errors:
        errors = ptypes.coerce_RenderError_list(outcome[1])
    elif isinstance(outcome, Path):
        errors = []
    elif isinstance(outcome, list):
        errors = ptypes.coerce_RenderError_list(outcome)
    else:
        pandas_result = ptypes.ProcessResult.coerce(outcome)
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            if not pandas_result.columns:
                # No columns: write a zero-byte output file.
                output_path.write_bytes(b"")
                errors = pandas_result.errors
            else:
                hacky_result = pandas_result.to_arrow(arrow_path)
                table = load_trusted_arrow_file(arrow_path)
                cjwparquet.write(output_path, table)
                errors = hacky_result.errors

    thrift_errors = [arrow_render_error_to_thrift(err) for err in errors]
    return ttypes.FetchResult(
        filename=request.output_filename,
        errors=thrift_errors,
    )
def test_fetch_result_from_thrift_disallow_hidden_files(self):
    # A dot-prefixed filename must be rejected with a "must not be hidden"
    # ValueError.
    with self.assertRaisesRegex(ValueError, "must not be hidden"):
        thrift_value = ttypes.FetchResult(".secrets", [])
        types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)
def test_fetch_result_from_thrift_disallow_directories(self):
    # A filename with path components must be rejected with a
    # "must not include directory names" ValueError.
    with self.assertRaisesRegex(ValueError, "must not include directory names"):
        thrift_value = ttypes.FetchResult("/etc/passwd", [])
        types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)