def fetch_pandas(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> Union[ptypes.ProcessResult, types.FetchResult]:
    """
    Call `fetch()` and validate the result.

    Module authors should not replace this function: they should replace
    `fetch()` instead. This function validates the `fetch()` return value, to
    raise a helpful `ValueError` if the module code is buggy.
    """
    spec = inspect.getfullargspec(fetch)
    accepts_any_kwarg = bool(spec.varkw)  # fetch(**kwargs) accepts everything
    declared_kwonly = spec.kwonlyargs

    def wants(name: str) -> bool:
        # Pass a kwarg only when fetch() declares it (or takes **kwargs).
        return accepts_any_kwarg or name in declared_kwonly

    kwargs = {}
    if wants("secrets"):
        kwargs["secrets"] = secrets
    if wants("get_input_dataframe"):

        async def get_input_dataframe():
            if input_table_parquet_path is None:
                return None
            return __parquet_to_pandas(input_table_parquet_path)

        kwargs["get_input_dataframe"] = get_input_dataframe
    if wants("get_stored_dataframe"):

        async def get_stored_dataframe():
            if last_fetch_result is None:
                return None
            return __parquet_to_pandas(last_fetch_result.path)

        kwargs["get_stored_dataframe"] = get_stored_dataframe
    if wants("output_path"):
        kwargs["output_path"] = output_path

    result = fetch(params, **kwargs)
    if asyncio.iscoroutine(result):
        result = asyncio.run(result)

    # `(path, error_string)` shorthand => FetchResult with one error
    if (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[0], Path)
        and isinstance(result[1], str)
    ):
        return types.FetchResult(
            result[0],
            [types.RenderError(types.I18nMessage.TODO_i18n(result[1]))],
        )
    # bare path => FetchResult with no errors
    if isinstance(result, Path):
        return types.FetchResult(result)
    # anything else (DataFrame, str, ...) => coerced ProcessResult
    return ptypes.ProcessResult.coerce(result)
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    """Render using `cjwkernel.types` data types.

    The result will be encoded as a Parquet file.

    Module authors are encouraged to replace this function, because the
    `fetch()` signature deals in dataframes instead of in raw data.
    """
    pandas_result: Union[ptypes.ProcessResult, types.FetchResult] = fetch_pandas(
        params=__arrow_param_to_pandas_param(params),
        secrets=secrets,
        last_fetch_result=last_fetch_result,
        input_table_parquet_path=input_table_parquet_path,
        output_path=output_path,
    )
    if not isinstance(pandas_result, ptypes.ProcessResult):
        # it's already a types.FetchResult
        return pandas_result

    pandas_result.truncate_in_place_if_too_big()
    # ProcessResult => FetchResult isn't a thing; but we can hack it using
    # ProcessResult => RenderResult => FetchResult.
    with tempfile_context(suffix=".arrow") as arrow_path:
        hacky_result = pandas_result.to_arrow(arrow_path)
        if hacky_result.table.path:
            cjwparquet.write(output_path, hacky_result.table.table)
        else:
            output_path.write_bytes(b"")
        return types.FetchResult(output_path, hacky_result.errors)
def call_render(module_spec: ModuleSpec, render: Callable, request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Call the module's `render()` and convert its result to a Thrift RenderResult.

    Loads the input Arrow table from `request.basedir`, converts it to a
    pandas DataFrame, builds only the keyword arguments that `render()`
    declares (discovered via `inspect.getfullargspec`), then coerces the
    return value through `ptypes.ProcessResult` and writes an Arrow file at
    `request.output_filename`.
    """
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    # Outputs of upstream tabs, converted for modules that read other tabs.
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(module_spec, thrift_json_object_to_pydict(request.params), basedir, tab_outputs)
    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            # Empty or Parquet-magic-numbered files are treated as dataframe
            # fetch results; anything else is handed over as a raw file.
            if (
                fetch_result_path.stat().st_size == 0
                or cjwparquet.file_has_parquet_magic_number(fetch_result_path)
            ):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path, errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        # NOTE(review): `settings` is not defined in this function -- presumably
        # a module-level global; confirm where it comes from.
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)
    # Input-table columns, used as a fallback when coercing render()'s result.
    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)
    # raise ValueError if invalid
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns
    )
    pandas_result.truncate_in_place_if_too_big()
    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)
def test_fetch_result_from_thrift_happy_path(self):
    """A Thrift FetchResult with one error converts to the arrow-side type."""
    with tempfile.NamedTemporaryFile(dir=str(self.basedir)) as tf:
        thrift_result = ttypes.FetchResult(
            Path(tf.name).name,
            [ttypes.RenderError(ttypes.I18nMessage("hi", {}), [])],
        )
        converted = types.thrift_fetch_result_to_arrow(thrift_result, self.basedir)
        expected = types.FetchResult(
            Path(tf.name), [types.RenderError(types.I18nMessage("hi"))]
        )
        self.assertEqual(converted, expected)