def call_render(module_spec: ModuleSpec, render: Callable, request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Invoke a module's pandas-style `render()` for a Thrift RenderRequest.

    Loads the input Arrow table from disk, converts it (and tab outputs,
    params, and any fetch result) to pandas form, calls `render()` with only
    the keyword arguments its signature asks for, then coerces the result
    back to Arrow and returns it as a Thrift RenderResult.

    Raises ValueError if `render()` returns an invalid result (via
    `ProcessResult.coerce()`).
    """
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(module_spec, thrift_json_object_to_pydict(request.params), basedir, tab_outputs)
    # Only pass the optional kwargs the module's render() actually declares.
    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            # An empty file or a Parquet file becomes a pandas ProcessResult;
            # any other file is handed to the module as a raw FetchResult.
            if (fetch_result_path.stat().st_size == 0
                    or cjwparquet.file_has_parquet_magic_number(fetch_result_path)):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path, errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)
    # Columns used as a fallback when coercing the module's output.
    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)  # raise ValueError if invalid
    pandas_result = ptypes.ProcessResult.coerce(raw_result, try_fallback_columns=input_columns)
    pandas_result.truncate_in_place_if_too_big()
    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)
def render_arrow(
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[types.FetchResult],
    output_path: Path,
) -> types.RenderResult:
    """
    Render using `cjwkernel.types` data types.

    If outputting Arrow data, write to `output_path`.

    Module authors are encouraged to replace this function, because Arrow
    tables are simpler and more memory-efficient than Pandas tables. This is
    the ideal signature for a "rename columns" module, for instance: Arrow can
    pass data through without consuming excessive RAM.

    This does not validate the render_pandas() return value.
    """
    # Convert the fetch result first. An empty file or a Parquet file is
    # loaded into a pandas ProcessResult; anything else passes through as-is.
    if fetch_result is None:
        converted_fetch_result = None
    elif (
        fetch_result.path.stat().st_size == 0
        or parquet.file_has_parquet_magic_number(fetch_result.path)
    ):
        converted_fetch_result = ptypes.ProcessResult(
            __parquet_to_pandas(fetch_result.path),
            [
                ptypes.ProcessResultError.from_arrow(error)
                for error in fetch_result.errors
            ],
        )
    else:
        converted_fetch_result = fetch_result

    # Each tab output referenced by params becomes a pandas tab, keyed by slug.
    converted_input_tabs = {}
    for tab_output in _find_tab_outputs(params):
        converted_input_tabs[tab_output.tab.slug] = __arrow_tab_output_to_pandas(tab_output)

    result: ptypes.ProcessResult = render_pandas(
        input_table=__arrow_to_pandas(table),
        input_table_shape=ptypes.TableShape.from_arrow(table.metadata),
        params=_arrow_param_to_pandas_param(params),
        tab_name=tab_name,
        input_tabs=converted_input_tabs,
        fetch_result=converted_fetch_result,
    )
    return result.to_arrow(output_path)
def render_arrow(
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    input_tabs: Dict[str, types.TabOutput],
    fetch_result: Optional[types.FetchResult],
    output_path: pathlib.Path,
) -> types.RenderResult:
    """
    Render using `cjwkernel.types` data types.

    If outputting Arrow data, write to `output_path`.

    Module authors are encouraged to replace this function, because Arrow
    tables are simpler and more memory-efficient than Pandas tables. This is
    the ideal signature for a "rename columns" module, for instance: Arrow can
    pass data through without consuming excessive RAM.

    This does not validate the render_pandas() return value.
    """
    # Legacy fetch handling: always load the file as Parquet, and collapse
    # the error list into a single error string.
    if fetch_result is None:
        converted_fetch_result = None
    else:
        converted_fetch_result = ptypes.ProcessResult(
            __parquet_to_pandas(fetch_result.path),
            error=("" if not fetch_result.errors else str(fetch_result.errors)),
        )

    converted_input_tabs = {slug: __arrow_to_pandas(tab) for slug, tab in input_tabs.items()}

    result: ptypes.ProcessResult = render_pandas(
        input_table=__arrow_to_pandas(table.table),
        input_table_shape=ptypes.TableShape.from_arrow(table.metadata),
        params=params,
        tab_name=tab_name,
        input_tabs=converted_input_tabs,
        fetch_result=converted_fetch_result,
    )
    return result.to_arrow(output_path)
def __render_pandas(
    *,
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[types.FetchResult],
    output_path: Path,
) -> types.RenderResult:
    """
    Call `render()` with the Pandas signature style.

    Features:

    * Convert input Arrow table to a Pandas dataframe
    * Convert input params to Pandas format (producing extra arguments like
      `input_tabs` as needed).
    * Convert input `fetch_result` to Pandas dataframe, if it is a valid
      Parquet file.
    * Coerce output from a Pandas dataframe to an Arrow table
    * Coerce output errors/json
    """
    # Convert input arguments
    dataframe = __arrow_to_pandas(table)
    converted_params = __arrow_param_to_pandas_param(params)

    # Pass only the optional kwargs render() declares (or any, with **kwargs).
    argspec = inspect.getfullargspec(render)
    accepts_any = bool(argspec.varkw)  # if True, function accepts **kwargs
    declared = argspec.kwonlyargs

    def wants(name):
        return accepts_any or name in declared

    kwargs = {}
    if wants("fetch_result"):
        if fetch_result is None:
            converted_fetch_result = None
        elif (
            fetch_result.path.stat().st_size == 0
            or cjwparquet.file_has_parquet_magic_number(fetch_result.path)
        ):
            # Empty or Parquet file: load it into a pandas ProcessResult.
            converted_fetch_result = ptypes.ProcessResult(
                __parquet_to_pandas(fetch_result.path), fetch_result.errors
            )
        else:
            # Not Parquet: hand the raw FetchResult to the module.
            converted_fetch_result = fetch_result
        kwargs["fetch_result"] = converted_fetch_result
    if wants("settings"):
        kwargs["settings"] = settings
    if wants("tab_name"):
        kwargs["tab_name"] = tab_name
    if wants("input_columns"):
        kwargs["input_columns"] = {
            column.name: ptypes.RenderColumn(
                column.name, column.type.name, getattr(column.type, "format", None)
            )
            for column in table.metadata.columns
        }
    if wants("input_tabs"):
        kwargs["input_tabs"] = {
            tab_output.tab.slug: __arrow_tab_output_to_pandas(tab_output)
            for tab_output in __find_tab_outputs(params)
        }

    # call render()
    raw_result = render(dataframe, converted_params, **kwargs)

    # Coerce outputs
    result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=table.metadata.columns
    )  # raise ValueError if invalid
    result.truncate_in_place_if_too_big()
    return result.to_arrow(output_path)