def test_arrow_date32_column(self):
    """A date32 field with ``unit`` metadata yields a "date" RenderColumn.

    The field's ``{"unit": "month"}`` metadata is expected to show up as the
    third ``RenderColumn`` argument.
    """
    date_field = pa.field("A", pa.date32(), metadata={"unit": "month"})
    actual = arrow_schema_to_render_columns(pa.schema([date_field]))
    expected = {"A": RenderColumn("A", "date", "month")}
    self.assertEqual(actual, expected)
def test_arrow_timestamp_column(self):
    """A nanosecond timestamp field yields a "timestamp" RenderColumn.

    No metadata is attached to the field, and the expected third
    ``RenderColumn`` argument is ``None``.
    """
    timestamp_field = pa.field("A", pa.timestamp("ns"))
    actual = arrow_schema_to_render_columns(pa.schema([timestamp_field]))
    expected = {"A": RenderColumn("A", "timestamp", None)}
    self.assertEqual(actual, expected)
def test_arrow_schema_uint8_column(self):
    """A uint8 field with ``format`` metadata yields a "number" RenderColumn.

    The field's ``{"format": "{:,d}"}`` metadata is expected to show up as
    the third ``RenderColumn`` argument.
    """
    number_field = pa.field("A", pa.uint8(), metadata={"format": "{:,d}"})
    actual = arrow_schema_to_render_columns(pa.schema([number_field]))
    expected = {"A": RenderColumn("A", "number", "{:,d}")}
    self.assertEqual(actual, expected)
def test_arrow_schema_category_column(self):
    """A dictionary-encoded string field yields a "text" RenderColumn.

    The field is ``dictionary(int32, string)``; the expected third
    ``RenderColumn`` argument is ``None``.
    """
    dictionary_type = pa.dictionary(pa.int32(), pa.string())
    category_field = pa.field("A", dictionary_type)
    actual = arrow_schema_to_render_columns(pa.schema([category_field]))
    expected = {"A": RenderColumn("A", "text", None)}
    self.assertEqual(actual, expected)
def call_render(
    module_spec: ModuleSpec, render: Callable, request: ttypes.RenderRequest
) -> ttypes.RenderResult:
    """Call a pandas-style ``render`` function on behalf of a Thrift request.

    The Arrow file ``request.input_filename`` (relative to
    ``request.basedir``) is loaded and converted to a pandas DataFrame, the
    request's tab outputs and params are converted, and then
    ``render(dataframe, params, **kwargs)`` is invoked.

    ``kwargs`` holds only the optional arguments ``render`` can receive:
    each of ``fetch_result``, ``settings``, ``tab_name`` and
    ``input_columns`` is passed when ``render`` declares it as a
    keyword-only argument, and all of them are passed when ``render``
    accepts ``**kwargs``.

    The value ``render`` returns is coerced to a ``ptypes.ProcessResult``,
    truncated in place if too big, converted with ``to_arrow()`` using the
    path ``request.basedir / request.output_filename``, and returned in
    Thrift form.
    """
    basedir = Path(request.basedir)
    table = load_trusted_arrow_file(basedir / request.input_filename)
    input_dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)

    tab_outputs = {}
    for tab_key, thrift_tab_output in request.tab_outputs.items():
        tab_outputs[tab_key] = _thrift_tab_output_to_pandas(thrift_tab_output, basedir)

    raw_params = thrift_json_object_to_pydict(request.params)
    params = _prepare_params(module_spec, raw_params, basedir, tab_outputs)

    argspec = inspect.getfullargspec(render)
    accepts_var_keywords = bool(argspec.varkw)  # if True, function accepts **kwargs
    keyword_only_names = argspec.kwonlyargs

    def wants(name):
        # An optional argument is passed when render() takes **kwargs or
        # names it explicitly as a keyword-only argument.
        return accepts_var_keywords or name in keyword_only_names

    optional_kwargs = {}

    if wants("fetch_result"):
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_path = basedir / request.fetch_result.filename
            render_errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(fetch_error.message))
                for fetch_error in request.fetch_result.errors
            ]
            if (
                fetch_path.stat().st_size != 0
                and not cjwparquet.file_has_parquet_magic_number(fetch_path)
            ):
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_path, errors=render_errors)
            else:
                # Empty file or Parquet file: hand over a DataFrame.
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_path),
                    errors=render_errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
        optional_kwargs["fetch_result"] = fetch_result

    if wants("settings"):
        # NOTE(review): `settings` is not bound in this function; presumably
        # a module-level name -- confirm.
        optional_kwargs["settings"] = settings

    if wants("tab_name"):
        optional_kwargs["tab_name"] = request.tab_name

    if wants("input_columns"):
        optional_kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    fallback_columns = read_columns(table, full=False)
    render_output = render(input_dataframe, params, **optional_kwargs)

    # raise ValueError if invalid
    process_result = ptypes.ProcessResult.coerce(
        render_output, try_fallback_columns=fallback_columns
    )
    process_result.truncate_in_place_if_too_big()

    output_path = basedir / request.output_filename
    return arrow_render_result_to_thrift(process_result.to_arrow(output_path))
def _thrift_tab_output_to_pandas(
    tab_output: ttypes.TabOutput, basedir: Path
) -> ptypes.TabOutput:
    """Convert a Thrift ``TabOutput`` into a ``ptypes.TabOutput``.

    The Arrow file ``tab_output.table_filename`` (relative to ``basedir``)
    is loaded; its schema supplies the render columns and its data is
    converted to a pandas DataFrame.
    """
    arrow_table = load_trusted_arrow_file(basedir / tab_output.table_filename)
    columns = arrow_schema_to_render_columns(arrow_table.schema)
    tab_name = tab_output.tab_name
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(arrow_table)
    return ptypes.TabOutput(tab_name, columns, dataframe)
def test_arrow_schema_text_column(self):
    """A plain string field yields a "text" RenderColumn.

    No metadata is attached to the field, and the expected third
    ``RenderColumn`` argument is ``None``.
    """
    text_field = pa.field("A", pa.string())
    actual = arrow_schema_to_render_columns(pa.schema([text_field]))
    expected = {"A": RenderColumn("A", "text", None)}
    self.assertEqual(actual, expected)