예제 #1
0
 def test_arrow_date32_column(self):
     # A date32 field carrying {"unit": "month"} metadata is expected to map
     # to a "date" RenderColumn whose third argument is that unit.
     schema = pa.schema(
         [pa.field("A", pa.date32(), metadata={"unit": "month"})]
     )
     actual = arrow_schema_to_render_columns(schema)
     expected = {"A": RenderColumn("A", "date", "month")}
     self.assertEqual(actual, expected)
예제 #2
0
 def test_arrow_timestamp_column(self):
     # A nanosecond timestamp field without metadata is expected to map to a
     # "timestamp" RenderColumn whose third argument is None.
     schema = pa.schema([pa.field("A", pa.timestamp("ns"))])
     actual = arrow_schema_to_render_columns(schema)
     expected = {"A": RenderColumn("A", "timestamp", None)}
     self.assertEqual(actual, expected)
예제 #3
0
 def test_arrow_schema_uint8_column(self):
     # A uint8 field carrying {"format": "{:,d}"} metadata is expected to map
     # to a "number" RenderColumn whose third argument is that format string.
     schema = pa.schema(
         [pa.field("A", pa.uint8(), metadata={"format": "{:,d}"})]
     )
     actual = arrow_schema_to_render_columns(schema)
     expected = {"A": RenderColumn("A", "number", "{:,d}")}
     self.assertEqual(actual, expected)
예제 #4
0
 def test_arrow_schema_category_column(self):
     # A dictionary-encoded string field (int32 indices) is expected to map
     # to a plain "text" RenderColumn whose third argument is None.
     dictionary_type = pa.dictionary(pa.int32(), pa.string())
     schema = pa.schema([pa.field("A", dictionary_type)])
     actual = arrow_schema_to_render_columns(schema)
     expected = {"A": RenderColumn("A", "text", None)}
     self.assertEqual(actual, expected)
예제 #5
0
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Run a module's ``render`` callable for one Thrift render request.

    The input table and every tab output are loaded from files under
    ``request.basedir`` and converted to pandas. ``render`` is then called as
    ``render(dataframe, params, **kwargs)``, where ``kwargs`` holds only the
    optional arguments it declares as keyword-only (or all of them when it
    accepts ``**kwargs``): ``fetch_result``, ``settings``, ``tab_name`` and
    ``input_columns``.

    The return value is coerced to a ``ptypes.ProcessResult`` (with the input
    table's columns passed as fallback columns), truncated in place if too
    big, written to ``request.output_filename`` and converted to Thrift.

    Raises:
        ValueError: if ``render`` returned something that cannot be coerced.
    """
    root = Path(request.basedir)
    table = load_trusted_arrow_file(root / request.input_filename)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)

    tab_outputs = {}
    for tab_key, thrift_tab_output in request.tab_outputs.items():
        tab_outputs[tab_key] = _thrift_tab_output_to_pandas(
            thrift_tab_output, root
        )

    params = _prepare_params(
        module_spec,
        thrift_json_object_to_pydict(request.params),
        root,
        tab_outputs,
    )

    argspec = inspect.getfullargspec(render)
    accepts_any_kwarg = bool(argspec.varkw)  # render() declares **kwargs

    def wants(name: str) -> bool:
        # An optional argument is passed when render() names it as
        # keyword-only, or when render() swallows arbitrary keywords.
        return accepts_any_kwarg or name in argspec.kwonlyargs

    kwargs = {}

    if wants("fetch_result"):
        thrift_fetch_result = request.fetch_result
        if thrift_fetch_result is None:
            kwargs["fetch_result"] = None
        else:
            fetch_path = root / thrift_fetch_result.filename
            # Incoming errors are FetchErrors; render() gets RenderErrors.
            render_errors = [
                RenderError(thrift_i18n_message_to_arrow(fetch_error.message))
                for fetch_error in thrift_fetch_result.errors
            ]
            if (fetch_path.stat().st_size != 0
                    and not cjwparquet.file_has_parquet_magic_number(
                        fetch_path)):
                # Neither empty nor Parquet: hand the raw file over.
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                kwargs["fetch_result"] = types.FetchResult(
                    path=fetch_path, errors=render_errors
                )
            else:
                # Empty or Parquet file: read it as a dataframe. Columns are
                # inferred -- the fetch interface doesn't handle formats.
                # (TODO nix pandas_v0 fetching altogether by rewriting all
                # modules)
                kwargs["fetch_result"] = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_path),
                    errors=render_errors,
                )

    if wants("settings"):
        kwargs["settings"] = settings
    if wants("tab_name"):
        kwargs["tab_name"] = request.tab_name
    if wants("input_columns"):
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    fallback_columns = read_columns(table, full=False)

    # coerce() raises ValueError if render() returned something invalid
    coerced = ptypes.ProcessResult.coerce(
        render(dataframe, params, **kwargs),
        try_fallback_columns=fallback_columns,
    )
    coerced.truncate_in_place_if_too_big()

    return arrow_render_result_to_thrift(
        coerced.to_arrow(root / request.output_filename)
    )
예제 #6
0
def _thrift_tab_output_to_pandas(tab_output: ttypes.TabOutput,
                                 basedir: Path) -> ptypes.TabOutput:
    """Convert a Thrift tab output into its pandas counterpart.

    Loads the Arrow file ``tab_output.table_filename`` from ``basedir`` and
    returns a ``ptypes.TabOutput`` built from the tab name, the render
    columns derived from the table's schema, and the table converted to a
    pandas dataframe.
    """
    arrow_table = load_trusted_arrow_file(basedir / tab_output.table_filename)
    columns = arrow_schema_to_render_columns(arrow_table.schema)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(
        arrow_table
    )
    return ptypes.TabOutput(tab_output.tab_name, columns, dataframe)
예제 #7
0
 def test_arrow_schema_text_column(self):
     # A plain string field without metadata is expected to map to a "text"
     # RenderColumn whose third argument is None.
     schema = pa.schema([pa.field("A", pa.string())])
     actual = arrow_schema_to_render_columns(schema)
     expected = {"A": RenderColumn("A", "text", None)}
     self.assertEqual(actual, expected)