Example #1
0
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """
    Invoke a Pandas-style `render()` on behalf of a Thrift render request.

    The Arrow file `request.input_filename` (relative to `request.basedir`)
    is loaded and converted to a Pandas dataframe; params are prepared from
    `request.params` and the converted tab outputs. `render()` is then called
    as `render(dataframe, params, **kwargs)`.

    The optional keyword arguments `fetch_result`, `settings`, `tab_name` and
    `input_columns` are passed only when `render()` lists them as
    keyword-only arguments or accepts `**kwargs`.

    The value `render()` returns is coerced to a `ptypes.ProcessResult`,
    truncated in place if too big, converted to Arrow using the path
    `request.basedir / request.output_filename`, and returned in Thrift form.
    """
    basedir = Path(request.basedir)
    table = load_trusted_arrow_file(basedir / request.input_filename)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)

    tab_outputs = {}
    for key, thrift_tab_output in request.tab_outputs.items():
        tab_outputs[key] = _thrift_tab_output_to_pandas(
            thrift_tab_output, basedir)

    raw_params = thrift_json_object_to_pydict(request.params)
    params = _prepare_params(module_spec, raw_params, basedir, tab_outputs)

    argspec = inspect.getfullargspec(render)
    # If render() declares **kwargs, it receives every optional argument.
    accepts_any_kwarg = bool(argspec.varkw)
    keyword_only = argspec.kwonlyargs

    def wants(name: str) -> bool:
        # True when render() can receive keyword argument `name`.
        return accepts_any_kwarg or name in keyword_only

    kwargs = {}

    if wants("fetch_result"):
        thrift_fetch_result = request.fetch_result
        if thrift_fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / thrift_fetch_result.filename
            # Data comes in as FetchError and we return RenderError.
            errors = []
            for fetch_error in thrift_fetch_result.errors:
                errors.append(
                    RenderError(
                        thrift_i18n_message_to_arrow(fetch_error.message)))

            # An empty file is handled the same way as a Parquet file.
            is_empty = fetch_result_path.stat().st_size == 0
            if is_empty or cjwparquet.file_has_parquet_magic_number(
                    fetch_result_path):
                # infer columns -- the fetch interface doesn't handle formats
                # (TODO nix pandas_v0 fetching altogether by rewriting all
                # modules)
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path,
                                                 errors=errors)
        kwargs["fetch_result"] = fetch_result

    if wants("settings"):
        kwargs["settings"] = settings

    if wants("tab_name"):
        kwargs["tab_name"] = request.tab_name

    if wants("input_columns"):
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    # Read before render() is called; used below as fallback columns.
    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)

    # coerce() raises ValueError if raw_result is invalid
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns)
    pandas_result.truncate_in_place_if_too_big()

    output_path = basedir / request.output_filename
    return arrow_render_result_to_thrift(pandas_result.to_arrow(output_path))
Example #2
0
def render_arrow(
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[types.FetchResult],
    output_path: Path,
) -> types.RenderResult:
    """
    Render with `cjwkernel.types` inputs and outputs.

    Any Arrow output is written to `output_path`.

    This default implementation converts everything to Pandas and delegates
    to `render_pandas()`. Module authors are encouraged to override it:
    Arrow tables are simpler and more memory-efficient than Pandas tables.
    A "rename columns" module, for example, would ideally use this signature,
    since Arrow can pass data through without consuming excessive RAM.

    The value returned by `render_pandas()` is not validated here.
    """
    pandas_table = __arrow_to_pandas(table)

    # Input tabs are discovered from the params, keyed by tab slug.
    pandas_input_tabs = {}
    for tab_output in _find_tab_outputs(params):
        slug = tab_output.tab.slug
        pandas_input_tabs[slug] = __arrow_tab_output_to_pandas(tab_output)

    if fetch_result is None:
        pandas_fetch_result = None
    elif (fetch_result.path.stat().st_size == 0
          or parquet.file_has_parquet_magic_number(fetch_result.path)):
        # Empty file or Parquet file: hand render_pandas() a ProcessResult.
        fetched_table = __parquet_to_pandas(fetch_result.path)
        fetch_errors = []
        for error in fetch_result.errors:
            fetch_errors.append(ptypes.ProcessResultError.from_arrow(error))
        pandas_fetch_result = ptypes.ProcessResult(fetched_table, fetch_errors)
    else:
        # Anything else is passed along as the original FetchResult.
        pandas_fetch_result = fetch_result

    input_table_shape = ptypes.TableShape.from_arrow(table.metadata)
    pandas_params = _arrow_param_to_pandas_param(params)

    pandas_result: ptypes.ProcessResult = render_pandas(
        input_table=pandas_table,
        input_table_shape=input_table_shape,
        params=pandas_params,
        tab_name=tab_name,
        input_tabs=pandas_input_tabs,
        fetch_result=pandas_fetch_result,
    )
    return pandas_result.to_arrow(output_path)
Example #3
0
def render_arrow(
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    input_tabs: Dict[str, types.TabOutput],
    fetch_result: Optional[types.FetchResult],
    output_path: pathlib.Path,
) -> types.RenderResult:
    """
    Render with `cjwkernel.types` inputs and outputs.

    Any Arrow output is written to `output_path`.

    This default implementation converts the table, the input tabs and the
    fetch result to Pandas and delegates to `render_pandas()`. Module authors
    are encouraged to override it: Arrow tables are simpler and more
    memory-efficient than Pandas tables. A "rename columns" module, for
    example, would ideally use this signature, since Arrow can pass data
    through without consuming excessive RAM.

    The value returned by `render_pandas()` is not validated here.
    """
    pandas_table = __arrow_to_pandas(table.table)

    pandas_input_tabs = {}
    for key, tab_output in input_tabs.items():
        pandas_input_tabs[key] = __arrow_to_pandas(tab_output)

    if fetch_result is None:
        pandas_fetch_result = None
    else:
        fetched_table = __parquet_to_pandas(fetch_result.path)
        # Errors are flattened to one string; "" when there are none.
        if fetch_result.errors:
            error_text = str(fetch_result.errors)
        else:
            error_text = ""
        pandas_fetch_result = ptypes.ProcessResult(
            fetched_table,
            error=error_text,
        )

    input_table_shape = ptypes.TableShape.from_arrow(table.metadata)

    pandas_result: ptypes.ProcessResult = render_pandas(
        input_table=pandas_table,
        input_table_shape=input_table_shape,
        params=params,
        tab_name=tab_name,
        input_tabs=pandas_input_tabs,
        fetch_result=pandas_fetch_result,
    )
    return pandas_result.to_arrow(output_path)
Example #4
0
def __render_pandas(
    *,
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[types.FetchResult],
    output_path: Path,
) -> types.RenderResult:
    """
    Call `render()` using the Pandas calling convention.

    What this wrapper does:

    * Converts the input Arrow table to a Pandas dataframe.
    * Converts the input params to Pandas format, producing extra arguments
      such as `input_tabs` when `render()` accepts them.
    * Converts the input `fetch_result` to a Pandas-based `ProcessResult`
      when its file is empty or is a valid Parquet file; otherwise passes it
      through unchanged.
    * Coerces the output of `render()` (dataframe, errors, json) and converts
      it to Arrow, using `output_path`.

    Optional keyword arguments are passed only when `render()` lists them as
    keyword-only arguments or accepts `**kwargs`.
    """
    # Convert input arguments
    pandas_table = __arrow_to_pandas(table)
    pandas_params = __arrow_param_to_pandas_param(params)

    argspec = inspect.getfullargspec(render)
    # If render() declares **kwargs, it receives every optional argument.
    accepts_any_kwarg = bool(argspec.varkw)
    keyword_only = argspec.kwonlyargs

    def wants(name: str) -> bool:
        # True when render() can receive keyword argument `name`.
        return accepts_any_kwarg or name in keyword_only

    kwargs = {}

    if wants("fetch_result"):
        if fetch_result is None:
            pandas_fetch_result = None
        elif (fetch_result.path.stat().st_size == 0
              or cjwparquet.file_has_parquet_magic_number(
                  fetch_result.path)):
            fetched_table = __parquet_to_pandas(fetch_result.path)
            # NOTE(review): errors are handed over unconverted -- confirm
            # ProcessResult accepts these error objects as-is.
            pandas_fetch_result = ptypes.ProcessResult(
                fetched_table, fetch_result.errors)
        else:
            pandas_fetch_result = fetch_result
        kwargs["fetch_result"] = pandas_fetch_result

    if wants("settings"):
        kwargs["settings"] = settings

    if wants("tab_name"):
        kwargs["tab_name"] = tab_name

    if wants("input_columns"):
        input_columns = {}
        for column in table.metadata.columns:
            input_columns[column.name] = ptypes.RenderColumn(
                column.name,
                column.type.name,
                # Column types without a `format` attribute yield None.
                getattr(column.type, "format", None),
            )
        kwargs["input_columns"] = input_columns

    if wants("input_tabs"):
        input_tabs = {}
        for tab_output in __find_tab_outputs(params):
            slug = tab_output.tab.slug
            input_tabs[slug] = __arrow_tab_output_to_pandas(tab_output)
        kwargs["input_tabs"] = input_tabs

    # call render()
    raw_result = render(pandas_table, pandas_params, **kwargs)

    # Coerce outputs; coerce() raises ValueError if raw_result is invalid
    result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=table.metadata.columns
    )
    result.truncate_in_place_if_too_big()

    return result.to_arrow(output_path)