Example #1
0
def fetch_pandas(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> Union[ptypes.ProcessResult, types.FetchResult]:
    """
    Call `fetch()` and validate the result.

    Module authors should not replace this function: they should replace
    `fetch()` instead.

    This function validates the `fetch()` return value, to raise a helpful
    `ValueError` if the module code is buggy.

    `params` are the module parameters; `secrets` are the module's secret
    values. `last_fetch_result` / `input_table_parquet_path`, when present,
    are exposed to `fetch()` lazily via async getter callbacks.
    `output_path` is forwarded to `fetch()` only if it declares that
    keyword argument.
    """
    # Introspect the module author's `fetch()` signature so we pass only
    # the keyword arguments it declares (or all of them, if it accepts
    # **kwargs).
    spec = inspect.getfullargspec(fetch)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs

    if varkw or "secrets" in kwonlyargs:
        kwargs["secrets"] = secrets

    if varkw or "get_input_dataframe" in kwonlyargs:

        # Lazy async accessor: only reads the input Parquet file if the
        # module actually awaits it.
        async def get_input_dataframe():
            if input_table_parquet_path is None:
                return None
            else:
                return __parquet_to_pandas(input_table_parquet_path)

        kwargs["get_input_dataframe"] = get_input_dataframe

    if varkw or "get_stored_dataframe" in kwonlyargs:

        # Lazy async accessor for the previously-stored fetch result.
        async def get_stored_dataframe():
            if last_fetch_result is None:
                return None
            else:
                return __parquet_to_pandas(last_fetch_result.path)

        kwargs["get_stored_dataframe"] = get_stored_dataframe

    if varkw or "output_path" in kwonlyargs:
        kwargs["output_path"] = output_path

    result = fetch(params, **kwargs)
    # `fetch()` may be an async def; run it to completion if so.
    if asyncio.iscoroutine(result):
        result = asyncio.run(result)
    # A (Path, str) 2-tuple means "file written to Path, with error message".
    if (isinstance(result, tuple) and len(result) == 2
            and isinstance(result[0], Path) and isinstance(result[1], str)):
        return types.FetchResult(
            result[0],
            [types.RenderError(types.I18nMessage.TODO_i18n(result[1]))])
    # A bare Path means "file written, no errors".
    elif isinstance(result, Path):
        return types.FetchResult(result)
    # Anything else (DataFrame, dict, str, ...) goes through ProcessResult
    # coercion, which raises ValueError on invalid module output.
    else:
        return ptypes.ProcessResult.coerce(result)
Example #2
0
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    """Render using `cjwkernel.types` data types.

    The result will be encoded as a Parquet file.

    Module authors are encouraged to replace this function, because the
    `fetch()` signature deals in dataframes instead of in raw data.
    """
    pandas_result: Union[ptypes.ProcessResult,
                         types.FetchResult] = fetch_pandas(
                             params=__arrow_param_to_pandas_param(params),
                             secrets=secrets,
                             last_fetch_result=last_fetch_result,
                             input_table_parquet_path=input_table_parquet_path,
                             output_path=output_path,
                         )
    if isinstance(pandas_result, ptypes.ProcessResult):
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            hacky_result = pandas_result.to_arrow(arrow_path)
            # Fix: consume the Arrow data *inside* the tempfile context.
            # Previously the `with` block closed before these lines ran, so
            # `arrow_path` was deleted while `hacky_result.table.table` could
            # still be backed by that (possibly memory-mapped) file.
            if hacky_result.table.path:
                # Non-empty table: re-encode it as Parquet at output_path.
                cjwparquet.write(output_path, hacky_result.table.table)
            else:
                # Empty result: an empty file signals "no table".
                output_path.write_bytes(b"")
        return types.FetchResult(output_path, hacky_result.errors)
    else:  # it's already a types.FetchResult
        return pandas_result
Example #3
0
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """
    Decode a Thrift render request, invoke the module's `render()`, and
    encode its output as a Thrift render result.

    All file paths in `request` are resolved relative to `request.basedir`.
    Raises `ValueError` (via `ProcessResult.coerce`) if `render()` returns
    an invalid value.
    """
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    # Convert each dependent tab's output to pandas form for the module.
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(module_spec,
                             thrift_json_object_to_pydict(request.params),
                             basedir, tab_outputs)
    # Introspect `render()` so we pass only the keyword arguments it
    # declares (or all of them, if it accepts **kwargs).
    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            # Empty file or Parquet data => decode to a pandas DataFrame.
            if (fetch_result_path.stat().st_size == 0
                    or cjwparquet.file_has_parquet_magic_number(
                        fetch_result_path)):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path,
                                                 errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        # NOTE(review): `settings` is presumably a module-level global here
        # — not visible in this chunk; confirm where it is defined.
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)

    # raise ValueError if invalid
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns)
    pandas_result.truncate_in_place_if_too_big()

    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)
Example #4
0
 def test_fetch_result_from_thrift_happy_path(self):
     """Convert a Thrift FetchResult (one error) to its arrow-types twin."""
     with tempfile.NamedTemporaryFile(dir=str(self.basedir)) as tf:
         file_path = Path(tf.name)
         thrift_value = ttypes.FetchResult(
             file_path.name,
             [ttypes.RenderError(ttypes.I18nMessage("hi", {}), [])],
         )
         converted = types.thrift_fetch_result_to_arrow(
             thrift_value, self.basedir
         )
         expected = types.FetchResult(
             file_path, [types.RenderError(types.I18nMessage("hi"))]
         )
         self.assertEqual(converted, expected)