Example 1
def call_render(render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    basedir = Path(request.basedir)
    table = load_trusted_arrow_file(basedir / request.input_filename)
    params = thrift_json_object_to_pydict(request.params)

    tab_outputs = {
        k: TabOutput(
            tab_name=v.tab_name,
            table=load_trusted_arrow_file(basedir / v.table_filename),
        )
        for k, v in request.tab_outputs.items()
    }

    uploaded_files = {
        k: UploadedFile(
            name=v.name,
            path=(basedir / v.filename),
            uploaded_at=datetime.datetime.utcfromtimestamp(
                v.uploaded_at_timestampus / 1000000.0),
        )
        for k, v in request.uploaded_files.items()
    }

    if request.fetch_result is None:
        fetch_result = None
    else:
        fetch_result = thrift_fetch_result_to_arrow(request.fetch_result,
                                                    basedir)

    raw_result = render(
        table,
        params,
        settings=settings,
        tab_name=request.tab_name,
        tab_outputs=tab_outputs,
        uploaded_files=uploaded_files,
        fetch_result=fetch_result,
    )

    if not isinstance(raw_result, ArrowRenderResult):
        # Crash. The module author wrote a buggy module.
        raise ValueError(
            "render_arrow_v1() must return a cjwmodule.arrow.types.ArrowRenderResult"
        )

    with pa.ipc.RecordBatchFileWriter(
            basedir / request.output_filename,
            schema=raw_result.table.schema) as writer:
        writer.write_table(raw_result.table)

    return ttypes.RenderResult(
        errors=[arrow_render_error_to_thrift(e) for e in raw_result.errors],
        json=pydict_to_thrift_json_object(raw_result.json),
    )
Example 2
def _DEPRECATED_overwrite_to_fix_arrow_table_schema(
        path: Path, fallback_schema: pa.Schema) -> None:
    if not path.stat().st_size:
        return

    table = load_trusted_arrow_file(path)

    untyped_schema = table.schema
    fields = [
        __DEPRECATED_fix_field(
            untyped_schema.field(i),
            (None if fallback_schema.get_field_index(name) == -1 else
             fallback_schema.field(fallback_schema.get_field_index(name))),
        ) for i, name in enumerate(untyped_schema.names)
    ]
    schema = pa.schema(fields)

    # Overwrite with new data
    #
    # We don't short-circuit by comparing schemas: two pa.Schema values
    # with different number formats can still compare as equal.
    #
    # We write a separate file to /var/tmp and then copy it: our sandbox
    # won't let us `rename(2)` in `path`'s directory.
    with tempfile_context(dir="/var/tmp") as rewrite_path:
        with pa.ipc.RecordBatchFileWriter(rewrite_path, schema) as writer:
            writer.write_table(pa.table(table.columns, schema=schema))
        shutil.copyfile(rewrite_path, path)
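Every example on this page calls load_trusted_arrow_file(), whose definition is not shown. A minimal stand-in, assuming "trusted" means the file is already-validated Arrow IPC data that is safe to memory-map:

from pathlib import Path

import pyarrow as pa

def load_trusted_arrow_file(path: Path) -> pa.Table:
    # Hypothetical stand-in: memory-map an already-validated Arrow IPC
    # file and read it into a single table. The real implementation may
    # perform extra validation.
    with pa.memory_map(str(path), "r") as source:
        return pa.ipc.open_file(source).read_all()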
Example 3
    def test_execute_empty_tab(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        tab_flow = TabFlow(Tab(tab.slug, tab.name), [])
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(result, StepResult(path, []))
            self.assertEqual(load_trusted_arrow_file(path), make_table())
Example 4
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(module_spec,
                             thrift_json_object_to_pydict(request.params),
                             basedir, tab_outputs)
    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            if (fetch_result_path.stat().st_size == 0
                    or cjwparquet.file_has_parquet_magic_number(
                        fetch_result_path)):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path,
                                                 errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)

    # raise ValueError if invalid
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns)
    pandas_result.truncate_in_place_if_too_big()

    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)
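The inspect.getfullargspec() dance above (repeated in call_fetch() in Example 9) is a general pattern: pass a keyword argument only when the callee declares it or accepts **kwargs. Isolated into a hypothetical helper:

import inspect
from typing import Any, Callable, Dict

def supported_kwargs(fn: Callable, available: Dict[str, Any]) -> Dict[str, Any]:
    # Keep only the keyword arguments `fn` can accept, mirroring the
    # varkw/kwonlyargs checks in call_render() above.
    spec = inspect.getfullargspec(fn)
    if spec.varkw:  # fn accepts **kwargs: pass everything
        return dict(available)
    return {k: v for k, v in available.items() if k in spec.kwonlyargs}

With such a helper, the call site would collapse to render(dataframe, params, **supported_kwargs(render, all_kwargs)) -- at the cost of building values like fetch_result even when the module never asks for them, which is why the original computes each one lazily behind its own if.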
Example 5
def _thrift_tab_output_to_pandas(tab_output: ttypes.TabOutput,
                                 basedir: Path) -> ptypes.TabOutput:
    table = load_trusted_arrow_file(basedir / tab_output.table_filename)
    render_columns = arrow_schema_to_render_columns(table.schema)
    return ptypes.TabOutput(
        tab_output.tab_name,
        render_columns,
        cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table),
    )
Example 6
    def test_execute_partial_cache_hit(self):
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        # step1: cached result is fresh. Should not render.
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(workflow, step1, workflow.last_delta_id,
                             make_table(make_column("A", ["a"])))
        # step2: cached result is stale, so must be re-rendered
        step2 = tab.steps.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(
            workflow,
            step2,
            workflow.last_delta_id - 1,
            make_table(make_column("B", ["b"])),
        )

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        new_table = make_table(make_column("C", ["c"]))

        with patch.object(Kernel, "render",
                          side_effect=mock_render(new_table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result, StepResult(path, [Column("C", ColumnType.Text())]))
                assert_arrow_table_equals(load_trusted_arrow_file(path),
                                          new_table)

            Kernel.render.assert_called_once()  # step2, not step1

            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Example 7
    def test_execute_cache_hit(self):
        cached_table1 = make_table(make_column("A", [1]))
        cached_table2 = make_table(make_column("B", [2], format="${:,}"))
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step1 = tab.steps.create(order=0,
                                 slug="step-1",
                                 last_relevant_delta_id=workflow.last_delta_id)
        write_to_rendercache(workflow, step1, workflow.last_delta_id,
                             cached_table1)
        step2 = tab.steps.create(order=1,
                                 slug="step-2",
                                 last_relevant_delta_id=workflow.last_delta_id)
        write_to_rendercache(workflow, step2, workflow.last_delta_id,
                             cached_table2)

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        unwanted_table = make_table(make_column("No", ["bad"]))
        with patch.object(Kernel,
                          "render",
                          side_effect=mock_render(unwanted_table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result,
                    StepResult(
                        path,
                        [Column("B", ColumnType.Number(format="${:,}"))]),
                )
                assert_arrow_table_equals(load_trusted_arrow_file(path),
                                          cached_table2)

            Kernel.render.assert_not_called()
Example 8
    def test_execute_cache_miss(self):
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        step2 = tab.steps.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        table = make_table(make_column("A", ["a"]))

        with patch.object(Kernel, "render", side_effect=mock_render(table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result, StepResult(path, [Column("A", ColumnType.Text())]))
                assert_arrow_table_equals(load_trusted_arrow_file(path), table)

            self.assertEqual(Kernel.render.call_count, 2)  # both steps rendered
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
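mock_render() itself is not shown in these tests. From the way it is patched in as Kernel.render's side_effect, and from the output_filename assertion, a plausible reconstruction follows (the full Kernel.render signature, and whether it must also return a result object, are assumptions):

import pyarrow as pa

def mock_render(table: pa.Table):
    def render(*args, output_filename: str, **kwargs):
        # Ignore the real module: write the canned table as Arrow IPC to
        # the file the kernel was asked to produce.
        with pa.ipc.RecordBatchFileWriter(output_filename,
                                          schema=table.schema) as writer:
            writer.write_table(table)
    return render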
Example 9
def call_fetch(fetch: Callable,
               request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Call `fetch()` and validate the result.

    Module code may contain errors. This function and `fetch()` should strive
    to raise developer-friendly errors in the case of bugs -- including
    unexpected input.
    """
    # thrift => pandas
    basedir = Path(request.basedir)
    params: Dict[str, Any] = thrift_json_object_to_pydict(request.params)
    output_path = basedir / request.output_filename

    spec = inspect.getfullargspec(fetch)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs

    if varkw or "secrets" in kwonlyargs:
        kwargs["secrets"] = thrift_json_object_to_pydict(request.secrets)
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "get_input_dataframe" in kwonlyargs:

        async def get_input_dataframe():
            if request.input_table_parquet_filename is None:
                return None
            else:
                return _parquet_to_pandas(basedir /
                                          request.input_table_parquet_filename)

        kwargs["get_input_dataframe"] = get_input_dataframe

    if varkw or "output_path" in kwonlyargs:
        kwargs["output_path"] = output_path

    result = fetch(params, **kwargs)
    if asyncio.iscoroutine(result):
        result = asyncio.run(result)

    if isinstance(result, tuple) and len(result) == 2 and isinstance(
            result[0], Path):
        errors = ptypes.coerce_RenderError_list(result[1])
    elif isinstance(result, Path):
        errors = []
    elif isinstance(result, list):
        errors = ptypes.coerce_RenderError_list(result)
    else:
        pandas_result = ptypes.ProcessResult.coerce(result)
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            if pandas_result.columns:
                hacky_result = pandas_result.to_arrow(arrow_path)
                table = load_trusted_arrow_file(arrow_path)
                cjwparquet.write(output_path, table)
                errors = hacky_result.errors
            else:
                output_path.write_bytes(b"")
                errors = pandas_result.errors

    return ttypes.FetchResult(
        filename=request.output_filename,
        errors=[arrow_render_error_to_thrift(e) for e in errors],
    )
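For reference, the simplest module-side fetch() that exercises the final branch above is one that returns a pandas DataFrame: it routes through ProcessResult.coerce() and is written to output_path as Parquet. A hypothetical example:

import pandas as pd

def fetch(params, **kwargs):
    # A plain DataFrame return takes the ProcessResult.coerce() branch in
    # call_fetch() above; returning a Path, a (Path, errors) tuple or an
    # error list would take the earlier branches instead.
    return pd.DataFrame({"A": [1, 2, 3]})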