예제 #1
0
 def _test_fetch(
     self,
     fetch_fn,
     *,
     params=None,
     secrets=None,
     last_fetch_result=None,
     input_table_parquet_path=None,
     output_filename=None,
 ):
     """Call `module.fetch_thrift()` with `module.fetch` patched to `fetch_fn`.

     Builds a `ttypes.FetchRequest` rooted at `self.basedir`, sends it
     through `module.fetch_thrift()` and converts the Thrift result back
     with `thrift_fetch_result_to_arrow()`.

     Args:
         fetch_fn: replacement for `module.fetch` while this call runs.
         params: dict wrapped in `Params`; defaults to an empty dict.
         secrets: dict wrapped in `RawParams`; defaults to an empty dict.
         last_fetch_result: converted with `arrow_fetch_result_to_thrift()`
             when given; otherwise the request carries `None`.
         input_table_parquet_path: only its `.name` is put in the request;
             `None` means no input table.
         output_filename: filename passed to the request. When omitted, a
             temporary file is created inside `self.basedir`.

     Returns:
         Whatever `thrift_fetch_result_to_arrow(thrift_result, self.basedir)`
         returns.
     """
     # Build fresh dicts on every call. A `{}` default in the signature is
     # created once and shared by all calls, so anything that kept or mutated
     # it would leak state from one test into the next.
     if params is None:
         params = {}
     if secrets is None:
         secrets = {}

     with ExitStack() as ctx:
         # The patch is undone when the ExitStack unwinds.
         ctx.enter_context(patch.object(module, "fetch", fetch_fn))
         if output_filename is None:
             # Make a temporary output filename -- this lets `fetch()`
             # complete, but callers won't be able to inspect the data it
             # wrote because the file is cleaned up when this block exits.
             output_filename = ctx.enter_context(
                 tempfile_context(dir=self.basedir)).name

         thrift_last_result = (
             arrow_fetch_result_to_thrift(last_fetch_result)
             if last_fetch_result is not None else None)
         input_filename = (
             input_table_parquet_path.name
             if input_table_parquet_path is not None else None)

         thrift_result = module.fetch_thrift(
             ttypes.FetchRequest(
                 basedir=str(self.basedir),
                 params=arrow_params_to_thrift(Params(params)),
                 secrets=arrow_raw_params_to_thrift(RawParams(secrets)),
                 last_fetch_result=thrift_last_result,
                 input_table_parquet_filename=input_filename,
                 output_filename=output_filename,
             ))
         # Convert while the ExitStack is still open, i.e. before any
         # temporary output file is removed.
         return thrift_fetch_result_to_arrow(thrift_result, self.basedir)
예제 #2
0
def run_in_sandbox(compiled_module: CompiledModule, function: str,
                   args: List[Any]) -> None:
    """Execute user code, call one framework entry point, emit its result.

    The compiled user code is executed in a brand-new module object, its
    recognised functions are grafted onto `cjwkernel.pandas.module`, and then
    `function` is invoked there with `args`. A non-None result is serialized
    with Thrift's binary protocol onto `sys.__stdout__.buffer`.

    Raises:
        NotImplementedError: if `function` is not one of the four supported
            `*_thrift` entry points. This is raised only after the user code
            has already been executed and grafted.
    """
    # TODO sandbox -- will need an OS `clone()` with namespace, cgroups, ....

    # A freshly created module gives the user code an empty namespace to run
    # in; registering it in `sys.modules` simulates a regular "import".
    qualified_name = f"rawmodule.{compiled_module.module_slug}"
    user_module = types.ModuleType(qualified_name)
    sys.modules[qualified_name] = user_module
    user_namespace = vars(user_module)

    exec(compiled_module.code_object, user_namespace)

    # From this point on nothing can be trusted: the code object may be
    # malicious, so every following statement has undefined behavior. The
    # parent process is responsible for handling every possible outcome.

    # Graft user-supplied functions over the framework defaults, so that e.g.
    # the default `render_pandas()` ends up calling the user's `render()`.
    # This rewrites global state, which is acceptable only because this code
    # runs in a forked child.
    framework = cjwkernel.pandas.module
    framework_namespace = vars(framework)
    overridable_names = (
        "fetch",
        "fetch_arrow",
        "fetch_pandas",
        "fetch_thrift",
        "migrate_params",
        "migrate_params_thrift",
        "render",
        "render_arrow",
        "render_arrow_v1",
        "render_pandas",
        "render_thrift",
    )
    framework_namespace.update({
        name: user_namespace[name]
        for name in overridable_names
        if name in user_namespace
    })
    # Module frameworks read their params from the `ModuleSpec` global.
    framework_namespace["ModuleSpec"] = load_spec(
        compiled_module.module_spec_dict)

    supported_entry_points = (
        "render_thrift",
        "migrate_params_thrift",
        "validate_thrift",
        "fetch_thrift",
    )
    if function not in supported_entry_points:
        raise NotImplementedError
    result = getattr(framework, function)(*args)

    # Write to the original stdout (`sys.__stdout__`), not whatever
    # `sys.stdout` may have been replaced with.
    out_transport = thrift.transport.TTransport.TFileObjectTransport(
        sys.__stdout__.buffer)
    out_protocol = thrift.protocol.TBinaryProtocol.TBinaryProtocol(
        out_transport)
    if result is not None:
        result.write(out_protocol)
    out_transport.flush()