コード例 #1
0
 def test_default_render_returns_fetch_result(self):
     # Functionality used by libraryofcongress
     #
     # The request carries an input table {"A": [1]} and a fetch result that
     # points at a Parquet file holding {"A": [2]} plus one warning. The
     # rendered output is expected to be the fetched data and that warning.
     with ExitStack() as stack:
         input_table = stack.enter_context(
             arrow_table_context({"A": [1]}, dir=self.basedir)
         )
         fetched_file = stack.enter_context(
             parquet_file({"A": [2]}, dir=self.basedir)
         )
         # Only the final path component goes into the fetch result.
         fetched_basename = Path(fetched_file.name).name
         output_name = stack.enter_context(
             tempfile_context(dir=self.basedir)
         ).name
         fetch_warning = RenderError(I18nMessage.TODO_i18n("A warning"))
         request = ttypes.RenderRequest(
             str(self.basedir),
             input_table.to_thrift(),
             Params({}).to_thrift(),
             ttypes.Tab("tab-1", "Tab 1"),
             ttypes.FetchResult(fetched_basename, [fetch_warning.to_thrift()]),
             output_name,
         )
         actual = RenderResult.from_thrift(
             module.render_thrift(request), self.basedir
         )
         expected = RenderResult(
             arrow_table({"A": [2]}),
             [RenderError(I18nMessage.TODO_i18n("A warning"))],
         )
         assert_render_result_equals(actual, expected)
コード例 #2
0
 def test_fetch_result_from_thrift_disallow_non_file(self):
     # A filename that names a directory inside basedir must be rejected.
     with tempfile.TemporaryDirectory(
         dir=str(self.basedir)
     ) as subdir, self.assertRaisesRegex(ValueError, "be a regular file"):
         thrift_value = ttypes.FetchResult(Path(subdir).name, [])
         types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)
コード例 #3
0
 def test_fetch_result_from_thrift_happy_path(self):
     # A thrift FetchResult naming an existing file in basedir converts to a
     # FetchResult holding the file's full path and the converted error.
     with tempfile.NamedTemporaryFile(dir=str(self.basedir)) as tmp:
         thrift_error = ttypes.RenderError(ttypes.I18nMessage("hi", {}), [])
         thrift_value = ttypes.FetchResult(Path(tmp.name).name, [thrift_error])
         actual = types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)
         expected = types.FetchResult(
             Path(tmp.name),
             [types.RenderError(types.I18nMessage("hi"))],
         )
         self.assertEqual(actual, expected)
コード例 #4
0
ファイル: kernel.py プロジェクト: vishalbelsare/cjworkbench
    def fetch(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        params: Dict[str, Any],
        secrets: Dict[str, Any],
        last_fetch_result: Optional[FetchResult],
        input_parquet_filename: Optional[str],
        output_filename: str,
    ) -> FetchResult:
        """Invoke the module's `fetch_thrift()` in a child and return its result.

        The request's `basedir` is `basedir` re-rooted at "/" relative to the
        chroot root. `basedir / output_filename` is made writable for the
        duration of the child call, and `clear_unowned_edits()` is always
        called on the chroot context afterwards, whether or not the call
        succeeded.

        Raise ModuleError if the module has a bug. In particular, raise
        ModuleExitedError if the module reports a non-empty filename other
        than `output_filename`.
        """
        chroot_root = chroot_context.chroot.root
        module_basedir = Path("/") / basedir.relative_to(chroot_root)

        thrift_params = pydict_to_thrift_json_object(params)
        thrift_secrets = pydict_to_thrift_json_object(secrets)
        if last_fetch_result is None:
            thrift_last_fetch_result = None
        else:
            thrift_last_fetch_result = arrow_fetch_result_to_thrift(
                last_fetch_result
            )

        request = ttypes.FetchRequest(
            basedir=str(module_basedir),
            params=thrift_params,
            secrets=thrift_secrets,
            last_fetch_result=thrift_last_fetch_result,
            input_table_parquet_filename=input_parquet_filename,
            output_filename=output_filename,
        )

        output_path = basedir / output_filename
        try:
            with chroot_context.writable_file(output_path):
                thrift_result = self._run_in_child(
                    chroot_dir=chroot_root,
                    network_config=pyspawner.NetworkConfig(),
                    compiled_module=compiled_module,
                    timeout=self.fetch_timeout,
                    result=ttypes.FetchResult(),
                    function="fetch_thrift",
                    args=[request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        reported_filename = thrift_result.filename
        if reported_filename and reported_filename != output_filename:
            raise ModuleExitedError(
                compiled_module.module_slug,
                0,
                "Module wrote to wrong output file",
            )

        # TODO validate result isn't too large. Truncating makes sense when
        # the result is a dataframe, but fetch results aren't necessarily
        # data frames. Enforcing that logic is the module's job ... but we
        # still need to set a maximum file size.
        return thrift_fetch_result_to_arrow(thrift_result, basedir)
コード例 #5
0
 def test_default_render_returns_fetch_result(self):
     # Functionality used by libraryofcongress
     #
     # The request carries an input table {"A": [1]} and a fetch result that
     # points at a Parquet file holding {"A": [2]} plus one warning. The
     # rendered output is expected to be the fetched data and that warning.
     with ExitStack() as stack:
         input_table = stack.enter_context(
             arrow_table_context({"A": [1]}, dir=self.basedir)
         )
         fetched_file = stack.enter_context(
             parquet_file({"A": [2]}, dir=self.basedir)
         )
         # Only the final path component goes into the fetch result.
         fetched_basename = Path(fetched_file.name).name
         output_name = stack.enter_context(
             tempfile_context(dir=self.basedir)
         ).name

         thrift_input_table = arrow_arrow_table_to_thrift(input_table)
         thrift_tab = ttypes.Tab("tab-1", "Tab 1")
         thrift_warning = ttypes.RenderError(
             ttypes.I18nMessage(
                 "TODO_i18n",
                 {"text": ttypes.I18nArgument(string_value="A warning")},
             ),
             [],
         )
         request = ttypes.RenderRequest(
             str(self.basedir),
             thrift_input_table,
             {},  # params
             thrift_tab,
             ttypes.FetchResult(fetched_basename, [thrift_warning]),
             output_name,
         )

         actual = thrift_render_result_to_arrow(
             module.render_thrift(request), self.basedir
         )
         expected = RenderResult(
             arrow_table({"A": [2]}),
             [RenderError(I18nMessage.TODO_i18n("A warning"))],
         )
         assert_render_result_equals(actual, expected)
コード例 #6
0
 def fetch(
     self,
     compiled_module: CompiledModule,
     basedir: Path,
     params: Params,
     secrets: Dict[str, Any],
     last_fetch_result: Optional[FetchResult],
     input_parquet_filename: str,
     output_filename: str,
 ) -> FetchResult:
     """Invoke the module's `fetch_thrift()` in a child and return its result.

     The child runs inside a chroot directory context that is given
     `basedir` as a provided path and `basedir / output_filename` as a path
     to extract.

     Raise ModuleExitedError if the module reports a non-empty filename
     other than `output_filename`.
     """
     thrift_params = params.to_thrift()
     thrift_secrets = RawParams(secrets).to_thrift()
     if last_fetch_result is None:
         thrift_last_fetch_result = None
     else:
         thrift_last_fetch_result = last_fetch_result.to_thrift()

     request = ttypes.FetchRequest(
         str(basedir),
         thrift_params,
         thrift_secrets,
         thrift_last_fetch_result,
         input_parquet_filename,
         output_filename,
     )

     output_path = basedir / output_filename
     with _chroot_dir_context(
         provide_paths=[basedir], extract_paths=[output_path]
     ) as chroot:
         visible_paths = (
             [basedir] + DATA_PATHS + PARQUET_PATHS + NETWORKING_PATHS
         )
         thrift_result = self._run_in_child(
             chroot=chroot,
             chroot_paths=visible_paths,
             compiled_module=compiled_module,
             timeout=self.fetch_timeout,
             result=ttypes.FetchResult(),
             function="fetch_thrift",
             args=[request],
         )
         # This check stays inside the chroot context, so a failure is raised
         # before the context exits.
         reported_filename = thrift_result.filename
         if reported_filename and reported_filename != output_filename:
             raise ModuleExitedError(0, "Module wrote to wrong output file")

     # TODO validate result isn't too large. Truncating makes sense when the
     # result is a dataframe, but fetch results aren't necessarily data
     # frames. Enforcing that logic is the module's job ... but we still need
     # to set a maximum file size.
     return FetchResult.from_thrift(thrift_result, basedir)
コード例 #7
0
 def test_fetch_result_from_thrift_disallow_non_files(self):
     # A filename that does not exist in basedir must be rejected.
     with self.assertRaisesRegex(ValueError, "must exist"):
         thrift_value = ttypes.FetchResult("missing", [])
         types.FetchResult.from_thrift(thrift_value, self.basedir)
コード例 #8
0
 def test_fetch_result_from_thrift_disallow_hidden_files(self):
     # A dot-prefixed filename must be rejected.
     with self.assertRaisesRegex(ValueError, "must not be hidden"):
         thrift_value = ttypes.FetchResult(".secrets", [])
         this_dir = Path(__file__).parent
         types.FetchResult.from_thrift(thrift_value, this_dir)
コード例 #9
0
 def test_fetch_result_from_thrift_disallow_directories(self):
     # A filename containing directory components must be rejected.
     with self.assertRaisesRegex(ValueError, "must not contain directories"):
         thrift_value = ttypes.FetchResult("/etc/passwd", [])
         this_dir = Path(__file__).parent
         types.FetchResult.from_thrift(thrift_value, this_dir)
コード例 #10
0
def call_fetch(fetch: Callable,
               request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Invoke the module's `fetch()` and turn its return value into thrift.

    Module code may contain errors. This function and `fetch()` should strive
    to raise developer-friendly errors in the case of bugs -- including
    unexpected input.

    `fetch()` always receives the params dict as its only positional
    argument. Each of `secrets`, `settings`, `get_input_dataframe` and
    `output_path` is passed only when `fetch()` declares it as a keyword-only
    argument; all four are passed when `fetch()` accepts `**kwargs`. If the
    call returns a coroutine, it is run to completion with `asyncio.run()`.

    Accepted return values:

    * `(Path, errors)` or a bare `Path`: only the errors are used here.
      Presumably the module wrote `output_path` itself -- TODO confirm.
    * a `list`: coerced to a list of errors.
    * anything else: coerced with `ProcessResult.coerce()`, truncated if too
      big, and written to `output_path` (Parquet when there are columns, an
      empty file otherwise).

    The returned `FetchResult` always names `request.output_filename`.
    """
    # thrift => pandas
    basedir = Path(request.basedir)
    params: Dict[str, Any] = thrift_json_object_to_pydict(request.params)
    output_path = basedir / request.output_filename

    argspec = inspect.getfullargspec(fetch)
    takes_any_kwarg = bool(argspec.varkw)  # True when fetch has **kwargs

    def accepts(name: str) -> bool:
        return takes_any_kwarg or name in argspec.kwonlyargs

    optional_kwargs = {}
    if accepts("secrets"):
        optional_kwargs["secrets"] = thrift_json_object_to_pydict(
            request.secrets)
    if accepts("settings"):
        optional_kwargs["settings"] = settings
    if accepts("get_input_dataframe"):

        async def get_input_dataframe():
            input_filename = request.input_table_parquet_filename
            if input_filename is None:
                return None
            return _parquet_to_pandas(basedir / input_filename)

        optional_kwargs["get_input_dataframe"] = get_input_dataframe
    if accepts("output_path"):
        optional_kwargs["output_path"] = output_path

    result = fetch(params, **optional_kwargs)
    if asyncio.iscoroutine(result):
        result = asyncio.run(result)

    is_path_with_errors = (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[0], Path)
    )
    if is_path_with_errors:
        errors = ptypes.coerce_RenderError_list(result[1])
    elif isinstance(result, Path):
        errors = []
    elif isinstance(result, list):
        errors = ptypes.coerce_RenderError_list(result)
    else:
        process_result = ptypes.ProcessResult.coerce(result)
        process_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            if not process_result.columns:
                output_path.write_bytes(b"")
                errors = process_result.errors
            else:
                intermediate = process_result.to_arrow(arrow_path)
                arrow_data = load_trusted_arrow_file(arrow_path)
                cjwparquet.write(output_path, arrow_data)
                errors = intermediate.errors

    thrift_errors = [arrow_render_error_to_thrift(error) for error in errors]
    return ttypes.FetchResult(
        filename=request.output_filename,
        errors=thrift_errors,
    )
コード例 #11
0
 def test_fetch_result_from_thrift_disallow_hidden_files(self):
     # A dot-prefixed filename must be rejected.
     with self.assertRaisesRegex(ValueError, "must not be hidden"):
         thrift_value = ttypes.FetchResult(".secrets", [])
         types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)
コード例 #12
0
 def test_fetch_result_from_thrift_disallow_directories(self):
     # A filename containing directory components must be rejected.
     expected_message = "must not include directory names"
     with self.assertRaisesRegex(ValueError, expected_message):
         thrift_value = ttypes.FetchResult("/etc/passwd", [])
         types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)