Example #1
    def test_pass_last_fetch_result(self, downloaded_file):
        last_result_path = self.ctx.enter_context(
            tempfile_context(prefix="last-result")
        )

        result_path = self.ctx.enter_context(tempfile_context(prefix="result"))

        self.kernel.fetch.return_value = FetchResult(result_path, [])
        with self.assertLogs("fetcher.fetch", level=logging.INFO):
            fetch.fetch_or_wrap_error(
                self.ctx,
                self.chroot_context,
                self.basedir,
                "mod",
                create_module_zipfile("mod"),
                {},
                {},
                FetchResult(last_result_path, []),
                None,
                self.output_path,
            )
        self.assertEqual(
            self.kernel.fetch.call_args[1]["last_fetch_result"],
            FetchResult(last_result_path, []),
        )
Example #2
    def test_render_xlsx_bad_content(self):
        with tempfile_context("fetch-") as http_path:
            httpfile.write(
                http_path,
                {"url": "http://example.com/hello"},
                "200 OK",
                [("content-type", XLSX_MIME_TYPE)],
                io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8")),
            )
            result = render_arrow(
                ArrowTable(),
                P(has_header=True),
                "tab-x",
                FetchResult(http_path),
                self.output_path,
            )
        self.assertEqual(
            result,
            RenderResult(
                ArrowTable(),
                [
                    RenderError(
                        I18nMessage.TODO_i18n(
                            'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
                        )
                    )
                ],
            ),
        )
Example #3
@contextmanager
def arrow_table_context(
    *columns,
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[Tuple[pathlib.Path, pyarrow.Table]]:
    """Yield a Path and a pa.Table with its contents.

    Two calling conventions:

        with arrow_table_context(make_column("A", [1]), make_column("B", [2])) as (path, table):
            pass

        table = make_table(make_column("A", [1]), make_column("B", [1]))
        with arrow_table_context(table) as (path, _):
            pass
    """
    if len(columns) == 1 and isinstance(columns[0], pyarrow.Table):
        table = columns[0]
    else:
        table = make_table(*columns)

    with tempfile_context(dir=dir) as path:
        writer = pyarrow.RecordBatchFileWriter(path, table.schema)
        writer.write_table(table)
        writer.close()
        yield path, table
Example #4
    def test_render_exception(self):
        module = self.kernel.compile(
            MockPath(
                ["foo.py"],
                b"import os\ndef render(table, params): raise RuntimeError('fail')",
            ),
            "foo",
        )
        with self.assertRaises(ModuleExitedError) as cm:
            with arrow_table_context({"A": [1]}, dir=self.basedir) as input_table:
                input_table.path.chmod(0o644)
                with tempfile_context(
                    prefix="output-", dir=self.basedir
                ) as output_path:
                    self.kernel.render(
                        module,
                        self.basedir,
                        input_table,
                        types.Params({"m": 2.5, "s": "XX"}),
                        types.Tab("tab-1", "Tab 1"),
                        None,
                        output_filename=output_path.name,
                    )

        self.assertEqual(cm.exception.exit_code, 1)  # Python exit code
        self.assertRegex(cm.exception.log, r"\bRuntimeError\b")
        self.assertRegex(cm.exception.log, r"\bfail\b")
        # Regression test: [2019-10-02], the "forkserver_main()->spawn_module()"
        # process would raise _another_ exception while exiting. It would try to
        # close an already-closed socket.
        self.assertNotRegex(cm.exception.log, r"Bad file descriptor")
Example #5
    def test_render_empty_file_fetch_result_is_parquet(self):
        def render(*args, fetch_result):
            return fetch_result.dataframe

        with tempfile_context(dir=self.basedir) as tf:
            result = self._test_render(render, fetch_result=FetchResult(tf))
            assert_render_result_equals(result, RenderResult(arrow_table({})))
Example #6
    def test_arrow_file_does_not_exist(self):
        with tempfile_context() as path:
            path.unlink()
            with self.assertRaisesRegex(
                InvalidArrowFile, "arrow-validate: .*No such file or directory"
            ):
                validate_arrow_file(path)
Example #7
@contextmanager
def open_cached_render_result(crr: CachedRenderResult) -> ContextManager[RenderResult]:
    """
    Yield a RenderResult equivalent to the one passed to `cache_render_result()`.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.

    The returned RenderResult is backed by an mmapped file on disk, so it
    doesn't require much physical RAM.
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        yield RenderResult(
            ArrowTable.from_zero_column_metadata(
                TableMetadata(crr.table_metadata.n_rows, [])),
            crr.errors,
            crr.json,
        )
        return

    with tempfile_context(prefix="cached-render-result") as arrow_path:
        # raise CorruptCacheError (deleting `arrow_path` in the process)
        result = load_cached_render_result(crr, arrow_path)

        yield result
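A caller is expected to catch CorruptCacheError and fall back to re-rendering. Below is a minimal caller sketch, not from the original source: it assumes open_cached_render_result and CorruptCacheError are importable from the module above, and cached_errors_or_none is a hypothetical name.

def cached_errors_or_none(crr):
    """Return the cached RenderResult's errors, or None if the cache is unusable."""
    try:
        with open_cached_render_result(crr) as result:
            # Use `result` only while the context is open: its backing
            # temporary file is deleted on exit.
            return result.errors
    except CorruptCacheError:
        # Corrupt, missing, or stale cache (see the docstring above): the
        # caller should fall back to re-rendering.
        return None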
Example #8
    def test_fetch_happy_path(self):
        module = self.kernel.compile(
            MockPath(
                ["foo.py"],
                textwrap.dedent("""
                    import pandas as pd

                    def fetch(params):
                        return pd.DataFrame({"A": [params["a"]]})
                    """).encode("utf-8"),
            ),
            "foo",
        )

        with tempfile_context(prefix="output-", dir=self.basedir) as output_path:
            result = self.kernel.fetch(
                module,
                self.basedir,
                types.Params({"a": 1}),
                {},
                None,
                None,
                output_filename=output_path.name,
            )

            self.assertEqual(result.errors, [])
            table = pyarrow.parquet.read_pandas(str(result.path))
            self.assertEqual(table.to_pydict(), {"A": [1]})
Example #9
    def test_pydict_zero_rows(self):
        with tempfile_context() as path:
            # ensure at least 1 row group
            parquet.write(
                path,
                pyarrow.table(
                    {
                        "A": pyarrow.array([], type=pyarrow.string()),
                        "B": pyarrow.DictionaryArray.from_arrays(
                            pyarrow.array([], type=pyarrow.int32()),
                            pyarrow.array([], type=pyarrow.string()),
                        ),
                        "C": pyarrow.array([], type=pyarrow.timestamp("ns")),
                        "D": pyarrow.array([], type=pyarrow.float64()),
                    }
                ),
            )
            self.assertEqual(
                parquet.read_pydict(path, range(4), range(0)),
                {"A": [], "B": [], "C": [], "D": []},
            )
Example #10
    def test_render_fetch_error(self):
        fetch_errors = [RenderError(I18nMessage("x", {"y": "z"}))]
        with tempfile_context() as empty_path:
            with self.render(P(), FetchResult(empty_path, fetch_errors)) as result:
                assert_arrow_table_equals(result.table, ArrowTable())
                self.assertEqual(result.errors, fetch_errors)
Example #11
    def test_fetch_nothing(self):
        with tempfile_context(prefix="output-") as output_path:
            result = fetch_arrow(P(file=None), {}, None, None, output_path)
            self.assertEqual(
                result.errors,
                [RenderError(I18nMessage.TODO_i18n("Please choose a file"))],
            )
Example #12
    def test_fetch_get_stored_dataframe_empty_file_is_empty_table(self):
        async def fetch(params, *, get_stored_dataframe):
            df = await get_stored_dataframe()
            assert_frame_equal(df, pd.DataFrame())

        with tempfile_context(dir=self.basedir) as parquet_path:
            self._test_fetch(fetch, last_fetch_result=FetchResult(parquet_path, []))
Example #13
    def test_fetch_get_input_dataframe_empty_file_is_empty_table(self):
        async def fetch(params, *, get_input_dataframe):
            df = await get_input_dataframe()
            assert_frame_equal(df, pd.DataFrame())

        with tempfile_context(dir=self.basedir) as input_table_parquet_path:
            self._test_fetch(fetch, input_table_parquet_path=input_table_parquet_path)
Example #14
    def test_default_render_returns_fetch_result(self):
        # Functionality used by libraryofcongress
        with ExitStack() as ctx:
            input_arrow_table = ctx.enter_context(
                arrow_table_context({"A": [1]}, dir=self.basedir)
            )
            parquet_filename = Path(
                ctx.enter_context(parquet_file({"A": [2]}, dir=self.basedir)).name
            ).name
            out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
            thrift_result = module.render_thrift(
                ttypes.RenderRequest(
                    str(self.basedir),
                    input_arrow_table.to_thrift(),
                    Params({}).to_thrift(),
                    ttypes.Tab("tab-1", "Tab 1"),
                    ttypes.FetchResult(
                        parquet_filename,
                        [RenderError(I18nMessage.TODO_i18n("A warning")).to_thrift()],
                    ),
                    out_filename,
                )
            )
            result = RenderResult.from_thrift(thrift_result, self.basedir)
            assert_render_result_equals(
                result,
                RenderResult(
                    arrow_table({"A": [2]}),
                    [RenderError(I18nMessage.TODO_i18n("A warning"))],
                ),
            )
Example #15
def _DEPRECATED_overwrite_to_fix_arrow_table_schema(
    path: Path, fallback_schema: pa.Schema
) -> None:
    if not path.stat().st_size:
        return

    table = load_trusted_arrow_file(path)

    untyped_schema = table.schema
    fields = [
        __DEPRECATED_fix_field(
            untyped_schema.field(i),
            (
                None
                if fallback_schema.get_field_index(name) == -1
                else fallback_schema.field(fallback_schema.get_field_index(name))
            ),
        )
        for i, name in enumerate(untyped_schema.names)
    ]
    schema = pa.schema(fields)

    # Overwrite with new data
    #
    # We don't short-circuit by comparing schemas: two pa.Schema values
    # with different number formats evaluate as equal.
    #
    # We write a separate file to /var/tmp and then copy it: our sandbox
    # won't let us `rename(2)` in `path`'s directory.
    with tempfile_context(dir="/var/tmp") as rewrite_path:
        with pa.ipc.RecordBatchFileWriter(rewrite_path, schema) as writer:
            writer.write_table(pa.table(table.columns, schema=schema))
        shutil.copyfile(rewrite_path, path)
Example #16
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    """Render using `cjwkernel.types` data types.

    The result will be encoded as a Parquet file.

    Module authors are encouraged to replace this function, because the
    `fetch()` signature deals in dataframes instead of in raw data.
    """
    pandas_result: Union[ptypes.ProcessResult, types.FetchResult] = fetch_pandas(
        params=__arrow_param_to_pandas_param(params),
        secrets=secrets,
        last_fetch_result=last_fetch_result,
        input_table_parquet_path=input_table_parquet_path,
        output_path=output_path,
    )
    if isinstance(pandas_result, ptypes.ProcessResult):
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            hacky_result = pandas_result.to_arrow(arrow_path)
            # Use the Arrow data while `arrow_path` still exists: the table
            # is backed by that temporary file.
            if hacky_result.table.path:
                cjwparquet.write(output_path, hacky_result.table.table)
            else:
                output_path.write_bytes(b"")
        return types.FetchResult(output_path, hacky_result.errors)
    else:  # it's already a types.FetchResult
        return pandas_result
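As the docstring says, module authors may replace fetch_arrow() to skip the dataframe layer entirely. A minimal sketch of such an override, reusing the signature above; download_raw is a hypothetical helper, and types refers to cjwkernel.types as in the example:

from pathlib import Path
from typing import Any, Dict, Optional

def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    # Write raw bytes straight to output_path -- no DataFrame round trip,
    # no truncation, no Parquet encoding.
    output_path.write_bytes(download_raw(params["url"]))  # hypothetical helper
    return types.FetchResult(output_path, [])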
Example #17
    def test_fetch_truncate(self):
        def fetch(params):
            return pd.DataFrame({"A": [1, 2, 3]})

        with tempfile_context(dir=self.basedir) as outfile:
            result = self._test_fetch(fetch, output_filename=outfile.name)
            self.assertEqual(
                result,
                FetchResult(
                    outfile,
                    errors=[
                        FetchError(
                            I18nMessage(
                                "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
                                {"old_number": 3, "new_number": 2},
                                None,
                            )
                        )
                    ],
                ),
            )
            assert_arrow_table_equals(
                read_parquet_as_arrow(
                    outfile, [Column("A", ColumnType.Number("{:,}"))]
                ),
                make_table(make_column("A", [1, 2])),
            )
Example #18
    async def test_fetch_http_404(self):
        self.responses["bad-url"] = web.HTTPNotFound()
        with tempfile_context("output-") as output_path:
            url = self.build_url("/bad-url")
            result = await fetch(P(url=url), output_path=output_path)
            self.assertEqual(result, "Error from server: 404 Not Found")
            self.assertEqual(output_path.read_bytes(), b"")
Example #19
    def _test_fetch(
        self,
        fetch_fn,
        *,
        params={},
        secrets={},
        last_fetch_result=None,
        input_table_parquet_path=None,
        output_filename=None,
    ):
        with ExitStack() as ctx:
            ctx.enter_context(patch.object(module, "fetch", fetch_fn))
            if output_filename is None:
                # Make a temporary output filename -- this will make `fetch()`
                # complete, but callers won't be able to see the data it
                # outputs because we'll delete the file too soon.
                output_filename = ctx.enter_context(
                    tempfile_context(dir=self.basedir)
                ).name
            thrift_result = module.fetch_thrift(
                ttypes.FetchRequest(
                    basedir=str(self.basedir),
                    params=arrow_params_to_thrift(Params(params)),
                    secrets=arrow_raw_params_to_thrift(RawParams(secrets)),
                    last_fetch_result=(
                        arrow_fetch_result_to_thrift(last_fetch_result)
                        if last_fetch_result is not None
                        else None
                    ),
                    input_table_parquet_filename=(
                        input_table_parquet_path.name
                        if input_table_parquet_path is not None
                        else None
                    ),
                    output_filename=output_filename,
                )
            )
            return thrift_fetch_result_to_arrow(thrift_result, self.basedir)
Example #20
def parse_xls_bytesio(bytesio: io.BytesIO) -> Union[pd.DataFrame, str]:
    """
    Build a pd.DataFrame from xlsx bytes, or return an error string on parse failure.

    Peculiarities:

    * Error can be xlrd.XLRDError or pandas error
    * We read the entire file contents into memory before parsing

    TODO change signature to require `Path`, not `io.BytesIO`. The way things
    are, we're copying tempfiles gratuitously.
    """
    # Use xlrd.open_workbook(): if we call pandas.read_excel(bytesio) it
    # will read the entire file into RAM.
    with tempfile_context() as path:
        with path.open("wb") as tmp:
            shutil.copyfileobj(bytesio, tmp)
        # dtype='category' crashes as of 2018-09-11
        try:
            workbook = xlrd.open_workbook(str(path))
            data = pd.read_excel(workbook, engine="xlrd", dtype=object)
        except xlrd.XLRDError as err:
            return "Error reading Excel file: %s" % str(err)

    data.columns = [str(c) for c in data.columns]
    autocast_dtypes_in_place(data)
    return data
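Note the contract: on an xlrd error the function returns an error string instead of raising, so callers must type-check the result. A small usage sketch (the input bytes are illustrative):

import io

import pandas as pd

result = parse_xls_bytesio(io.BytesIO(b"not an .xls file"))
if isinstance(result, pd.DataFrame):
    print(result.head())
else:
    print(result)  # e.g. 'Error reading Excel file: ...'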
Example #21
    def _test_render(
        self,
        render_fn,
        arrow_table_dict={},
        arrow_table=None,
        params={},
        tab=Tab("tab-1", "Tab 1"),
        fetch_result=None,
        output_filename=None,
    ):
        with ExitStack() as ctx:
            if arrow_table is None:
                arrow_table = ctx.enter_context(
                    arrow_table_context(arrow_table_dict, dir=self.basedir)
                )
            ctx.enter_context(patch.object(module, "render", render_fn))
            out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
            thrift_result = module.render_thrift(
                ttypes.RenderRequest(
                    str(self.basedir),
                    arrow_arrow_table_to_thrift(arrow_table),
                    arrow_params_to_thrift(Params(params)),
                    arrow_tab_to_thrift(tab),
                    arrow_fetch_result_to_thrift(fetch_result)
                    if fetch_result is not None
                    else None,
                    out_filename,
                )
            )
            return thrift_render_result_to_arrow(thrift_result, self.basedir)
Example #22
    def test_SECURITY_no_new_privs(self):
        # The user cannot use a setuid program to become root
        assert os.getuid() == 0  # so our test suite can actually chmod
        # Build the tempfile in the root filesystem, where there's no
        # "nosetuid" mount option
        with tempfile_context(prefix="print-id", suffix=".bin", dir="/") as prog:
            # We can't test with a _script_: we need to test with a _binary_.
            # (Scripts invoke the interpreter, which is not setuid.)
            #
            # The "id" binary is perfect: it prints all three uids and gids if
            # they differ from one another.
            shutil.copy("/usr/bin/id", prog)
            os.chown(str(prog), 0, 0)  # make doubly sure root owns it
            os.chmod(str(prog), 0o755 | stat.S_ISUID | stat.S_ISGID)
            exitcode, stdout, stderr = self._spawn_and_communicate(
                r"""
                import os
                os.execv("%s", ["%s"])
                """ % (str(prog), str(prog)),
                # XXX SECURITY [2019-10-11] This test should fail if we comment
                # out "no_new_privs". Why doesn't it? (It looks like there's
                # some other security layer we don't know of....)
                skip_sandbox_except=frozenset(["setuid", "no_new_privs"]),
            )
            self.assertEqual(exitcode, 0)
            self.assertEqual(stdout, b"uid=1000 gid=1000 groups=1000\n")
            self.assertEqual(stderr, b"")
Example #23
    @contextmanager
    def fetch(
        self, url: str = "", has_header: bool = True
    ) -> ContextManager[FetchResult]:
        with tempfile_context(prefix="output-") as output_path:
            with self.assertLogs(level=logging.DEBUG):
                yield fetch_arrow(
                    {"url": url, "has_header": has_header}, {}, None, None, output_path
                )
Example #24
    def test_fetch_return_error(self):
        async def fetch(params):
            return "bad things"

        with tempfile_context(dir=self.basedir) as outfile:
            result = self._test_fetch(fetch, output_filename=outfile.name)
            self.assertEqual(result.errors, [FetchError(TODO_i18n("bad things"))])
            self.assertEqual(outfile.read_bytes(), b"")
Example #25
    def test_fetch_return_tuple_path_and_error(self):
        with tempfile_context(dir=self.basedir) as outfile:

            async def fetch(params):
                outfile.write_text("xyz")
                return outfile, "foo"

            result = self._test_fetch(fetch, output_filename=outfile.name)
            self.assertEqual(result.errors, [RenderError(I18nMessage.TODO_i18n("foo"))])
Example #26
    def test_render_with_non_parquet_fetch_result(self):
        def render(*args, fetch_result):
            return pd.DataFrame({"A": [fetch_result.path.read_text()]})

        with tempfile_context(dir=self.basedir) as tf:
            tf.write_bytes(b"abcd")
            result = self._test_render(render, fetch_result=FetchResult(tf))
            assert_render_result_equals(
                result, RenderResult(arrow_table({"A": ["abcd"]}))
            )
Example #27
    def test_wf_module_duplicate(self):
        workflow = Workflow.create_and_init()
        step1 = workflow.tabs.first().wf_modules.create(order=0, slug="step-1")

        # store data to test that it is duplicated
        with tempfile_context() as path1:
            path1.write_bytes(b"12345")
            create_stored_object(workflow.id, step1.id, path1)
        with tempfile_context() as path2:
            path2.write_bytes(b"23456")
            so2 = create_stored_object(workflow.id, step1.id, path2)
        step1.secrets = {"do not copy": {"name": "evil", "secret": "evil"}}
        step1.stored_data_version = so2.stored_at
        step1.save(update_fields=["stored_data_version"])

        # duplicate into another workflow, as we would do when duplicating a workflow
        workflow2 = Workflow.create_and_init()
        tab2 = workflow2.tabs.first()
        step1d = step1.duplicate_into_new_workflow(tab2)
        step1d.refresh_from_db()  # test what we actually have in the db

        self.assertEqual(step1d.slug, "step-1")
        self.assertEqual(step1d.workflow, workflow2)
        self.assertEqual(step1d.module_id_name, step1.module_id_name)
        self.assertEqual(step1d.order, step1.order)
        self.assertEqual(step1d.notes, step1.notes)
        self.assertEqual(step1d.last_update_check, step1.last_update_check)
        self.assertEqual(step1d.is_collapsed, step1.is_collapsed)
        self.assertEqual(step1d.params, step1.params)
        self.assertEqual(step1d.secrets, {})

        # Stored data should contain a clone of content only, not complete version history
        self.assertEqual(step1d.stored_objects.count(), 1)
        self.assertEqual(step1d.stored_data_version, step1.stored_data_version)
        so2d = step1d.stored_objects.first()
        # The StoredObject was copied byte for byte into a different file
        self.assertNotEqual(so2d.key, so2.key)
        self.assertEqual(
            minio.get_object_with_data(minio.StoredObjectsBucket, so2d.key)["Body"],
            minio.get_object_with_data(minio.StoredObjectsBucket, so2.key)["Body"],
        )
Example #28
    def test_render_empty_file_fetch_result_is_parquet(self):
        def render(table, params, *, fetch_result):
            assert_frame_equal(fetch_result.dataframe, pd.DataFrame({}))
            return fetch_result.dataframe

        with ModuleTestEnv(render=render) as env:
            with tempfile_context(dir=env.basedir) as tf:
                outcome = env.call_render(
                    make_table(), {}, fetch_result=FetchResult(tf)
                )
                self.assertEqual(outcome.read_table(), make_table())
Example #29
    def test_fetch_get_stored_dataframe_unhandled_parquet_is_error(self):
        # Why an error? So module authors can handle it. They _created_ the
        # problem, after all. Let's help them detect it.
        async def fetch(params, *, get_stored_dataframe):
            with self.assertRaises(pa.ArrowIOError):
                await get_stored_dataframe()

        with tempfile_context(dir=self.basedir) as parquet_path:
            parquet_path.write_bytes(b"12345")
            self._test_fetch(fetch, last_fetch_result=FetchResult(parquet_path, []))
Example #30
def _load_fetch_result(
    wf_module: WfModule, basedir: Path, exit_stack: contextlib.ExitStack
) -> Optional[FetchResult]:
    """
    Download user-selected StoredObject to `basedir`, so render() can read it.

    Edge cases:

    Create no file (and return `None`) if the user did not select a
    StoredObject, or if the selected StoredObject does not point to a file
    on minio.

    The caller should treat a `path` that was left untouched as an empty
    FetchResult. The FetchResult may still have an error.
    """
    try:
        stored_object = wf_module.stored_objects.get(
            stored_at=wf_module.stored_data_version
        )
    except StoredObject.DoesNotExist:
        return None
    if not stored_object.bucket or not stored_object.key:
        return None

    with contextlib.ExitStack() as inner_stack:
        path = inner_stack.enter_context(
            tempfile_context(prefix="fetch-result-", dir=basedir)
        )

        try:
            minio.download(stored_object.bucket, stored_object.key, path)
            # Download succeeded, so we no longer want to delete `path`
            # right _now_ ("now" means, "in inner_stack.close()"). Instead,
            # transfer ownership of `path` to exit_stack.
            exit_stack.callback(inner_stack.pop_all().close)
        except FileNotFoundError:
            # A few StoredObjects -- very old ones with size=0 -- are
            # *intentionally* not in minio. It turns out modules from that era
            # treated empty-file and None as identical. The _modules_ must
            # preserve that logic for backwards compatibility; so it's safe to
            # return `None` here.
            #
            # Other than that, if the file doesn't exist it's a race: either
            # the fetch result is too _new_ (it's in the database but its file
            # hasn't been written yet) or the fetch result is half-deleted (its
            # file was deleted and it's still in the database). In either case,
            # pretend the fetch result does not exist in the database -- i.e.,
            # return `None`.
            return None

    if wf_module.fetch_error:
        errors = [RenderError(I18nMessage.TODO_i18n(wf_module.fetch_error))]
    else:
        errors = []
    return FetchResult(path, errors)
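The inner_stack.pop_all() dance above is a general contextlib idiom: clean up on failure, but transfer ownership of the resource to the caller's ExitStack on success. A self-contained sketch of the same pattern, with illustrative names only:

import contextlib
import os
import tempfile
from pathlib import Path

def create_owned_tempfile(exit_stack: contextlib.ExitStack) -> Path:
    """Create a temp file whose deletion is deferred to `exit_stack`."""
    with contextlib.ExitStack() as inner:
        fd, name = tempfile.mkstemp(prefix="fetch-result-")
        os.close(fd)
        path = Path(name)
        inner.callback(path.unlink)  # delete the file if anything below raises
        path.write_bytes(b"downloaded data")  # stand-in for minio.download()
        # Success: hand the cleanup callback to the caller's exit_stack
        # instead of running it when `inner` closes.
        exit_stack.callback(inner.pop_all().close)
    return path

# The file lives exactly as long as the caller's stack is open.
with contextlib.ExitStack() as stack:
    path = create_owned_tempfile(stack)
    assert path.exists()
assert not path.exists()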