Example #1
def _with_downloaded_cached_render_result(
        ctx: contextlib.ExitStack, maybe_crr: Optional[CachedRenderResult],
        dir: Path) -> Tuple[Optional[Path], TableMetadata]:
    if maybe_crr is None:
        return (None, TableMetadata())
    else:
        try:
            parquet_path = ctx.enter_context(
                rendercache.downloaded_parquet_file(maybe_crr, dir=dir))
            return (parquet_path, maybe_crr.table_metadata)
        except rendercache.CorruptCacheError:
            # This is probably a race. That's okay: treat a missing
            # cache as "there is no input". (This is user-visible
            # but likely uncommon.)
            return (None, TableMetadata())
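A minimal usage sketch (hypothetical caller; `upstream_crr` and the scratch
directory are assumptions, not part of the snippet above). The ExitStack lets
the caller keep the downloaded file alive for as long as it needs it:

with contextlib.ExitStack() as ctx:
    parquet_path, metadata = _with_downloaded_cached_render_result(
        ctx, upstream_crr, dir=Path("/tmp/scratch")
    )
    if parquet_path is None:
        pass  # no (valid) cached input -- proceed as though there is no input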
Example #2
@contextlib.contextmanager
def open_cached_render_result(
        crr: CachedRenderResult) -> ContextManager[RenderResult]:
    """
    Yield a RenderResult equivalent to the one passed to `cache_render_result()`.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.

    The returned RenderResult is backed by an mmapped file on disk, so it
    doesn't require much physical RAM.
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        yield RenderResult(
            ArrowTable.from_zero_column_metadata(
                TableMetadata(crr.table_metadata.n_rows, [])),
            crr.errors,
            crr.json,
        )
        return

    with tempfile_context(prefix="cached-render-result") as arrow_path:
        # load_cached_render_result() may raise CorruptCacheError,
        # deleting `arrow_path` in the process
        result = load_cached_render_result(crr, arrow_path)

        yield result
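A caller might consume it like any generator-based context manager (a sketch,
assuming `crr` is a fresh CachedRenderResult and `CorruptCacheError` is in
scope):

try:
    with open_cached_render_result(crr) as result:
        table = result.table  # backed by an mmapped file; cheap in RAM
except CorruptCacheError:
    pass  # missing/corrupt/stale cache -- caller decides whether to re-render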
Example #3
    def test_text_zero_chunks_valid(self):
        validate(
            pyarrow.Table.from_batches(
                [], pyarrow.schema([("A", pyarrow.string())])
            ),
            TableMetadata(0, [Text("A")]),
        )

    def test_clean_multicolumn_sort_in_table_order(self):
        input_shape = TableMetadata(
            3,
            [Column("B", ColumnType.Number()), Column("A", ColumnType.Number())],
        )
        result = clean_value(ParamDType.Multicolumn(), ["A", "B"], input_shape)
        self.assertEqual(result, ["B", "A"])
Example #5
@contextlib.contextmanager
def arrow_table_context(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    columns: Optional[List[Column]] = None,
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[ArrowTable]:
    """
    Yield an ArrowTable (whose `.path` is a file).

    Metadata is inferred. Number columns have format `{:,}`.
    """
    if isinstance(table, dict):
        table = pyarrow.table(table)

    if columns is None:
        columns = [
            _arrow_column_to_column(name, col)
            for name, col in zip(table.column_names, table.columns)
        ]
    metadata = TableMetadata(table.num_rows, columns)

    if metadata.columns:
        with arrow_file(table, dir=dir) as path:
            yield ArrowTable(path, table, metadata)
    else:
        yield ArrowTable(None, None, metadata)
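Usage sketch, exercising both branches above (the zero-column case yields an
ArrowTable with no backing file):

with arrow_table_context({"A": [1, 2, 3]}) as arrow_table:
    assert arrow_table.metadata.n_rows == 3
    assert arrow_table.path is not None  # one column => written to disk

with arrow_table_context({}) as empty_table:
    assert empty_table.path is None  # zero columns => nothing written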
    def test_clean_column_happy_path(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        self.assertEqual(
            clean_value(
                ParamDType.Column(column_types=frozenset({"number"})),
                "A",
                input_shape,
            ),
            "A",
        )
Example #7
    def test_metadata_comes_from_db_columns(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Datetime()),
            Column("C", ColumnType.Text()),
        ]
        result = RenderResult(
            arrow_table(
                {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]},
                columns=columns,
            )
        )
        cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
        # Delete from disk entirely, to prove we did not read.
        minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
        cached_result = fresh_wf_module.cached_render_result

        self.assertEqual(cached_result.table_metadata,
                         TableMetadata(1, columns))
Example #8
    def test_input_crr_corrupt_cache_error_is_none(
        self, downloaded_parquet_file, load_module
    ):
        load_module.return_value.migrate_params.return_value = {}
        load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
        downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
            "file not found"
        )
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            WfModule(),
            MockModuleVersion(),
            {},
            None,
            input_crr,
            self.output_path,
        )
        # fetch is still called, with `None` as argument.
        self.assertIsNone(
            load_module.return_value.fetch.call_args[1]["input_parquet_filename"]
        )
Example #9
    def test_input_crr(self, downloaded_parquet_file, clean_value, load_module):
        load_module.return_value.migrate_params.return_value = {}
        load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
        clean_value.return_value = {}
        downloaded_parquet_file.return_value = Path("/path/to/x.parquet")
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            WfModule(),
            MockModuleVersion(),
            {},
            None,
            input_crr,
            self.output_path,
        )
        # Passed file is downloaded from rendercache
        downloaded_parquet_file.assert_called_with(input_crr, dir=self.basedir)
        self.assertEqual(
            load_module.return_value.fetch.call_args[1]["input_parquet_filename"],
            "x.parquet",
        )
        # clean_value() is called with input metadata from CachedRenderResult
        clean_value.assert_called()
        self.assertEqual(clean_value.call_args[0][2], input_metadata)
Example #10
    def test_metadata_does_not_require_file_read(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
            Column("D", ColumnType.Date("month")),
        ]
        with arrow_table_context(
            make_column("A", [1], format="{:,.2f}"),
            make_column("B", [datetime.datetime(2021, 4, 13)]),
            make_column("C", ["c"]),
            make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
        ) as (path, table):
            result = LoadedRenderResult(
                path=path, table=table, columns=columns, errors=[], json={}
            )
            cache_render_result(self.workflow, self.step, 1, result)
        # Delete from disk entirely, to prove we did not read.
        s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_step = Step.objects.get(id=self.step.id)
        cached_result = fresh_step.cached_render_result

        self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
Example #11
def infer_table_metadata(table: pyarrow.Table) -> TableMetadata:
    return TableMetadata(
        n_rows=table.num_rows,
        columns=[
            Column(name, _infer_output_column_type(column))
            for name, column in zip(table.column_names, table.columns)
        ],
    )
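For example (a sketch; it assumes `Column` exposes the `name` passed to its
constructor, and that `_infer_output_column_type` handles int and string
columns):

table = pyarrow.table({"A": [1, 2], "B": ["x", "y"]})
metadata = infer_table_metadata(table)
assert metadata.n_rows == 2
assert [column.name for column in metadata.columns] == ["A", "B"]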
Example #12
    def test_duplicate_column_name(self):
        with self.assertRaises(DuplicateColumnName):
            validate(
                pyarrow.Table.from_arrays(
                    [pyarrow.array(["a"]), pyarrow.array(["b"])], ["A", "A"]
                ),
                TableMetadata(1, [Text("A"), Text("A")]),
            )
Example #13
    def test_clean_multicolumn_valid(self):
        input_shape = TableMetadata(
            3,
            [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())],
        )
        result = clean_value(ParamSchema.Multicolumn(), ["A", "B"], input_shape)
        self.assertEqual(result, ["A", "B"])

    def test_clean_multicolumn_missing_is_removed(self):
        input_shape = TableMetadata(
            3,
            [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())],
        )
        result = clean_value(ParamDType.Multicolumn(), ["A", "X", "B"], input_shape)
        self.assertEqual(result, ["A", "B"])
Example #15
    def test_column_name_not_utf8(self):
        bad_str = "\ud800x"  # lone surrogate -- not valid Unicode text
        bad_bytes = b"\xed\xa0\x80x"  # bad_str surrogate-encoded: invalid UTF-8

        with self.assertRaises(ColumnNameIsInvalidUtf8):
            validate(
                pyarrow.Table.from_arrays([pyarrow.array(["a"])], [bad_bytes]),
                TableMetadata(1, [Text(bad_str)]),
            )
Example #16
    def test_column_name_mismatch(self):
        with self.assertRaises(WrongColumnName):
            validate(
                pyarrow.table({"A": ["a"], "B": ["b"]}),
                TableMetadata(1, [Text("A"), Text("B2")]),
            )
Example #17
    def test_text_dictionary_zero_chunks_is_valid(self):
        validate(
            pyarrow.Table.from_batches(
                [],
                pyarrow.schema(
                    [("A", pyarrow.dictionary(pyarrow.int32(), pyarrow.string()))]
                ),
            ),
            TableMetadata(0, [Text("A")]),
        )

    def test_clean_normal_dict(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        schema = ParamDType.Dict(
            {"str": ParamDType.String(), "int": ParamDType.Integer()}
        )
        value = {"str": "foo", "int": 3}
        expected = dict(value)  # no-op
        result = clean_value(schema, value, input_shape)
        self.assertEqual(result, expected)
Example #19
    def test_dictionary_column_has_unused_entry(self):
        with self.assertRaises(DictionaryColumnHasUnusedEntry):
            validate(
                pyarrow.table(
                    {
                        # slice "z" away, leaving it an unused dictionary entry
                        "A": pyarrow.array(
                            ["x", None, "y", "y", "z"]
                        ).dictionary_encode()[0:4]
                    }
                ),
                TableMetadata(4, [Text("A")]),
            )
Example #20
    def test_table_not_one_batch(self):
        with self.assertRaises(TableHasTooManyRecordBatches):
            validate(
                pyarrow.Table.from_batches(
                    [
                        pyarrow.RecordBatch.from_arrays([pyarrow.array(["a"])], ["A"]),
                        pyarrow.RecordBatch.from_arrays([pyarrow.array(["b"])], ["A"]),
                    ]
                ),
                TableMetadata(2, [Text("A")]),
            )
Example #21
    def test_column_datetime_must_be_ns_resolution(self):
        # [2019-09-17] Pandas only supports datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        with self.assertRaises(DatetimeUnitNotAllowed):
            validate(
                pyarrow.table(
                    {
                        "A": pyarrow.array(
                            [5298375234], type=pyarrow.timestamp("us", tz=None)
                        )
                    }
                ),
                TableMetadata(1, [Datetime("A")]),
            )
Example #22
    def test_column_datetime_should_be_tz_naive(self):
        with self.assertRaises(DatetimeTimezoneNotAllowed):
            validate(
                pyarrow.table(
                    {
                        "A": pyarrow.array(
                            [5298375234123],
                            type=pyarrow.timestamp("ns", "America/New_York"),
                        )
                    }
                ),
                TableMetadata(1, [Datetime("A")]),
            )
    def test_clean_column_prompting_error_convert_to_number(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Text())])
        with self.assertRaises(PromptingError) as cm:
            clean_value(ParamDType.Column(column_types=frozenset({"number"})),
                        "A", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A"], "text",
                                               frozenset({"number"}))
            ],
        )
    def test_clean_column_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # Consider Regex. We probably want to pass the module a text Series
        # _separately_ from the input DataFrame. That way Regex can output
        # a new Text column but preserve its input column's data type.
        #
        # ... but for now: prompt for a Quick Fix.
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        with self.assertRaises(PromptingError) as cm:
            clean_value(ParamDType.Column(column_types=frozenset({"text"})),
                        "A", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [PromptingError.WrongColumnType(["A"], None, frozenset({"text"}))],
        )
Example #25
    def test_text_invalid_utf8(self):
        # Let's construct a particularly tricky case: two strings that are
        # invalid on their own but are valid when concatenated. (In the buffer
        # they're concatenated, so the buffer bytes are valid utf-8 even though
        # the values aren't.)
        #
        # We'll also throw in a NULL, so we don't get tempted to ignore nulls
        # when we optimize this validation.
        poop_bytes = "💩".encode("utf-8")
        binary_array = pyarrow.array([None, poop_bytes[:2], poop_bytes[2:]])
        _, offsets, data = binary_array.buffers()
        with self.assertRaises(ColumnDataIsInvalidUtf8):
            validate(
                pyarrow.table(
                    {"A": pyarrow.StringArray.from_buffers(3, offsets, data)}
                ),
                TableMetadata(3, [Text("A")]),
            )
    def test_input_crr(self, downloaded_parquet_file, clean_value):
        def do_fetch(
            compiled_module,
            chroot_context,
            basedir,
            params,
            secrets,
            last_fetch_result,
            input_parquet_filename,
            output_filename,
        ):
            shutil.copy(basedir / input_parquet_filename,
                        basedir / output_filename)
            return FetchResult(basedir / output_filename)

        self.kernel.fetch.side_effect = do_fetch
        clean_value.return_value = {}

        with tempfile_context(dir=self.basedir,
                              suffix=".parquet") as parquet_path:
            parquet_path.write_bytes(b"abc123")
            downloaded_parquet_file.return_value = parquet_path

            input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
            input_crr = CachedRenderResult(1, 2, 3, "ok", [], {},
                                           input_metadata)
            with self.assertLogs("fetcher.fetch", level=logging.INFO):
                result = fetch.fetch_or_wrap_error(
                    self.ctx,
                    self.chroot_context,
                    self.basedir,
                    "mod",
                    create_module_zipfile("mod"),
                    {},
                    {},
                    None,
                    input_crr,
                    self.output_path,
                )

            # Passed file is downloaded from rendercache
            self.assertEqual(result.path.read_bytes(), b"abc123")
            # clean_value() is called with input metadata from CachedRenderResult
            clean_value.assert_called()
            self.assertEqual(clean_value.call_args[0][2], input_metadata)
    def _build_cached_render_result_fresh_or_not(self) -> Optional[CachedRenderResult]:
        """
        Build a CachedRenderResult with this WfModule's rendered output.

        If the output is stale, return it anyway. (The return value's .delta_id
        will not match this WfModule's .delta_id.)

        This does not read the dataframe from disk. If you want a "snapshot in
        time" of the `render()` output, you need a lock, like this:

            # Lock the workflow, making sure we don't overwrite data
            with workflow.cooperative_lock():
                wf_module.refresh_from_db()
                # Read from disk
                with cjwstate.rendercache.io.open_cached_render_result(
                    wf_module.get_stale_cached_render_result()
                ) as result:
        """
        if self.cached_render_result_delta_id is None:
            return None

        delta_id = self.cached_render_result_delta_id
        status = self.cached_render_result_status
        columns = self.cached_render_result_columns
        errors = self.cached_render_result_errors
        nrows = self.cached_render_result_nrows

        # cached_render_result_json is sometimes a memoryview
        json_bytes = bytes(self.cached_render_result_json)
        if json_bytes:
            json_dict = json.loads(json_bytes)
        else:
            json_dict = {}

        return CachedRenderResult(
            workflow_id=self.workflow_id,
            wf_module_id=self.id,
            delta_id=delta_id,
            status=status,
            errors=errors,
            json=json_dict,
            table_metadata=TableMetadata(nrows, columns),
        )
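A hypothetical staleness check built on that contract (the name of the field
holding the step's current delta id, `last_relevant_delta_id`, is an
assumption):

crr = wf_module._build_cached_render_result_fresh_or_not()
if crr is None:
    pass  # never rendered
elif crr.delta_id != wf_module.last_relevant_delta_id:
    pass  # stale: rendered for an older delta; re-render before trusting it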
Example #28
    def test_input_crr_corrupt_cache_error_is_none(self, downloaded_parquet_file):
        self.kernel.fetch.return_value = FetchResult(self.output_path, [])
        downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
            "file not found"
        )
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        with self.assertLogs("fetcher.fetch", level=logging.INFO):
            fetch.fetch_or_wrap_error(
                self.ctx,
                self.chroot_context,
                self.basedir,
                "mod",
                create_module_zipfile("mod"),
                {},
                {},
                None,
                input_crr,
                self.output_path,
            )
        # fetch is still called, with `None` as argument.
        self.assertIsNone(self.kernel.fetch.call_args[1]["input_parquet_filename"])
    def test_clean_multicolumn_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # ... but for now: prompt for a Quick Fix.
        input_shape = TableMetadata(
            3,
            [
                Column("A", ColumnType.Number()),
                Column("B", ColumnType.Timestamp()),
                Column("C", ColumnType.Text()),
            ],
        )
        with self.assertRaises(PromptingError) as cm:
            schema = ParamDType.Multicolumn(column_types=frozenset({"text"}))
            clean_value(schema, "A,B", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A", "B"], None,
                                               frozenset({"text"}))
            ],
        )
    def test_dict_prompting_error(self):
        input_shape = TableMetadata(
            3, [Column("A", ColumnType.Text()), Column("B", ColumnType.Text())]
        )
        schema = ParamDType.Dict(
            {
                "col1": ParamDType.Column(column_types=frozenset({"number"})),
                "col2": ParamDType.Column(column_types=frozenset({"timestamp"})),
            }
        )
        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, {"col1": "A", "col2": "B"}, input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A"], "text",
                                               frozenset({"number"})),
                PromptingError.WrongColumnType(["B"], "text",
                                               frozenset({"timestamp"})),
            ],
        )