Example #1
def _with_downloaded_cached_render_result(
        ctx: contextlib.ExitStack, maybe_crr: Optional[CachedRenderResult],
        dir: Path) -> Tuple[Optional[Path], TableMetadata]:
    if maybe_crr is None:
        return (None, TableMetadata())
    else:
        try:
            parquet_path = ctx.enter_context(
                rendercache.downloaded_parquet_file(maybe_crr, dir=dir))
            return (parquet_path, maybe_crr.table_metadata)
        except rendercache.CorruptCacheError:
            # This is probably a race. That's okay: treat a missing
            # cache as "there is no input". (This is user-visible
            # but likely uncommon.)
            return (None, TableMetadata())
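A minimal usage sketch (hypothetical caller; `upstream_crr` and the scratch
directory are assumptions, not part of the snippet above). The ExitStack lets
the caller keep the downloaded file alive for as long as it needs it:

with contextlib.ExitStack() as ctx:
    parquet_path, metadata = _with_downloaded_cached_render_result(
        ctx, upstream_crr, dir=Path("/tmp/scratch")
    )
    if parquet_path is None:
        pass  # no (valid) cached input -- proceed as though there is no input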
Example #2
@contextlib.contextmanager
def open_cached_render_result(
        crr: CachedRenderResult) -> ContextManager[RenderResult]:
    """
    Yield a RenderResult equivalent to the one passed to `cache_render_result()`.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.

    The returned RenderResult is backed by an mmapped file on disk, so it
    doesn't require much physical RAM.
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        yield RenderResult(
            ArrowTable.from_zero_column_metadata(
                TableMetadata(crr.table_metadata.n_rows, [])),
            crr.errors,
            crr.json,
        )
        return

    with tempfile_context(prefix="cached-render-result") as arrow_path:
        # load_cached_render_result() may raise CorruptCacheError,
        # deleting `arrow_path` in the process
        result = load_cached_render_result(crr, arrow_path)

        yield result
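A caller might consume it like any generator-based context manager (a sketch,
assuming `crr` is a fresh CachedRenderResult and `CorruptCacheError` is in
scope):

try:
    with open_cached_render_result(crr) as result:
        table = result.table  # backed by an mmapped file; cheap in RAM
except CorruptCacheError:
    pass  # missing/corrupt/stale cache -- caller decides whether to re-render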
Example #3
    def test_text_zero_chunks_valid(self):
        validate(
            pyarrow.Table.from_batches(
                [], pyarrow.schema([("A", pyarrow.string())])
            ),
            TableMetadata(0, [Text("A")]),
        )

    def test_clean_multicolumn_sort_in_table_order(self):
        input_shape = TableMetadata(
            3,
            [Column("B", ColumnType.Number()), Column("A", ColumnType.Number())],
        )
        result = clean_value(ParamDType.Multicolumn(), ["A", "B"], input_shape)
        self.assertEqual(result, ["B", "A"])
Example #5
@contextlib.contextmanager
def arrow_table_context(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    columns: Optional[List[Column]] = None,
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[ArrowTable]:
    """
    Yield an ArrowTable (whose `.path` is a file).

    Metadata is inferred. Number columns have format `{:,}`.
    """
    if isinstance(table, dict):
        table = pyarrow.table(table)

    if columns is None:
        columns = [
            _arrow_column_to_column(name, col)
            for name, col in zip(table.column_names, table.columns)
        ]
    metadata = TableMetadata(table.num_rows, columns)

    if metadata.columns:
        with arrow_file(table, dir=dir) as path:
            yield ArrowTable(path, table, metadata)
    else:
        yield ArrowTable(None, None, metadata)
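Usage sketch, exercising both branches above (the zero-column case yields an
ArrowTable with no backing file):

with arrow_table_context({"A": [1, 2, 3]}) as arrow_table:
    assert arrow_table.metadata.n_rows == 3
    assert arrow_table.path is not None  # one column => written to disk

with arrow_table_context({}) as empty_table:
    assert empty_table.path is None  # zero columns => nothing written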
    def test_clean_column_happy_path(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        self.assertEqual(
            clean_value(
                ParamDType.Column(column_types=frozenset({"number"})),
                "A",
                input_shape,
            ),
            "A",
        )
Example #7
    def test_metadata_comes_from_db_columns(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Datetime()),
            Column("C", ColumnType.Text()),
        ]
        result = RenderResult(
            arrow_table(
                {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]},
                columns=columns,
            )
        )
        cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
        # Delete from disk entirely, to prove we did not read.
        minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
        cached_result = fresh_wf_module.cached_render_result

        self.assertEqual(cached_result.table_metadata,
                         TableMetadata(1, columns))
Example #8
    def test_input_crr_corrupt_cache_error_is_none(
        self, downloaded_parquet_file, load_module
    ):
        load_module.return_value.migrate_params.return_value = {}
        load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
        downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
            "file not found"
        )
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            WfModule(),
            MockModuleVersion(),
            {},
            None,
            input_crr,
            self.output_path,
        )
        # fetch is still called, with `None` as argument.
        self.assertIsNone(
            load_module.return_value.fetch.call_args[1]["input_parquet_filename"]
        )
Example #9
    def test_input_crr(self, downloaded_parquet_file, clean_value, load_module):
        load_module.return_value.migrate_params.return_value = {}
        load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
        clean_value.return_value = {}
        downloaded_parquet_file.return_value = Path("/path/to/x.parquet")
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            WfModule(),
            MockModuleVersion(),
            {},
            None,
            input_crr,
            self.output_path,
        )
        # Passed file is downloaded from rendercache
        downloaded_parquet_file.assert_called_with(input_crr, dir=self.basedir)
        self.assertEqual(
            load_module.return_value.fetch.call_args[1]["input_parquet_filename"],
            "x.parquet",
        )
        # clean_value() is called with input metadata from CachedRenderResult
        clean_value.assert_called()
        self.assertEqual(clean_value.call_args[0][2], input_metadata)
Example #10
    def test_metadata_does_not_require_file_read(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
            Column("D", ColumnType.Date("month")),
        ]
        with arrow_table_context(
            make_column("A", [1], format="{:,.2f}"),
            make_column("B", [datetime.datetime(2021, 4, 13)]),
            make_column("C", ["c"]),
            make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
        ) as (path, table):
            result = LoadedRenderResult(
                path=path, table=table, columns=columns, errors=[], json={}
            )
            cache_render_result(self.workflow, self.step, 1, result)
        # Delete from disk entirely, to prove we did not read.
        s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_step = Step.objects.get(id=self.step.id)
        cached_result = fresh_step.cached_render_result

        self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
Example #11
def infer_table_metadata(table: pyarrow.Table) -> TableMetadata:
    return TableMetadata(
        n_rows=table.num_rows,
        columns=[
            Column(name, _infer_output_column_type(column))
            for name, column in zip(table.column_names, table.columns)
        ],
    )
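For example (a sketch; it assumes `Column` exposes the `name` passed to its
constructor, and that `_infer_output_column_type` handles int and string
columns):

table = pyarrow.table({"A": [1, 2], "B": ["x", "y"]})
metadata = infer_table_metadata(table)
assert metadata.n_rows == 2
assert [column.name for column in metadata.columns] == ["A", "B"]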
Example #12
    def test_duplicate_column_name(self):
        with self.assertRaises(DuplicateColumnName):
            validate(
                pyarrow.Table.from_arrays(
                    [pyarrow.array(["a"]), pyarrow.array(["b"])], ["A", "A"]
                ),
                TableMetadata(1, [Text("A"), Text("A")]),
            )
Example #13
    def test_clean_multicolumn_valid(self):
        input_shape = TableMetadata(
            3,
            [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())],
        )
        result = clean_value(ParamSchema.Multicolumn(), ["A", "B"], input_shape)
        self.assertEqual(result, ["A", "B"])

    def test_clean_multicolumn_missing_is_removed(self):
        input_shape = TableMetadata(
            3,
            [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())],
        )
        result = clean_value(ParamDType.Multicolumn(), ["A", "X", "B"], input_shape)
        self.assertEqual(result, ["A", "B"])
Example #15
    def test_column_name_not_utf8(self):
        bad_str = "\ud800x"  # lone surrogate -- not valid Unicode text
        bad_bytes = b"\xed\xa0\x80x"  # bad_str surrogate-encoded: invalid UTF-8

        with self.assertRaises(ColumnNameIsInvalidUtf8):
            validate(
                pyarrow.Table.from_arrays([pyarrow.array(["a"])], [bad_bytes]),
                TableMetadata(1, [Text(bad_str)]),
            )
Example #16
    def test_column_name_mismatch(self):
        with self.assertRaises(WrongColumnName):
            validate(
                pyarrow.table({"A": ["a"], "B": ["b"]}),
                TableMetadata(1, [Text("A"), Text("B2")]),
            )
Example #17
    def test_text_dictionary_zero_chunks_is_valid(self):
        validate(
            pyarrow.Table.from_batches(
                [],
                pyarrow.schema(
                    [("A", pyarrow.dictionary(pyarrow.int32(), pyarrow.string()))]
                ),
            ),
            TableMetadata(0, [Text("A")]),
        )

    def test_clean_normal_dict(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        schema = ParamDType.Dict(
            {"str": ParamDType.String(), "int": ParamDType.Integer()}
        )
        value = {"str": "foo", "int": 3}
        expected = dict(value)  # no-op
        result = clean_value(schema, value, input_shape)
        self.assertEqual(result, expected)
Example #19
    def test_dictionary_column_has_unused_entry(self):
        with self.assertRaises(DictionaryColumnHasUnusedEntry):
            validate(
                pyarrow.table(
                    {
                        # slice "z" away, leaving it an unused dictionary entry
                        "A": pyarrow.array(
                            ["x", None, "y", "y", "z"]
                        ).dictionary_encode()[0:4]
                    }
                ),
                TableMetadata(4, [Text("A")]),
            )
Example #20
    def test_table_not_one_batch(self):
        with self.assertRaises(TableHasTooManyRecordBatches):
            validate(
                pyarrow.Table.from_batches(
                    [
                        pyarrow.RecordBatch.from_arrays([pyarrow.array(["a"])], ["A"]),
                        pyarrow.RecordBatch.from_arrays([pyarrow.array(["b"])], ["A"]),
                    ]
                ),
                TableMetadata(2, [Text("A")]),
            )
Example #21
    def test_column_datetime_must_be_ns_resolution(self):
        # [2019-09-17] Pandas only supports datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        with self.assertRaises(DatetimeUnitNotAllowed):
            validate(
                pyarrow.table(
                    {
                        "A": pyarrow.array(
                            [5298375234], type=pyarrow.timestamp("us", tz=None)
                        )
                    }
                ),
                TableMetadata(1, [Datetime("A")]),
            )
Example #22
    def test_column_datetime_should_be_tz_naive(self):
        with self.assertRaises(DatetimeTimezoneNotAllowed):
            validate(
                pyarrow.table(
                    {
                        "A": pyarrow.array(
                            [5298375234123],
                            type=pyarrow.timestamp("ns", "America/New_York"),
                        )
                    }
                ),
                TableMetadata(1, [Datetime("A")]),
            )
    def test_clean_column_prompting_error_convert_to_number(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Text())])
        with self.assertRaises(PromptingError) as cm:
            clean_value(ParamDType.Column(column_types=frozenset({"number"})),
                        "A", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A"], "text",
                                               frozenset({"number"}))
            ],
        )
    def test_clean_column_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # Consider Regex. We probably want to pass the module a text Series
        # _separately_ from the input DataFrame. That way Regex can output
        # a new Text column but preserve its input column's data type.
        #
        # ... but for now: prompt for a Quick Fix.
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        with self.assertRaises(PromptingError) as cm:
            clean_value(ParamDType.Column(column_types=frozenset({"text"})),
                        "A", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [PromptingError.WrongColumnType(["A"], None, frozenset({"text"}))],
        )
Example #25
    def test_text_invalid_utf8(self):
        # Let's construct a particularly tricky case: two strings that are
        # invalid on their own but are valid when concatenated. (In the buffer
        # they're concatenated, so the buffer bytes are valid utf-8 even though
        # the values aren't.)
        #
        # We'll also throw in a NULL, so we don't get tempted to ignore nulls
        # when we optimize this validation.
        poop_bytes = "💩".encode("utf-8")
        binary_array = pyarrow.array([None, poop_bytes[:2], poop_bytes[2:]])
        _, offsets, data = binary_array.buffers()
        with self.assertRaises(ColumnDataIsInvalidUtf8):
            validate(
                pyarrow.table(
                    {"A": pyarrow.StringArray.from_buffers(3, offsets, data)}
                ),
                TableMetadata(3, [Text("A")]),
            )
    def test_input_crr(self, downloaded_parquet_file, clean_value):
        def do_fetch(
            compiled_module,
            chroot_context,
            basedir,
            params,
            secrets,
            last_fetch_result,
            input_parquet_filename,
            output_filename,
        ):
            shutil.copy(basedir / input_parquet_filename,
                        basedir / output_filename)
            return FetchResult(basedir / output_filename)

        self.kernel.fetch.side_effect = do_fetch
        clean_value.return_value = {}

        with tempfile_context(dir=self.basedir,
                              suffix=".parquet") as parquet_path:
            parquet_path.write_bytes(b"abc123")
            downloaded_parquet_file.return_value = parquet_path

            input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
            input_crr = CachedRenderResult(1, 2, 3, "ok", [], {},
                                           input_metadata)
            with self.assertLogs("fetcher.fetch", level=logging.INFO):
                result = fetch.fetch_or_wrap_error(
                    self.ctx,
                    self.chroot_context,
                    self.basedir,
                    "mod",
                    create_module_zipfile("mod"),
                    {},
                    {},
                    None,
                    input_crr,
                    self.output_path,
                )

            # Passed file is downloaded from rendercache
            self.assertEqual(result.path.read_bytes(), b"abc123")
            # clean_value() is called with input metadata from CachedRenderResult
            clean_value.assert_called()
            self.assertEqual(clean_value.call_args[0][2], input_metadata)
    def _build_cached_render_result_fresh_or_not(self) -> Optional[CachedRenderResult]:
        """
        Build a CachedRenderResult with this WfModule's rendered output.

        If the output is stale, return it anyway. (The return value's .delta_id
        will not match this WfModule's .delta_id.)

        This does not read the dataframe from disk. If you want a "snapshot in
        time" of the `render()` output, you need a lock, like this:

            # Lock the workflow, making sure we don't overwrite data
            with workflow.cooperative_lock():
                wf_module.refresh_from_db()
                # Read from disk
                with cjwstate.rendercache.io.open_cached_render_result(
                    wf_module.get_stale_cached_render_result()
                ) as result:
        """
        if self.cached_render_result_delta_id is None:
            return None

        delta_id = self.cached_render_result_delta_id
        status = self.cached_render_result_status
        columns = self.cached_render_result_columns
        errors = self.cached_render_result_errors
        nrows = self.cached_render_result_nrows

        # cached_render_result_json is sometimes a memoryview
        json_bytes = bytes(self.cached_render_result_json)
        if json_bytes:
            json_dict = json.loads(json_bytes)
        else:
            json_dict = {}

        return CachedRenderResult(
            workflow_id=self.workflow_id,
            wf_module_id=self.id,
            delta_id=delta_id,
            status=status,
            errors=errors,
            json=json_dict,
            table_metadata=TableMetadata(nrows, columns),
        )
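A hypothetical staleness check built on that contract (the name of the field
holding the step's current delta id, `last_relevant_delta_id`, is an
assumption):

crr = wf_module._build_cached_render_result_fresh_or_not()
if crr is None:
    pass  # never rendered
elif crr.delta_id != wf_module.last_relevant_delta_id:
    pass  # stale: rendered for an older delta; re-render before trusting it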
Example #28
    def test_input_crr_corrupt_cache_error_is_none(self, downloaded_parquet_file):
        self.kernel.fetch.return_value = FetchResult(self.output_path, [])
        downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
            "file not found"
        )
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        with self.assertLogs("fetcher.fetch", level=logging.INFO):
            fetch.fetch_or_wrap_error(
                self.ctx,
                self.chroot_context,
                self.basedir,
                "mod",
                create_module_zipfile("mod"),
                {},
                {},
                None,
                input_crr,
                self.output_path,
            )
        # fetch is still called, with `None` as argument.
        self.assertIsNone(self.kernel.fetch.call_args[1]["input_parquet_filename"])
    def test_clean_multicolumn_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # ... but for now: prompt for a Quick Fix.
        input_shape = TableMetadata(
            3,
            [
                Column("A", ColumnType.Number()),
                Column("B", ColumnType.Timestamp()),
                Column("C", ColumnType.Text()),
            ],
        )
        with self.assertRaises(PromptingError) as cm:
            schema = ParamDType.Multicolumn(column_types=frozenset({"text"}))
            clean_value(schema, "A,B", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A", "B"], None,
                                               frozenset({"text"}))
            ],
        )
    def test_dict_prompting_error(self):
        input_shape = TableMetadata(
            3, [Column("A", ColumnType.Text()), Column("B", ColumnType.Text())]
        )
        schema = ParamDType.Dict(
            {
                "col1": ParamDType.Column(column_types=frozenset({"number"})),
                "col2": ParamDType.Column(column_types=frozenset({"timestamp"})),
            }
        )
        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, {"col1": "A", "col2": "B"}, input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A"], "text",
                                               frozenset({"number"})),
                PromptingError.WrongColumnType(["B"], "text",
                                               frozenset({"timestamp"})),
            ],
        )