Example #1
    def test_metadata_comes_from_db_columns(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Datetime()),
            Column("C", ColumnType.Text()),
        ]
        result = RenderResult(
            arrow_table(
                {
                    "A": [1],
                    "B": [datetime.datetime.now()],
                    "C": ["x"],
                },
                columns=columns,
            )
        )
        cache_render_result(self.workflow, self.wf_module, self.delta.id,
                            result)
        # Delete from disk entirely, to prove we did not read.
        minio.remove(BUCKET,
                     crr_parquet_key(self.wf_module.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
        cached_result = fresh_wf_module.cached_render_result

        self.assertEqual(cached_result.table_metadata,
                         TableMetadata(1, columns))
Example #2
    def test_metadata_does_not_require_file_read(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
            Column("D", ColumnType.Date("month")),
        ]
        with arrow_table_context(
            make_column("A", [1], format="{:,.2f}"),
            make_column("B", [datetime.datetime(2021, 4, 13)]),
            make_column("C", ["c"]),
            make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
        ) as (path, table):
            result = LoadedRenderResult(
                path=path, table=table, columns=columns, errors=[], json={}
            )
            cache_render_result(self.workflow, self.step, 1, result)
        # Delete from disk entirely, to prove we did not read.
        s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_step = Step.objects.get(id=self.step.id)
        cached_result = fresh_step.cached_render_result

        self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
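Both versions of this test (Examples #1 and #2) rely on the same invariant: table metadata is rebuilt from the cached_render_result_* DB columns on the row, so no Parquet read is ever needed. A minimal sketch of that lookup, assuming the nrows and columns field names revealed by the copy loop in Examples #12-#13 (the function name is hypothetical):

    def table_metadata_from_db(step) -> TableMetadata:
        # Sketch only: rebuild metadata purely from DB columns, never from
        # the Parquet file -- which is why deleting the file above is safe.
        return TableMetadata(
            step.cached_render_result_nrows,    # row count stored at cache time
            step.cached_render_result_columns,  # [Column, ...] stored at cache time
        )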
Example #3
    def test_invalid_parquet_is_corrupt_cache_error(self):
        result = RenderResult(arrow_table({"A": [1]}))
        cache_render_result(self.workflow, self.wf_module, self.delta.id,
                            result)
        crr = self.wf_module.cached_render_result
        minio.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
        with tempfile_context() as arrow_path:
            with self.assertRaises(CorruptCacheError):
                load_cached_render_result(crr, arrow_path)
Example #4
    def test_delete_step(self):
        result = RenderResult(arrow_table({"A": [1]}),
                              [RenderError(I18nMessage("X", {}, None), [])],
                              {})
        cache_render_result(self.workflow, self.step, 1, result)

        parquet_key = crr_parquet_key(self.step.cached_render_result)
        self.step.delete()
        self.assertFalse(s3.exists(BUCKET, parquet_key))
Example #5
    def test_delete_wfmodule(self):
        result = RenderResult(
            arrow_table({"A": [1]}), [RenderError(I18nMessage("X", []), [])], {}
        )
        cache_render_result(self.workflow, self.wf_module, self.delta.id, result)

        parquet_key = crr_parquet_key(self.wf_module.cached_render_result)
        self.wf_module.delete()
        self.assertFalse(minio.exists(BUCKET, parquet_key))
Example #6
    def test_clear(self):
        result = RenderResult(arrow_table({"A": [1]}))
        cache_render_result(self.workflow, self.wf_module, self.delta.id,
                            result)
        parquet_key = crr_parquet_key(self.wf_module.cached_render_result)
        clear_cached_render_result_for_wf_module(self.wf_module)

        db_wf_module = WfModule.objects.get(id=self.wf_module.id)
        self.assertIsNone(db_wf_module.cached_render_result)

        self.assertFalse(minio.exists(BUCKET, parquet_key))
Example #7
    def test_delete_step(self):
        write_to_rendercache(
            self.workflow,
            self.step,
            1,
            table=make_table(make_column("A", [1])),
            errors=[RenderError(I18nMessage("X", {}, None), [])],
            json={"foo": "bar"},
        )

        parquet_key = crr_parquet_key(self.step.cached_render_result)
        self.step.delete()
        self.assertFalse(s3.exists(BUCKET, parquet_key))
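Examples #4, #5 and #7 all assert the same cascade: deleting the model row also removes the cached Parquet object. The override below is one plausible implementation, a hedged sketch rather than the project's confirmed code:

    def delete(self, *args, **kwargs):
        # Hypothetical Step.delete override: drop the cached Parquet object
        # before the row disappears, keeping the bucket and the DB in sync.
        crr = self.cached_render_result
        if crr is not None:
            s3.remove(BUCKET, crr_parquet_key(crr))
        super().delete(*args, **kwargs)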
Example #8
    def test_invalid_parquet_is_corrupt_cache_error(self):
        with arrow_table_context(make_column("A", ["x"])) as (path, table):
            result = LoadedRenderResult(
                path=path,
                table=table,
                columns=[Column("A", ColumnType.Text())],
                errors=[],
                json={},
            )
            cache_render_result(self.workflow, self.step, 1, result)
        crr = self.step.cached_render_result
        s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
        with tempfile_context() as arrow_path:
            with self.assertRaises(CorruptCacheError):
                with open_cached_render_result(crr) as loaded:
                    pass
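Both corrupt-cache tests (#3 and #8) require the reader to validate the Parquet payload rather than trust the DB row. A sketch of that check, assuming pyarrow rejects the b"NOT PARQUET" bytes with ArrowInvalid; the helper name is hypothetical, and CorruptCacheError's real import path isn't shown above, so a stand-in is defined here:

    import pyarrow
    import pyarrow.parquet

    class CorruptCacheError(Exception):
        """Stand-in for the exception the tests above expect."""

    def read_parquet_or_raise(path):
        # Sketch: a bucket object that exists but is not valid Parquet must
        # surface as CorruptCacheError, not crash the renderer.
        try:
            return pyarrow.parquet.read_table(path)
        except pyarrow.ArrowInvalid as err:
            raise CorruptCacheError from err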
Example #9
    def test_clear(self):
        with arrow_table_context(make_column("A", [1])) as (path, table):
            result = LoadedRenderResult(
                path=path,
                table=table,
                columns=[Column("A", ColumnType.Number(format="{:,}"))],
                errors=[],
                json={},
            )
            cache_render_result(self.workflow, self.step, 1, result)

        parquet_key = crr_parquet_key(self.step.cached_render_result)
        clear_cached_render_result_for_step(self.step)

        db_step = Step.objects.get(id=self.step.id)
        self.assertIsNone(db_step.cached_render_result)

        self.assertFalse(s3.exists(BUCKET, parquet_key))
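The clear tests (#6 and #9) pin down two effects: the DB columns go back to null (so cached_render_result returns None) and the Parquet object is removed. A minimal sketch under those assumptions, reusing the import shown in Example #13:

    from cjwstate.rendercache.io import BUCKET, crr_parquet_key

    def clear_cached_render_result_for_step(step) -> None:
        # Sketch: capture the key before nulling, because once
        # cached_render_result_delta_id is None the property returns None.
        # A fuller implementation would likely null the other
        # cached_render_result_* columns too.
        parquet_key = crr_parquet_key(step.cached_render_result)
        step.cached_render_result_delta_id = None
        step.save(update_fields=["cached_render_result_delta_id"])
        s3.remove(BUCKET, parquet_key)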
Example #10
    def test_cache_render_result(self):
        with arrow_table_context(make_column("A", [1])) as (table_path, table):
            result = LoadedRenderResult(
                path=table_path,
                table=table,
                columns=[Column("A", ColumnType.Number(format="{:,}"))],
                errors=[
                    RenderError(
                        I18nMessage("e1", {"text": "hi"}, None),
                        [
                            QuickFix(
                                I18nMessage("q1", {"var": 2}, None),
                                QuickFixAction.PrependStep("filter", {"a": "x"}),
                            )
                        ],
                    ),
                    RenderError(I18nMessage("e2", {}, None), []),
                ],
                json={"foo": "bar"},
            )
            cache_render_result(self.workflow, self.step, 1, result)

        cached = self.step.cached_render_result
        self.assertEqual(cached.step_id, self.step.id)
        self.assertEqual(cached.delta_id, 1)

        self.assertEqual(
            crr_parquet_key(cached),
            f"wf-{self.workflow.id}/wfm-{self.step.id}/delta-1.dat",
        )

        # Reading completely freshly from the DB should give the same thing
        db_step = Step.objects.get(id=self.step.id)
        from_db = db_step.cached_render_result
        self.assertEqual(from_db, cached)

        with open_cached_render_result(from_db) as result2:
            assert_arrow_table_equals(
                result2.table, make_table(make_column("A", [1], format="{:,}"))
            )
            self.assertEqual(
                result2.columns, [Column("A", ColumnType.Number(format="{:,}"))]
            )
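The key-format assertion above (and its twin in Example #11) fixes crr_parquet_key exactly; this sketch matches it, with the attribute names on CachedRenderResult assumed:

    def crr_parquet_key(crr) -> str:
        # Format pinned by Examples #10/#11. The legacy "wfm-" segment
        # survives the WfModule -> Step rename, so old cache keys stay valid.
        return f"wf-{crr.workflow_id}/wfm-{crr.step_id}/delta-{crr.delta_id}.dat"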
Example #11
    def test_cache_render_result(self):
        result = RenderResult(
            arrow_table({"A": [1]}),
            [
                RenderError(
                    I18nMessage("e1", [1, "x"]),
                    [
                        QuickFix(
                            I18nMessage("q1", []),
                            QuickFixAction.PrependStep("filter", {"a": "x"}),
                        )
                    ],
                ),
                RenderError(I18nMessage("e2", []), []),
            ],
            {"foo": "bar"},
        )
        cache_render_result(self.workflow, self.wf_module, self.delta.id,
                            result)

        cached = self.wf_module.cached_render_result
        self.assertEqual(cached.wf_module_id, self.wf_module.id)
        self.assertEqual(cached.delta_id, self.delta.id)

        self.assertEqual(
            crr_parquet_key(cached),
            f"wf-{self.workflow.id}/wfm-{self.wf_module.id}/delta-{self.delta.id}.dat",
        )

        # Reading completely freshly from the DB should give the same thing
        db_wf_module = WfModule.objects.get(id=self.wf_module.id)
        from_db = db_wf_module.cached_render_result
        self.assertEqual(from_db, cached)

        with open_cached_render_result(from_db) as result2:
            assert_render_result_equals(result2, result)
Example #12
    def _duplicate_with_slug_and_delta_id(self, to_tab, slug, last_relevant_delta_id):
        # Initialize but don't save
        new_step = WfModule(
            tab=to_tab,
            slug=slug,
            module_id_name=self.module_id_name,
            fetch_errors=self.fetch_errors,
            stored_data_version=self.stored_data_version,
            order=self.order,
            notes=self.notes,
            is_collapsed=self.is_collapsed,
            auto_update_data=False,
            next_update=None,
            update_interval=self.update_interval,
            last_update_check=self.last_update_check,
            last_relevant_delta_id=last_relevant_delta_id,
            params=self.params,
            secrets={},  # DO NOT COPY SECRETS
        )

        # Copy cached render result, if there is one.
        #
        # If we duplicate a Workflow mid-render, the cached render result might
        # not have any useful data. But that's okay: just kick off a new
        # render. The common case (all-rendered Workflow) will produce a
        # fully-rendered duplicate Workflow.
        #
        # We cannot copy the cached result if the destination Tab has a
        # different name than this one: tab_name is passed to render(), so even
        # an exactly-duplicated WfModule can have a different output.
        cached_result = self.cached_render_result
        if cached_result is not None and self.tab.name == to_tab.name:
            # assuming file-copy succeeds, copy cached results.
            new_step.cached_render_result_delta_id = new_step.last_relevant_delta_id
            for attr in ("status", "errors", "json", "columns", "nrows"):
                full_attr = f"cached_render_result_{attr}"
                setattr(new_step, full_attr, getattr(self, full_attr))

            new_step.save()  # so there is a new_step.id for parquet_key

            # Now new_step.cached_render_result will return a
            # CachedRenderResult, because all the DB values are set. It'll have
            # a .parquet_key ... but there won't be a file there (because we
            # never wrote it).
            from cjwstate.rendercache.io import BUCKET, crr_parquet_key

            old_parquet_key = crr_parquet_key(cached_result)
            new_parquet_key = crr_parquet_key(new_step.cached_render_result)

            try:
                minio.copy(
                    minio.CachedRenderResultsBucket,
                    new_parquet_key,
                    "%(Bucket)s/%(Key)s" % {"Bucket": BUCKET, "Key": old_parquet_key},
                )
            except minio.error.NoSuchKey:
                # DB and filesystem are out of sync. CachedRenderResult handles
                # such cases gracefully. So `new_result` will behave exactly
                # like `cached_result`.
                pass
        else:
            new_step.save()

        # Duplicate the current stored data only, not the history
        if self.stored_data_version is not None:
            self.stored_objects.get(stored_at=self.stored_data_version).duplicate(
                new_step
            )

        # Duplicate the "selected" file, if there is one; otherwise, duplicate
        # the most-recently-uploaded file.
        #
        # We special-case the 'upload' module because it's the only one that
        # has 'file' params right now. (If that ever changes, we'll want to
        # change a few things: upload paths should include param name, and this
        # code will need to check module_zipfile to find the param name of the
        # file.)
        if self.module_id_name == "upload":
            uuid = self.params["file"]
            uploaded_file = self.uploaded_files.filter(uuid=uuid).first()
            if uploaded_file is not None:
                new_key = uploaded_file.key.replace(
                    self.uploaded_file_prefix, new_step.uploaded_file_prefix
                )
                assert new_key != uploaded_file.key
                # TODO handle file does not exist
                minio.copy(
                    minio.UserFilesBucket,
                    new_key,
                    f"{minio.UserFilesBucket}/{uploaded_file.key}",
                )
                new_step.uploaded_files.create(
                    created_at=uploaded_file.created_at,
                    name=uploaded_file.name,
                    size=uploaded_file.size,
                    uuid=uploaded_file.uuid,
                    key=new_key,
                )

        return new_step
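The comments in this method imply that cached_render_result is derived entirely from the cached_render_result_* DB columns and is None until all of them are set. A hedged reconstruction of that property (the CachedRenderResult constructor arguments are assumed, not confirmed):

    @property
    def cached_render_result(self):
        # Sketch: backed by DB columns alone, so it returns None before the
        # first cached render and again after a clear.
        if self.cached_render_result_delta_id is None:
            return None
        return CachedRenderResult(
            workflow_id=self.workflow_id,  # assumed accessor
            step_id=self.id,
            delta_id=self.cached_render_result_delta_id,
            status=self.cached_render_result_status,
            errors=self.cached_render_result_errors,
            json=self.cached_render_result_json,
            columns=self.cached_render_result_columns,
            nrows=self.cached_render_result_nrows,
        )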
Example #13
    def _duplicate_with_slug_and_delta_id(self, to_tab, slug, last_relevant_delta_id):
        # Initialize but don't save
        new_step = Step(
            tab=to_tab,
            slug=slug,
            module_id_name=self.module_id_name,
            fetch_errors=self.fetch_errors,
            stored_data_version=self.stored_data_version,
            order=self.order,
            notes=self.notes,
            is_collapsed=self.is_collapsed,
            auto_update_data=False,
            next_update=None,
            update_interval=self.update_interval,
            last_update_check=self.last_update_check,
            last_relevant_delta_id=last_relevant_delta_id,
            params=self.params,
            secrets={},  # DO NOT COPY SECRETS
        )

        # Copy cached render result, if there is one.
        #
        # If we duplicate a Workflow mid-render, the cached render result might
        # not have any useful data. But that's okay: just kick off a new
        # render. The common case (all-rendered Workflow) will produce a
        # fully-rendered duplicate Workflow.
        #
        # We cannot copy the cached result if the destination Tab has a
        # different name than this one: tab_name is passed to render(), so even
        # an exactly-duplicated Step can have a different output.
        cached_result = self.cached_render_result
        if cached_result is not None and self.tab.name == to_tab.name:
            # assuming file-copy succeeds, copy cached results.
            new_step.cached_render_result_delta_id = new_step.last_relevant_delta_id
            for attr in ("status", "errors", "json", "columns", "nrows"):
                full_attr = f"cached_render_result_{attr}"
                setattr(new_step, full_attr, getattr(self, full_attr))

            new_step.save()  # so there is a new_step.id for parquet_key

            # Now new_step.cached_render_result will return a
            # CachedRenderResult, because all the DB values are set. It'll have
            # a .parquet_key ... but there won't be a file there (because we
            # never wrote it).
            from cjwstate.rendercache.io import BUCKET, crr_parquet_key

            old_parquet_key = crr_parquet_key(cached_result)
            new_parquet_key = crr_parquet_key(new_step.cached_render_result)

            try:
                s3.copy(
                    s3.CachedRenderResultsBucket,
                    new_parquet_key,
                    "%(Bucket)s/%(Key)s" % {"Bucket": BUCKET, "Key": old_parquet_key},
                )
            except s3.layer.error.NoSuchKey:
                # DB and filesystem are out of sync. CachedRenderResult handles
                # such cases gracefully. So `new_result` will behave exactly
                # like `cached_result`.
                pass
        else:
            new_step.save()

        # Duplicate the current stored data only, not the history
        if self.stored_data_version is not None:
            self.stored_objects.get(stored_at=self.stored_data_version).duplicate(
                new_step
            )

        # For each "file" param, duplicate the "selected" uploaded_file if there
        # is one.
        #
        # We assume any UUID in `params` that points to an uploaded file _is_
        # a file-dtype param. ([adamhooper, 2020-07-14] when the assumption does
        # not hold, will this cause DB errors? Not sure, but it's not a security
        # risk.)
        #
        # Why not check the param schema? Because we'd need to define behavior
        # for when the module doesn't exist, or its version is changed, or its
        # code breaks.... bah! These behaviors don't line up with any user
        # expectations. Users want to copy the thing they see.
        for uuid_str in self.params.values():
            if not isinstance(uuid_str, str):
                continue
            try:
                UUID(uuid_str)
            except ValueError:
                continue
            uploaded_file = self.uploaded_files.filter(uuid=uuid_str).first()
            if not uploaded_file:
                continue

            new_key = uploaded_file.key.replace(
                self.uploaded_file_prefix, new_step.uploaded_file_prefix
            )
            assert new_key != uploaded_file.key
            # TODO handle file does not exist
            s3.copy(
                s3.UserFilesBucket,
                new_key,
                f"{s3.UserFilesBucket}/{uploaded_file.key}",
            )
            new_step.uploaded_files.create(
                created_at=uploaded_file.created_at,
                name=uploaded_file.name,
                size=uploaded_file.size,
                uuid=uploaded_file.uuid,
                key=new_key,
            )

        return new_step
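The UUID scan near the end of this method lifts out cleanly. A standalone sketch of the filter (the helper name is hypothetical), followed by the kind of value it yields:

    from uuid import UUID

    def iter_file_param_uuids(params: dict):
        # Mirror of the loop in Example #13: yield every string param value
        # that parses as a UUID; non-strings and non-UUID strings are skipped.
        for value in params.values():
            if not isinstance(value, str):
                continue
            try:
                UUID(value)
            except ValueError:
                continue
            yield value

    # e.g. {"file": "de305d54-75b4-431b-adb2-eb6b9e546014", "n": 3}
    # yields only the "file" value.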