Esempio n. 1
0
 def test_input_crr_corrupt_cache_error_is_none(
     self, downloaded_parquet_file, load_module
 ):
     load_module.return_value.migrate_params.return_value = {}
     load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
     downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
         "file not found"
     )
     input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
     input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
     fetch.fetch_or_wrap_error(
         self.ctx,
         self.chroot_context,
         self.basedir,
         WfModule(),
         MockModuleVersion(),
         {},
         None,
         input_crr,
         self.output_path,
     )
     # fetch is still called, with `None` as argument.
     self.assertIsNone(
         load_module.return_value.fetch.call_args[1]["input_parquet_filename"]
     )
Esempio n. 2
0
 def test_input_crr(self, downloaded_parquet_file, clean_value, load_module):
     load_module.return_value.migrate_params.return_value = {}
     load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
     clean_value.return_value = {}
     downloaded_parquet_file.return_value = Path("/path/to/x.parquet")
     input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
     input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
     fetch.fetch_or_wrap_error(
         self.ctx,
         self.chroot_context,
         self.basedir,
         WfModule(),
         MockModuleVersion(),
         {},
         None,
         input_crr,
         self.output_path,
     )
     # Passed file is downloaded from rendercache
     downloaded_parquet_file.assert_called_with(input_crr, dir=self.basedir)
     self.assertEqual(
         load_module.return_value.fetch.call_args[1]["input_parquet_filename"],
         "x.parquet",
     )
     # clean_value() is called with input metadata from CachedRenderResult
     clean_value.assert_called()
     self.assertEqual(clean_value.call_args[0][2], input_metadata)
Esempio n. 3
0
    def test_metadata_comes_from_db_columns(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Datetime()),
            Column("C", ColumnType.Text()),
        ]
        result = RenderResult(
            arrow_table({
                "A": [1],
                "B": [datetime.datetime.now()],
                "C": ["x"]
            },
                        columns=columns))
        cache_render_result(self.workflow, self.wf_module, self.delta.id,
                            result)
        # Delete from disk entirely, to prove we did not read.
        minio.remove(BUCKET,
                     crr_parquet_key(self.wf_module.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
        cached_result = fresh_wf_module.cached_render_result

        self.assertEqual(cached_result.table_metadata,
                         TableMetadata(1, columns))
Esempio n. 4
0
def _infer_output_column_type(column: pyarrow.ChunkedArray) -> ColumnType:
    if column.type == pyarrow.utf8() or (hasattr(column.type, "value_type")
                                         and column.type.value_type
                                         == pyarrow.utf8()):
        return ColumnType.Text()
    else:
        return ColumnType.Number()
Esempio n. 5
0
    def test_render_using_tab_output(self):
        def render(table, params):
            self.assertEqual(params["tabparam"].slug, "tab-1")
            self.assertEqual(params["tabparam"].name, "Tab 1")
            self.assertEqual(
                params["tabparam"].columns,
                {
                    "X": ptypes.RenderColumn("X", "number", "{:,d}"),
                    "Y": ptypes.RenderColumn("Y", "text", None),
                },
            )
            assert_frame_equal(params["tabparam"].dataframe,
                               pd.DataFrame({
                                   "X": [1],
                                   "Y": ["y"]
                               }))

        with arrow_table_context(
            {
                "X": [1],
                "Y": ["y"]
            },
                columns=[
                    Column("X", ColumnType.Number("{:,d}")),
                    Column("Y", ColumnType.Text()),
                ],
                dir=self.basedir,
        ) as atable:
            self._test_render(
                render,
                params={"tabparam": TabOutput(Tab("tab-1", "Tab 1"), atable)})
Esempio n. 6
0
    def test_render_with_input_columns(self):
        def render(*args, input_columns):
            self.assertEqual(
                input_columns,
                {
                    "A": ptypes.RenderColumn("A", "text", None),
                    "B": ptypes.RenderColumn("B", "number", "{:,.3f}"),
                    "C": ptypes.RenderColumn("C", "datetime", None),
                },
            )

        with arrow_table_context(
            {
                "A": ["x"],
                "B": [1],
                "C": pa.array([datetime.now()], pa.timestamp("ns"))
            },
                columns=[
                    Column("A", ColumnType.Text()),
                    Column("B", ColumnType.Number("{:,.3f}")),
                    Column("C", ColumnType.Datetime()),
                ],
                dir=self.basedir,
        ) as arrow_table:
            self._test_render(render, arrow_table=arrow_table)
Esempio n. 7
0
    def test_metadata_does_not_require_file_read(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
            Column("D", ColumnType.Date("month")),
        ]
        with arrow_table_context(
            make_column("A", [1], format="{:,.2f}"),
            make_column("B", [datetime.datetime(2021, 4, 13)]),
            make_column("C", ["c"]),
            make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
        ) as (path, table):
            result = LoadedRenderResult(
                path=path, table=table, columns=columns, errors=[], json={}
            )
            cache_render_result(self.workflow, self.step, 1, result)
        # Delete from disk entirely, to prove we did not read.
        s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_step = Step.objects.get(id=self.step.id)
        cached_result = fresh_step.cached_render_result

        self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
Esempio n. 8
0
 def render(arrow_table, params, output_path, *, columns, **kwargs):
     # Test the "columns" kwarg
     #
     # TODO nix this! The only module that uses it is `converttotext`.
     self.assertEqual(
         columns,
         [
             Column("A", ColumnType.Number("{:,.3f}")),
             Column("B", ColumnType.Number("{:,.3f}")),
             Column("C", ColumnType.Number("{:,.3f}")),
             Column("D", ColumnType.Timestamp()),
             Column("E", ColumnType.Timestamp()),
             Column("F", ColumnType.Timestamp()),
             Column("G", ColumnType.Text()),
             Column("H", ColumnType.Text()),
             Column("I", ColumnType.Text()),
             Column("J", ColumnType.Date(unit="day")),
             Column("K", ColumnType.Date(unit="week")),
             Column("L", ColumnType.Text()),
         ],
     )
     table = pa.table(
         {
             "A": [1],
             "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "C": ["a"],
             "D": [1],
             "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "F": ["a"],
             "G": [1],
             "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "I": ["a"],
             "J": pa.array([date(2021, 4, 1)]),
             "K": pa.array([date(2021, 4, 12)]),
             "L": pa.array([date(2021, 4, 1)]),
         }
     )
     schema = table.schema.set(
         table.schema.get_field_index("J"),
         pa.field("J", pa.date32(), metadata={"unit": "month"}),
     )
     with pa.ipc.RecordBatchFileWriter(output_path, schema) as writer:
         writer.write_table(pa.table(table.columns, schema=schema))
     return []
Esempio n. 9
0
def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    if pyarrow.types.is_floating(column.type) or pyarrow.types.is_integer(
            column.type):
        column_type = ColumnType.Number("{:,}")
    elif pyarrow.types.is_timestamp(column.type):
        column_type = ColumnType.Timestamp()
    elif pyarrow.types.is_string(column.type) or pyarrow.types.is_dictionary(
            column.type):
        column_type = ColumnType.Text()
    else:
        raise RuntimeError("Unknown column type %r" % column.type)
    return Column(name, column_type)
Esempio n. 10
0
    def test_fetch_return_dataframe(self):
        async def fetch(params):
            return pd.DataFrame({"A": ["x", "y"]})

        with tempfile_context(dir=self.basedir) as outfile:
            result = self._test_fetch(fetch, output_filename=outfile.name)

            self.assertEqual(result.errors, [])
            arrow_table = read_parquet_as_arrow(
                outfile, [Column("A", ColumnType.Text())])
            assert_arrow_table_equals(arrow_table,
                                      make_table(make_column("A", ["x", "y"])))
Esempio n. 11
0
    def test_resume_backtrack_on_corrupt_cache_error(self):
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        # step1: cached result is fresh -- but CORRUPT
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(workflow, step1, workflow.last_delta_id,
                             make_table(make_column("A", [1])))
        step1.refresh_from_db()
        s3.put_bytes(
            # Write corrupted data -- will lead to CorruptCacheError
            rendercache.io.BUCKET,
            rendercache.io.crr_parquet_key(step1.cached_render_result),
            b"CORRUPT",
        )
        # step2: no cached result -- must re-render
        step2 = tab.steps.create(order=1, slug="step-2", module_id_name="mod")

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        new_table = make_table(make_column("B", ["b"]))

        with patch.object(Kernel, "render",
                          side_effect=mock_render(new_table)):
            with self._execute(workflow,
                               tab_flow, {},
                               expect_log_level=logging.ERROR) as (result,
                                                                   path):
                self.assertEqual(
                    result, StepResult(path, [Column("B", ColumnType.Text())]))

            self.assertEqual(
                # called with step1, then step2
                Kernel.render.call_count,
                2,
            )
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Esempio n. 12
0
    def test_execute_partial_cache_hit(self):
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        # step1: cached result is fresh. Should not render.
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(workflow, step1, workflow.last_delta_id,
                             make_table(make_column("A", ["a"])))
        # step2: cached result is stale, so must be re-rendered
        step2 = tab.steps.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(
            workflow,
            step2,
            workflow.last_delta_id - 1,
            make_table(make_column("B", ["b"])),
        )

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        new_table = make_table(make_column("C", ["c"]))

        with patch.object(Kernel, "render",
                          side_effect=mock_render(new_table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result, StepResult(path, [Column("C", ColumnType.Text())]))
                assert_arrow_table_equals(load_trusted_arrow_file(path),
                                          new_table)

            Kernel.render.assert_called_once()  # step2, not step1

            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Esempio n. 13
0
    def test_clean_column_prompting_error_convert_to_number(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Text())])
        with self.assertRaises(PromptingError) as cm:
            clean_value(ParamDType.Column(column_types=frozenset({"number"})),
                        "A", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A"], "text",
                                               frozenset({"number"}))
            ],
        )
Esempio n. 14
0
    def test_dict_prompting_error(self):
        input_shape = TableMetadata(
            3,
            [Column("A", ColumnType.Text()),
             Column("B", ColumnType.Text())])
        schema = ParamDType.Dict({
            "col1":
            ParamDType.Column(column_types=frozenset({"number"})),
            "col2":
            ParamDType.Column(column_types=frozenset({"timestamp"})),
        })
        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, {"col1": "A", "col2": "B"}, input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A"], "text",
                                               frozenset({"number"})),
                PromptingError.WrongColumnType(["B"], "text",
                                               frozenset({"timestamp"})),
            ],
        )
Esempio n. 15
0
 def test_invalid_parquet_is_corrupt_cache_error(self):
     with arrow_table_context(make_column("A", ["x"])) as (path, table):
         result = LoadedRenderResult(
             path=path,
             table=table,
             columns=[Column("A", ColumnType.Text())],
             errors=[],
             json={},
         )
         cache_render_result(self.workflow, self.step, 1, result)
     crr = self.step.cached_render_result
     s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
     with tempfile_context() as arrow_path:
         with self.assertRaises(CorruptCacheError):
             with open_cached_render_result(crr) as loaded:
                 pass
Esempio n. 16
0
    def test_input_crr(self, downloaded_parquet_file, clean_value):
        def do_fetch(
            compiled_module,
            chroot_context,
            basedir,
            params,
            secrets,
            last_fetch_result,
            input_parquet_filename,
            output_filename,
        ):
            shutil.copy(basedir / input_parquet_filename,
                        basedir / output_filename)
            return FetchResult(basedir / output_filename)

        self.kernel.fetch.side_effect = do_fetch
        clean_value.return_value = {}

        with tempfile_context(dir=self.basedir,
                              suffix=".parquet") as parquet_path:
            parquet_path.write_bytes(b"abc123")
            downloaded_parquet_file.return_value = parquet_path

            input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
            input_crr = CachedRenderResult(1, 2, 3, "ok", [], {},
                                           input_metadata)
            with self.assertLogs("fetcher.fetch", level=logging.INFO):
                result = fetch.fetch_or_wrap_error(
                    self.ctx,
                    self.chroot_context,
                    self.basedir,
                    "mod",
                    create_module_zipfile("mod"),
                    {},
                    {},
                    None,
                    input_crr,
                    self.output_path,
                )

            # Passed file is downloaded from rendercache
            self.assertEqual(result.path.read_bytes(), b"abc123")
            # clean_value() is called with input metadata from CachedRenderResult
            clean_value.assert_called()
            self.assertEqual(clean_value.call_args[0][2], input_metadata)
Esempio n. 17
0
    def test_execute_cache_miss(self):
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        step2 = tab.steps.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        table = make_table(make_column("A", ["a"]))

        with patch.object(Kernel, "render", side_effect=mock_render(table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result, StepResult(path, [Column("A", ColumnType.Text())]))
                assert_arrow_table_equals(load_trusted_arrow_file(path), table)

            self.assertEqual(Kernel.render.call_count, 2)  # step2, not step1
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Esempio n. 18
0
 def test_input_crr_corrupt_cache_error_is_none(self, downloaded_parquet_file):
     self.kernel.fetch.return_value = FetchResult(self.output_path, [])
     downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
         "file not found"
     )
     input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
     input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
     with self.assertLogs("fetcher.fetch", level=logging.INFO):
         fetch.fetch_or_wrap_error(
             self.ctx,
             self.chroot_context,
             self.basedir,
             "mod",
             create_module_zipfile("mod"),
             {},
             {},
             None,
             input_crr,
             self.output_path,
         )
     # fetch is still called, with `None` as argument.
     self.assertIsNone(self.kernel.fetch.call_args[1]["input_parquet_filename"])
Esempio n. 19
0
    def test_clean_multicolumn_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # ... but for now: prompt for a Quick Fix.
        input_shape = TableMetadata(
            3,
            [
                Column("A", ColumnType.Number()),
                Column("B", ColumnType.Timestamp()),
                Column("C", ColumnType.Text()),
            ],
        )
        with self.assertRaises(PromptingError) as cm:
            schema = ParamDType.Multicolumn(column_types=frozenset({"text"}))
            clean_value(schema, "A,B", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A", "B"], None,
                                               frozenset({"text"}))
            ],
        )
Esempio n. 20
0
def Text(name: str) -> Column:
    return Column(name, ColumnType.Text())
Esempio n. 21
0
 def test_text_ok(self):
     self.assertEqual(read_columns(pa.table({"A": ["x"]})),
                      [Column("A", ColumnType.Text())])
Esempio n. 22
0
 def test_text_dictionary_ok(self):
     self.assertEqual(
         read_columns(pa.table({"A":
                                pa.array(["x"]).dictionary_encode()}), ),
         [Column("A", ColumnType.Text())],
     )
Esempio n. 23
0
def TEXT(name: str):
    return Column(name, ColumnType.Text())
Esempio n. 24
0
 def migrate_params(params):
     return [ColumnType.Text()]
Esempio n. 25
0
    def test_render_arrow_table_infer_output_column_formats_from_input(self):
        input_columns = [
            Column("A", ColumnType.Number("{:,.3f}")),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Number("{:,.3f}")),
            Column("D", ColumnType.Datetime()),
            Column("E", ColumnType.Datetime()),
            Column("F", ColumnType.Datetime()),
            Column("G", ColumnType.Text()),
            Column("H", ColumnType.Text()),
            Column("I", ColumnType.Text()),
        ]
        # The param name "arrow_table" is a special case
        def render(arrow_table, params, output_path, *, columns, **kwargs):
            # Test the "columns" kwarg
            self.assertEqual(columns, input_columns)
            table = pa.table(
                {
                    "A": [1],
                    "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                    "C": ["a"],
                    "D": [1],
                    "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                    "F": ["a"],
                    "G": [1],
                    "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                    "I": ["a"],
                }
            )
            with pa.ipc.RecordBatchFileWriter(output_path, table.schema) as writer:
                writer.write_table(table)
            return []

        with arrow_table_context(
            {
                "A": [1],
                "B": [1],
                "C": [1],
                "D": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "F": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "G": ["a"],
                "H": ["a"],
                "I": ["a"],
            },
            columns=input_columns,
            dir=self.basedir,
        ) as arrow_table:
            result = self._test_render(render, arrow_table=arrow_table)
            self.assertEqual(
                result.table.metadata.columns,
                [
                    Column("A", ColumnType.Number("{:,.3f}")),  # recalled
                    Column("B", ColumnType.Datetime()),  # inferred
                    Column("C", ColumnType.Text()),  # inferred
                    Column("D", ColumnType.Number("{:,}")),  # inferred
                    Column("E", ColumnType.Datetime()),  # recalled
                    Column("F", ColumnType.Text()),  # inferred
                    Column("G", ColumnType.Number("{:,}")),  # inferred
                    Column("H", ColumnType.Datetime()),  # inferred
                    Column("I", ColumnType.Text()),  # recalled
                ],
            )