Esempio n. 1
0
    def test_execute_partial_cache_hit(self, fake_load_module):
        """Only the stale step renders when an earlier step's cache is fresh.

        step1's cached result is stored at the workflow's current delta.
        step2's cached result is stored one delta earlier and the step is then
        bumped to the current delta, so its cache no longer matches. Executing
        the tab must call the module's render exactly once and produce that
        render's output.
        """
        module_spec = {
            "id_name": "mod",
            "name": "Mod",
            "category": "Clean",
            "parameters": [],
        }
        ModuleVersion.create_or_replace_from_spec(module_spec)
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()

        # step1: cache is stored at the current delta, so it must not render.
        step1 = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        fresh_result = RenderResult(arrow_table({"A": [1]}))
        rendercache.cache_render_result(
            workflow, step1, workflow.last_delta_id, fresh_result
        )

        # step2: cache is stored at the previous delta, then the step is moved
        # to the current delta -- the cached result is stale and must re-render.
        stale_delta_id = workflow.last_delta_id - 1
        step2 = tab.wf_modules.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=stale_delta_id,
        )
        stale_result = RenderResult(arrow_table({"B": [2]}))
        rendercache.cache_render_result(workflow, step2, stale_delta_id, stale_result)
        step2.last_relevant_delta_id = workflow.last_delta_id
        step2.save(update_fields=["last_relevant_delta_id"])

        execute_steps = [
            ExecuteStep(step, ParamDType.Dict({}), {}) for step in (step1, step2)
        ]
        tab_flow = TabFlow(tab.to_arrow(), execute_steps)

        render_mock = fake_load_module.return_value.render
        expected = RenderResult(arrow_table({"B": [3]}))
        render_mock.return_value = expected
        with self._execute(workflow, tab_flow, {}) as result:
            assert_render_result_equals(result, expected)

        # Rendered for step2 only; step1 came from the cache.
        render_mock.assert_called_once()
        # The last render wrote to the tab-output file.
        output_filename = render_mock.call_args[1]["output_filename"]
        self.assertRegex(output_filename, r"execute-tab-output.*\.arrow")
Esempio n. 2
0
    def test_resume_backtrack_on_corrupt_cache_error(self, fake_load_module):
        """A corrupt cached result forces the cached step to render again.

        step1 has a cached result at the current delta, but the stored bytes
        are overwritten with garbage. step2 has no cached result. Execution is
        expected to log at ERROR level, call render twice and yield the
        rendered output.
        """
        module_spec = {
            "id_name": "mod",
            "name": "Mod",
            "category": "Clean",
            "parameters": [],
        }
        ModuleVersion.create_or_replace_from_spec(module_spec)
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()

        # step1: cached at the current delta -- but the stored file is CORRUPT.
        step1 = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        cached_result = RenderResult(arrow_table({"A": [1]}))
        rendercache.cache_render_result(
            workflow, step1, workflow.last_delta_id, cached_result
        )
        # Overwrite the cached file with garbage; reading it back will lead to
        # CorruptCacheError.
        bucket = rendercache.io.BUCKET
        parquet_key = rendercache.io.crr_parquet_key(step1.cached_render_result)
        minio.put_bytes(bucket, parquet_key, b"CORRUPT")

        # step2: nothing cached, so it must render.
        step2 = tab.wf_modules.create(order=1, slug="step-2", module_id_name="mod")

        execute_steps = [
            ExecuteStep(step, ParamDType.Dict({}), {}) for step in (step1, step2)
        ]
        tab_flow = TabFlow(tab.to_arrow(), execute_steps)

        render_mock = fake_load_module.return_value.render
        expected = RenderResult(arrow_table({"B": [2]}))
        render_mock.return_value = expected
        with self._execute(
            workflow, tab_flow, {}, expect_log_level=logging.ERROR
        ) as result:
            assert_render_result_equals(result, expected)

        # Rendered once for step1 and once for step2.
        self.assertEqual(render_mock.call_count, 2)
        # The last render wrote to the tab-output file.
        output_filename = render_mock.call_args[1]["output_filename"]
        self.assertRegex(output_filename, r"execute-tab-output.*\.arrow")
Esempio n. 3
0
 def test_clean_normal_dict(self):
     """Cleaning a dict value of a string and an integer returns an equal dict."""
     context = self._render_context()
     dict_dtype = ParamDType.Dict(
         {
             "str": ParamDType.String(),
             "int": ParamDType.Integer(),
         }
     )
     raw_value = {"str": "foo", "int": 3}
     # Snapshot before cleaning: the cleaned output should equal the input.
     expected = raw_value.copy()
     cleaned = clean_value(dict_dtype, raw_value, context)
     self.assertEqual(cleaned, expected)
 def test_clean_normal_dict(self):
     """Cleaning a dict value against a table shape returns an equal dict."""
     # NOTE(review): a method with this same name is defined directly above in
     # this file; if both belong to one class, only this later one would run.
     number_column = Column("A", ColumnType.Number())
     input_shape = TableMetadata(3, [number_column])
     dict_dtype = ParamDType.Dict(
         {"str": ParamDType.String(), "int": ParamDType.Integer()}
     )
     raw_value = {"str": "foo", "int": 3}
     # Snapshot before cleaning: the cleaned output should equal the input.
     expected = raw_value.copy()
     cleaned = clean_value(dict_dtype, raw_value, input_shape)
     self.assertEqual(cleaned, expected)
Esempio n. 5
0
    def test_execute_cache_miss(self, fake_load_module):
        """With nothing cached, every step in the tab is rendered.

        Neither step has a cached render result stored by this test, so render
        is expected to be called once per step and the tab's result is the
        rendered output.
        """
        module_spec = {
            "id_name": "mod",
            "name": "Mod",
            "category": "Clean",
            "parameters": [],
        }
        ModuleVersion.create_or_replace_from_spec(module_spec)
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step1 = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        step2 = tab.wf_modules.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )

        execute_steps = [
            ExecuteStep(step, ParamDType.Dict({}), {}) for step in (step1, step2)
        ]
        tab_flow = TabFlow(tab.to_arrow(), execute_steps)

        render_mock = fake_load_module.return_value.render
        expected = RenderResult(arrow_table({"B": [2]}))
        render_mock.return_value = expected
        with self._execute(workflow, tab_flow, {}) as result:
            assert_render_result_equals(result, expected)

        # Both step1 and step2 are rendered: there is no cache to reuse.
        self.assertEqual(render_mock.call_count, 2)
        # The last render wrote to the tab-output file.
        output_filename = render_mock.call_args[1]["output_filename"]
        self.assertRegex(output_filename, r"execute-tab-output.*\.arrow")
Esempio n. 6
0
    def test_execute_cache_hit(self, fake_module):
        """When every step is cached at the current delta, nothing is rendered.

        Both steps get a cached result stored at the workflow's current delta.
        The tab's result must equal step2's cached table and the patched
        module must never be called.
        """
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()

        step1 = tab.wf_modules.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        step1_cached = RenderResult(arrow_table({"A": [1]}))
        rendercache.cache_render_result(
            workflow, step1, workflow.last_delta_id, step1_cached
        )

        step2 = tab.wf_modules.create(
            order=1,
            slug="step-2",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        step2_cached = RenderResult(arrow_table({"B": [2]}))
        rendercache.cache_render_result(
            workflow, step2, workflow.last_delta_id, step2_cached
        )

        execute_steps = [
            ExecuteStep(step, ParamDType.Dict({}), {}) for step in (step1, step2)
        ]
        tab_flow = TabFlow(tab.to_arrow(), execute_steps)

        with self._execute(workflow, tab_flow, {}) as result:
            expected = RenderResult(arrow_table({"B": [2]}), [])
            assert_render_result_equals(result, expected)

        # Everything came from the cache.
        fake_module.assert_not_called()
Esempio n. 7
0
    def test_dict_prompting_error_concatenate_same_type(self):
        """Two wrong-type columns of the same type share one prompting error.

        Both params require number columns but point at text columns, so the
        expected error list holds a single WrongColumnType naming "A" and "B".
        """
        text_table = arrow_table({"A": ["1"], "B": ["2"]})
        context = self._render_context(input_table=text_table)
        number_only = frozenset({"number"})
        schema = ParamDType.Dict(
            {
                "x": ParamDType.Column(column_types=number_only),
                "y": ParamDType.Column(column_types=number_only),
            }
        )
        params = {"x": "A", "y": "B"}

        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, params, context)

        expected_errors = [
            PromptingError.WrongColumnType(["A", "B"], "text", frozenset({"number"}))
        ]
        self.assertEqual(cm.exception.errors, expected_errors)
Esempio n. 8
0
 def test_clean_multicolumn_from_other_tab_that_does_not_exist(self):
     """A multicolumn bound to a missing tab is cleaned to an empty list."""
     # The referenced tab can be missing if the user selected it and then
     # deleted it.
     schema = ParamDType.Dict(
         {
             "tab": ParamDType.Tab(),
             "columns": ParamDType.Multicolumn(tab_parameter="tab"),
         }
     )
     params = {"tab": "tab-missing", "columns": ["A-from-tab-1"]}
     this_tab_table = arrow_table({"A-from-tab-1": [1]})
     context = self._render_context(
         input_table=this_tab_table,
         tab_results={},
         params=params,
     )

     cleaned = clean_value(schema, params, context)

     # Only "columns" is under test; cleaned["tab"] is not checked here.
     self.assertEqual(cleaned["columns"], [])
Esempio n. 9
0
 def test_map_parse(self):
     """Parsing a "map" spec with a nested "dict" value builds the Map dtype."""
     spec = {
         "type": "map",
         "value_dtype": {
             "type": "dict",  # exercises nesting
             "properties": {"foo": {"type": "string"}},
         },
     }
     parsed = ParamDType.parse(spec)

     expected = ParamDType.Map(
         value_dtype=ParamDType.Dict(properties={"foo": ParamDType.String()})
     )
     # The two dtypes are compared through their repr() strings.
     self.assertEqual(repr(parsed), repr(expected))
Esempio n. 10
0
    def test_clean_multicolumn_from_other_tab(self):
        """A multicolumn bound to another tab keeps only that tab's columns.

        The selection lists one column from the input table and one from
        tab-2's output; only the tab-2 column is expected to survive cleaning.
        """
        other_tab = Tab("tab-2", "Tab 2")
        other_tab_table = arrow_table({"A-from-tab-2": [1, 2]})

        schema = ParamDType.Dict(
            {
                "tab": ParamDType.Tab(),
                "columns": ParamDType.Multicolumn(tab_parameter="tab"),
            }
        )
        params = {"tab": "tab-2", "columns": ["A-from-tab-1", "A-from-tab-2"]}
        this_tab_table = arrow_table({"A-from-tab-1": [1]})
        context = self._render_context(
            input_table=this_tab_table,
            tab_results={other_tab: RenderResult(other_tab_table)},
            params=params,
        )

        cleaned = clean_value(schema, params, context)

        # Only "columns" is under test; cleaned["tab"] is not checked here.
        self.assertEqual(cleaned["columns"], ["A-from-tab-2"])
    def test_dict_prompting_error(self):
        """Each wrong-type column in a dict contributes its own prompting error.

        "col1" wants a number column and "col2" wants a datetime column, but
        both point at text columns; one WrongColumnType is expected per param.
        """
        text_columns = [
            Column("A", ColumnType.Text()),
            Column("B", ColumnType.Text()),
        ]
        input_shape = TableMetadata(3, text_columns)
        schema = ParamDType.Dict(
            {
                "col1": ParamDType.Column(column_types=frozenset({"number"})),
                "col2": ParamDType.Column(column_types=frozenset({"datetime"})),
            }
        )
        params = {"col1": "A", "col2": "B"}

        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, params, input_shape)

        expected_errors = [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(["B"], "text", frozenset({"datetime"})),
        ]
        self.assertEqual(cm.exception.errors, expected_errors)
Esempio n. 12
0
    def test_dict_prompting_error_concatenate_different_types(self):
        """Wrong-type columns of differing types are reported separately.

        Both params require number columns; "A" is text and "B" is a
        timestamp column, so two distinct WrongColumnType errors are expected.
        """
        timestamp_column = pa.array([datetime.now()], pa.timestamp("ns"))
        mixed_table = arrow_table({"A": ["1"], "B": timestamp_column})
        context = self._render_context(input_table=mixed_table)
        schema = ParamDType.Dict(
            {
                "x": ParamDType.Column(column_types=frozenset({"number"})),
                "y": ParamDType.Column(column_types=frozenset({"number"})),
            }
        )
        params = {"x": "A", "y": "B"}

        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, params, context)

        expected_errors = [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(["B"], "datetime", frozenset({"number"})),
        ]
        self.assertEqual(cm.exception.errors, expected_errors)