def test_execute_partial_cache_hit(self, fake_load_module):
    """Execute a two-step tab where only the second step's cache is stale.

    The first step's cached result is stored for the workflow's current
    delta; the second step's cached result is stored for the previous delta
    before the step is bumped to the current one. The test expects exactly
    one ``render()`` call, writing to the tab-output file.
    """
    render = fake_load_module.return_value.render

    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()

    # First step: its cached result is fresh, so it should not render.
    fresh_step = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        fresh_step,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )

    # Second step: cache a result for the *previous* delta, then move the
    # step to the current delta so the cached result is stale.
    stale_step = tab.wf_modules.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id - 1,
    )
    rendercache.cache_render_result(
        workflow,
        stale_step,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"B": [2]})),
    )
    stale_step.last_relevant_delta_id = workflow.last_delta_id
    stale_step.save(update_fields=["last_relevant_delta_id"])

    flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step, ParamDType.Dict({}), {})
            for step in (fresh_step, stale_step)
        ],
    )

    rendered = RenderResult(arrow_table({"B": [3]}))
    render.return_value = rendered

    with self._execute(workflow, flow, {}) as result:
        assert_render_result_equals(result, rendered)

    # Only the stale (second) step is rendered; the fresh one is skipped.
    render.assert_called_once()
    # The render output goes to the tab-output file.
    _, render_kwargs = render.call_args
    self.assertRegex(
        render_kwargs["output_filename"], r"execute-tab-output.*\.arrow"
    )
def test_resume_backtrack_on_corrupt_cache_error(self, fake_load_module):
    """Execute a tab whose first step has a fresh but corrupted cache file.

    The cached Parquet data for step-1 is overwritten with garbage and
    step-2 has no cached result. The test expects an ERROR-level log and
    two ``render()`` calls, the last one writing to the tab-output file.
    """
    render = fake_load_module.return_value.render

    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()

    # First step: cached result is fresh -- but CORRUPT.
    corrupt_step = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        corrupt_step,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    # Overwrite the stored file with junk -- will lead to CorruptCacheError.
    corrupt_key = rendercache.io.crr_parquet_key(corrupt_step.cached_render_result)
    minio.put_bytes(rendercache.io.BUCKET, corrupt_key, b"CORRUPT")

    # Second step: no cached result -- must re-render.
    uncached_step = tab.wf_modules.create(
        order=1, slug="step-2", module_id_name="mod"
    )

    flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step, ParamDType.Dict({}), {})
            for step in (corrupt_step, uncached_step)
        ],
    )

    rendered = RenderResult(arrow_table({"B": [2]}))
    render.return_value = rendered

    with self._execute(
        workflow, flow, {}, expect_log_level=logging.ERROR
    ) as result:
        assert_render_result_equals(result, rendered)

    # Called with step-1, then step-2.
    self.assertEqual(render.call_count, 2)
    # The render output goes to the tab-output file.
    self.assertRegex(
        render.call_args[1]["output_filename"], r"execute-tab-output.*\.arrow"
    )
def test_clean_normal_dict(self):
    """A dict of plain string/integer params is expected to clean to itself."""
    context = self._render_context()
    dtype = ParamDType.Dict(
        {
            "str": ParamDType.String(),
            "int": ParamDType.Integer(),
        }
    )
    raw = {"str": "foo", "int": 3}

    cleaned = clean_value(dtype, raw, context)

    # Cleaning is a no-op here: compare against an independent literal.
    self.assertEqual(cleaned, {"str": "foo", "int": 3})
def test_clean_normal_dict(self):
    """A dict of plain string/integer params is expected to clean to itself.

    This variant passes table metadata (3 rows, one number column "A") as
    the third argument to ``clean_value``.
    """
    number_column = Column("A", ColumnType.Number())
    input_shape = TableMetadata(3, [number_column])
    dtype = ParamDType.Dict(
        {
            "str": ParamDType.String(),
            "int": ParamDType.Integer(),
        }
    )
    raw = {"str": "foo", "int": 3}

    cleaned = clean_value(dtype, raw, input_shape)

    # Cleaning is a no-op here: compare against an independent literal.
    self.assertEqual(cleaned, {"str": "foo", "int": 3})
def test_execute_cache_miss(self, fake_load_module):
    """Execute a two-step tab for which no render results were cached.

    Neither step has a result stored through ``rendercache``, and the test
    expects ``render()`` to be called twice, the last call writing to the
    tab-output file.
    """
    render = fake_load_module.return_value.render

    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()

    first_step = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    second_step = tab.wf_modules.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )

    flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step, ParamDType.Dict({}), {})
            for step in (first_step, second_step)
        ],
    )

    rendered = RenderResult(arrow_table({"B": [2]}))
    render.return_value = rendered

    with self._execute(workflow, flow, {}) as result:
        assert_render_result_equals(result, rendered)

    # Two render calls are expected: one per step, since nothing is cached.
    self.assertEqual(render.call_count, 2)
    # The render output goes to the tab-output file.
    _, render_kwargs = render.call_args
    self.assertRegex(
        render_kwargs["output_filename"], r"execute-tab-output.*\.arrow"
    )
def test_execute_cache_hit(self, fake_module):
    """Execute a two-step tab where both steps have fresh cached results.

    The result is expected to equal step-2's cached table, and the patched
    ``fake_module`` must never be called.
    """
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()

    first_step = tab.wf_modules.create(
        order=0, slug="step-1", last_relevant_delta_id=workflow.last_delta_id
    )
    first_cached = RenderResult(arrow_table({"A": [1]}))
    rendercache.cache_render_result(
        workflow, first_step, workflow.last_delta_id, first_cached
    )

    second_step = tab.wf_modules.create(
        order=1, slug="step-2", last_relevant_delta_id=workflow.last_delta_id
    )
    second_cached = RenderResult(arrow_table({"B": [2]}))
    rendercache.cache_render_result(
        workflow, second_step, workflow.last_delta_id, second_cached
    )

    flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step, ParamDType.Dict({}), {})
            for step in (first_step, second_step)
        ],
    )

    with self._execute(workflow, flow, {}) as result:
        # Compare against a freshly-built result with an explicit empty
        # second argument, as a separate object from what was cached.
        wanted = RenderResult(arrow_table({"B": [2]}), [])
        assert_render_result_equals(result, wanted)

    fake_module.assert_not_called()
def test_dict_prompting_error_concatenate_same_type(self):
    """Two text columns given to number-only params yield one combined error.

    The expected ``PromptingError`` carries a single ``WrongColumnType``
    listing both "A" and "B".
    """
    context = self._render_context(
        input_table=arrow_table({"A": ["1"], "B": ["2"]})
    )
    dtype = ParamDType.Dict(
        {
            "x": ParamDType.Column(column_types=frozenset({"number"})),
            "y": ParamDType.Column(column_types=frozenset({"number"})),
        }
    )
    raw = {"x": "A", "y": "B"}

    with self.assertRaises(PromptingError) as raised:
        clean_value(dtype, raw, context)

    wanted_errors = [
        PromptingError.WrongColumnType(["A", "B"], "text", frozenset({"number"}))
    ]
    self.assertEqual(raised.exception.errors, wanted_errors)
def test_clean_multicolumn_from_other_tab_that_does_not_exist(self):
    """A multicolumn bound to a tab absent from ``tab_results`` cleans to [].

    The other tab would not exist if the user selected and then deleted it.
    """
    params = {"tab": "tab-missing", "columns": ["A-from-tab-1"]}
    dtype = ParamDType.Dict(
        {
            "tab": ParamDType.Tab(),
            "columns": ParamDType.Multicolumn(tab_parameter="tab"),
        }
    )
    context = self._render_context(
        input_table=arrow_table({"A-from-tab-1": [1]}),
        tab_results={},
        params=params,
    )

    cleaned = clean_value(dtype, params, context)

    # cleaned["tab"] is not what we're testing here.
    self.assertEqual(cleaned["columns"], [])
def test_map_parse(self):
    """Parsing a "map" spec with a nested "dict" value dtype.

    The parsed dtype is compared by ``repr()`` against a hand-built
    ``ParamDType.Map`` wrapping a ``ParamDType.Dict``.
    """
    spec = {
        "type": "map",
        "value_dtype": {
            "type": "dict",  # test nesting
            "properties": {"foo": {"type": "string"}},
        },
    }
    parsed = ParamDType.parse(spec)

    wanted = ParamDType.Map(
        value_dtype=ParamDType.Dict(properties={"foo": ParamDType.String()})
    )
    self.assertEqual(repr(parsed), repr(wanted))
def test_clean_multicolumn_from_other_tab(self):
    """A multicolumn bound to another tab keeps only that tab's columns.

    "A-from-tab-1" exists only in the input table and "A-from-tab-2" only
    in tab-2's output; the cleaned value is expected to be
    ``["A-from-tab-2"]``.
    """
    other_tab = Tab("tab-2", "Tab 2")
    other_tab_table = arrow_table({"A-from-tab-2": [1, 2]})

    params = {"tab": "tab-2", "columns": ["A-from-tab-1", "A-from-tab-2"]}
    dtype = ParamDType.Dict(
        {
            "tab": ParamDType.Tab(),
            "columns": ParamDType.Multicolumn(tab_parameter="tab"),
        }
    )
    context = self._render_context(
        input_table=arrow_table({"A-from-tab-1": [1]}),
        tab_results={other_tab: RenderResult(other_tab_table)},
        params=params,
    )

    cleaned = clean_value(dtype, params, context)

    # cleaned["tab"] is not what we're testing here.
    self.assertEqual(cleaned["columns"], ["A-from-tab-2"])
def test_dict_prompting_error(self):
    """Wrong-typed columns in two dict entries yield two separate errors.

    Both input columns are text; "col1" wants a number column and "col2"
    wants a datetime column, so one ``WrongColumnType`` per entry is
    expected, in that order.
    """
    text_columns = [Column("A", ColumnType.Text()), Column("B", ColumnType.Text())]
    input_shape = TableMetadata(3, text_columns)
    dtype = ParamDType.Dict(
        {
            "col1": ParamDType.Column(column_types=frozenset({"number"})),
            "col2": ParamDType.Column(column_types=frozenset({"datetime"})),
        }
    )
    raw = {"col1": "A", "col2": "B"}

    with self.assertRaises(PromptingError) as raised:
        clean_value(dtype, raw, input_shape)

    wanted_errors = [
        PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
        PromptingError.WrongColumnType(["B"], "text", frozenset({"datetime"})),
    ]
    self.assertEqual(raised.exception.errors, wanted_errors)
def test_dict_prompting_error_concatenate_different_types(self):
    """Columns with different wrong types yield separate errors.

    "A" is a text column and "B" a nanosecond-timestamp column; both are
    given to number-only params, so one ``WrongColumnType`` per column is
    expected ("text" for A, "datetime" for B).
    """
    timestamps = pa.array([datetime.now()], pa.timestamp("ns"))
    context = self._render_context(
        input_table=arrow_table({"A": ["1"], "B": timestamps})
    )
    dtype = ParamDType.Dict(
        {
            "x": ParamDType.Column(column_types=frozenset({"number"})),
            "y": ParamDType.Column(column_types=frozenset({"number"})),
        }
    )
    raw = {"x": "A", "y": "B"}

    with self.assertRaises(PromptingError) as raised:
        clean_value(dtype, raw, context)

    wanted_errors = [
        PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
        PromptingError.WrongColumnType(["B"], "datetime", frozenset({"number"})),
    ]
    self.assertEqual(raised.exception.errors, wanted_errors)