def test_clean_normal_dict(self):
    """A Dict dtype cleans each property; already-valid values are a no-op."""
    context = self._render_context()
    schema = ParamDType.Dict(
        {"str": ParamDType.String(), "int": ParamDType.Integer()}
    )
    params = {"str": "foo", "int": 3}
    self.assertEqual(clean_value(schema, params, context), dict(params))
def test_clean_normal_dict(self):
    """Dict schema passes already-clean str/int values through untouched."""
    # NOTE(review): an earlier test in this file shares this method name;
    # if both live in the same TestCase class, Python silently keeps only
    # the later definition -- confirm they belong to different classes.
    input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
    schema = ParamDType.Dict(
        {"str": ParamDType.String(), "int": ParamDType.Integer()}
    )
    params = {"str": "foo", "int": 3}
    self.assertEqual(clean_value(schema, params, input_shape), dict(params))
def test_execute_partial_cache_hit(self, fake_load_module):
    """Only steps whose cached render result is stale get re-rendered."""
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()

    # step1: cache entry matches last_relevant_delta_id => fresh, no render.
    step1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )

    # step2: cache entry was written for an older delta => stale, must render.
    step2 = tab.wf_modules.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id - 1,
    )
    rendercache.cache_render_result(
        workflow,
        step2,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"B": [2]})),
    )
    step2.last_relevant_delta_id = workflow.last_delta_id
    step2.save(update_fields=["last_relevant_delta_id"])

    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, ParamDType.Dict({}), {}),
            ExecuteStep(step2, ParamDType.Dict({}), {}),
        ],
    )

    expected = RenderResult(arrow_table({"B": [3]}))
    fake_load_module.return_value.render.return_value = expected

    with self._execute(workflow, tab_flow, {}) as result:
        assert_render_result_equals(result, expected)

    # Exactly one render call: step2, not step1 (step1's cache was fresh).
    fake_load_module.return_value.render.assert_called_once()
    self.assertRegex(
        # The render must be directed at the tab-output file.
        fake_load_module.return_value.render.call_args[1]["output_filename"],
        r"execute-tab-output.*\.arrow",
    )
def test_resume_backtrack_on_corrupt_cache_error(self, fake_load_module):
    """A corrupt cache entry forces a re-render of that step and the next."""
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()

    # step1: cached result is fresh -- but its stored parquet is garbage.
    step1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    # Overwrite the cached parquet with junk => reading it raises
    # CorruptCacheError.
    minio.put_bytes(
        rendercache.io.BUCKET,
        rendercache.io.crr_parquet_key(step1.cached_render_result),
        b"CORRUPT",
    )

    # step2: never cached -- must render regardless.
    step2 = tab.wf_modules.create(order=1, slug="step-2", module_id_name="mod")

    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, ParamDType.Dict({}), {}),
            ExecuteStep(step2, ParamDType.Dict({}), {}),
        ],
    )

    expected = RenderResult(arrow_table({"B": [2]}))
    fake_load_module.return_value.render.return_value = expected

    with self._execute(
        workflow, tab_flow, {}, expect_log_level=logging.ERROR
    ) as result:
        assert_render_result_equals(result, expected)

    # Both steps rendered: step1 (corrupt cache) and then step2.
    self.assertEqual(fake_load_module.return_value.render.call_count, 2)
    self.assertRegex(
        # The render must be directed at the tab-output file.
        fake_load_module.return_value.render.call_args[1]["output_filename"],
        r"execute-tab-output.*\.arrow",
    )
def test_list_prompting_error_concatenate_different_type_to_text(self):
    """Differently-typed columns failing a text check share one merged error."""
    context = self._render_context(
        input_table=arrow_table({"A": [1], "B": [datetime.now()]})
    )
    schema = ParamDType.List(
        inner_dtype=ParamDType.Column(column_types=frozenset({"text"}))
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, ["A", "B"], context)
    # Found-type is None: the two offending columns have differing types
    # (number and datetime), so no single found-type applies.
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A", "B"], None, frozenset({"text"}))],
    )
def test_list_prompting_error_concatenate_same_type(self):
    """Two same-typed offending columns collapse into one WrongColumnType."""
    context = self._render_context(
        input_table=arrow_table({"A": ["1"], "B": ["2"]})
    )
    schema = ParamDType.List(
        inner_dtype=ParamDType.Column(column_types=frozenset({"number"}))
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, ["A", "B"], context)
    # Both offenders are text, so "text" is reported once for the pair.
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A", "B"], "text", frozenset({"number"}))],
    )
def test_clean_multicolumn_sort_in_table_order(self):
    """Multicolumn values are reordered to match the table's column order."""
    input_shape = TableMetadata(
        3,
        [Column("B", ColumnType.Number()), Column("A", ColumnType.Number())],
    )
    cleaned = clean_value(ParamDType.Multicolumn(), ["A", "B"], input_shape)
    self.assertEqual(cleaned, ["B", "A"])
def test_clean_multichartseries_non_number_is_prompting_error(self):
    """Each non-number series column yields its own WrongColumnType error."""
    context = self._render_context(
        input_table=arrow_table(
            {"A": ["a"], "B": pa.array([datetime.now()], pa.timestamp("ns"))}
        )
    )
    series = [
        {"column": "A", "color": "#aaaaaa"},
        {"column": "B", "color": "#cccccc"},
    ]
    with self.assertRaises(PromptingError) as cm:
        clean_value(ParamDType.Multichartseries(), series, context)
    # One error per found-type: "A" is text, "B" is datetime.
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(["B"], "datetime", frozenset({"number"})),
        ],
    )
def test_clean_float_with_int_value(self):
    """Float dtype coerces int values to float.

    ParamDType.Float can receive `int` values because they come from
    json.parse(), which only gives Numbers and so may give "3" instead of
    "3.0". The cleaned param must nonetheless be a real `float`.
    """
    cleaned = clean_value(ParamDType.Float(), 3, None)
    self.assertEqual(cleaned, 3.0)
    self.assertIsInstance(cleaned, float)
def test_clean_file_happy_path(self):
    """ParamDType.File resolves a UUID to a downloaded, suffixed temp file
    that is deleted when the render context's exit stack closes."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.wf_modules.create(module_id_name="uploadfile", order=0, slug="step-1")
    file_uuid = str(uuid.uuid4())  # renamed from `id`: don't shadow the builtin
    # Bugfix: the original f-string used JS-style `${...}`, which left
    # literal `$` characters in the key instead of interpolating.
    key = f"wf-{workflow.id}/wfm-{step.id}/{file_uuid}"
    minio.put_bytes(minio.UserFilesBucket, key, b"1234")
    UploadedFile.objects.create(
        wf_module=step,
        name="x.csv.gz",
        size=4,
        uuid=file_uuid,
        bucket=minio.UserFilesBucket,
        key=key,
    )
    with ExitStack() as inner_stack:
        context = self._render_context(wf_module_id=step.id, exit_stack=inner_stack)
        result: Path = clean_value(ParamDType.File(), file_uuid, context)
        self.assertIsInstance(result, Path)
        self.assertEqual(result.read_bytes(), b"1234")
        # Both suffixes of "x.csv.gz" must survive the round trip.
        self.assertEqual(result.suffixes, [".csv", ".gz"])
    # Once `inner_stack` goes out of scope, the file must be deleted.
    self.assertFalse(result.exists())
def test_clean_column_happy_path(self):
    """A column whose type is allowed passes through unchanged."""
    input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
    schema = ParamDType.Column(column_types=frozenset({"number"}))
    self.assertEqual(clean_value(schema, "A", input_shape), "A")
def test_clean_multicolumn_sort_in_table_order(self):
    """Selected columns come back in table order, not selection order."""
    # NOTE(review): an earlier test in this file shares this method name;
    # if both live in the same TestCase class, Python silently keeps only
    # the later definition -- confirm they belong to different classes.
    context = self._render_context(input_table=arrow_table({"B": [1], "A": [2]}))
    cleaned = clean_value(ParamDType.Multicolumn(), ["A", "B"], context)
    self.assertEqual(cleaned, ["B", "A"])
def test_clean_file_wrong_wf_module(self):
    """A file owned by a different step cleans to None, leaving no temp file."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.wf_modules.create(module_id_name="uploadfile", order=0, slug="step-1")
    step2 = tab.wf_modules.create(module_id_name="uploadfile", order=1, slug="step-2")
    file_uuid = str(uuid.uuid4())  # renamed from `id`: don't shadow the builtin
    # Bugfix: the original f-string used JS-style `${...}`, which left
    # literal `$` characters in the key instead of interpolating.
    key = f"wf-{workflow.id}/wfm-{step.id}/{file_uuid}"
    minio.put_bytes(minio.UserFilesBucket, key, b"1234")
    # The UploadedFile record belongs to step2...
    UploadedFile.objects.create(
        wf_module=step2,
        name="x.csv.gz",
        size=4,
        uuid=file_uuid,
        bucket=minio.UserFilesBucket,
        key=key,
    )
    # ...but we clean params for step, so the lookup must come up empty.
    context = self._render_context(wf_module_id=step.id)
    result = clean_value(ParamDType.File(), file_uuid, context)
    self.assertIsNone(result)
    # If a temporary file was created to house the download, it must no
    # longer exist.
    self.assertListEqual(list(self.basedir.iterdir()), [])
def test_clean_multicolumn_missing_is_removed(self):
    """Column names absent from the table are silently dropped."""
    context = self._render_context(input_table=arrow_table({"A": [1], "B": [1]}))
    cleaned = clean_value(ParamDType.Multicolumn(), ["A", "X", "B"], context)
    self.assertEqual(cleaned, ["A", "B"])
def test_clean_multichartseries_missing_is_removed(self):
    """Series pointing at nonexistent columns are dropped from the list."""
    context = self._render_context(input_table=arrow_table({"A": [1], "B": [1]}))
    series = [
        {"column": "A", "color": "#aaaaaa"},
        {"column": "C", "color": "#cccccc"},  # "C" is not in the table
    ]
    cleaned = clean_value(ParamDType.Multichartseries(), series, context)
    self.assertEqual(cleaned, [{"column": "A", "color": "#aaaaaa"}])
def test_clean_multicolumn_missing_is_removed(self):
    """Nonexistent column names disappear from the cleaned selection."""
    # NOTE(review): an earlier test in this file shares this method name;
    # if both live in the same TestCase class, Python silently keeps only
    # the later definition -- confirm they belong to different classes.
    input_shape = TableMetadata(
        3,
        [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())],
    )
    cleaned = clean_value(ParamDType.Multicolumn(), ["A", "X", "B"], input_shape)
    self.assertEqual(cleaned, ["A", "B"])
def test_map_parse(self):
    """ParamDType.parse builds a Map dtype, recursing into its value dtype."""
    dtype = ParamDType.parse(
        {
            "type": "map",
            "value_dtype": {
                "type": "dict",  # nested dtype exercises the recursion
                "properties": {"foo": {"type": "string"}},
            },
        }
    )
    expected = ParamDType.Map(
        value_dtype=ParamDType.Dict(properties={"foo": ParamDType.String()})
    )
    # Compare reprs -- presumably dtypes lack __eq__, so repr equality
    # stands in for structural equality; confirm against ParamDType.
    self.assertEqual(repr(dtype), repr(expected))
def test_clean_tab_missing_tab_selected_gives_none(self):
    """
    If the user has selected a nonexistent tab, pretend tab is blank.

    JS sees nonexistent tab slugs. render() doesn't.
    """
    context = self._render_context(tab_results={})
    self.assertIsNone(clean_value(ParamDType.Tab(), "tab-XXX", context))
def test_clean_multicolumn_from_other_tab_that_does_not_exist(self):
    """Columns referencing a missing tab clean to an empty list.

    The other tab would not exist if the user selected and then deleted it.
    """
    schema = ParamDType.Dict(
        {
            "tab": ParamDType.Tab(),
            "columns": ParamDType.Multicolumn(tab_parameter="tab"),
        }
    )
    params = {"tab": "tab-missing", "columns": ["A-from-tab-1"]}
    context = self._render_context(
        input_table=arrow_table({"A-from-tab-1": [1]}),
        tab_results={},  # "tab-missing" is nowhere to be found
        params=params,
    )
    cleaned = clean_value(schema, params, context)
    # cleaned["tab"] is not what we're testing here -- only the columns.
    self.assertEqual(cleaned["columns"], [])
def test_clean_column_prompting_error_convert_to_number(self):
    """A text column where number is required raises a PromptingError."""
    context = self._render_context(input_table=arrow_table({"A": ["1"]}))
    schema = ParamDType.Column(column_types=frozenset({"number"}))
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, "A", context)
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A"], "text", frozenset({"number"}))],
    )
def test_clean_multicolumn_from_other_tab(self):
    """tab_parameter validates columns against the other tab's output."""
    tab2 = Tab("tab-2", "Tab 2")
    tab2_output_table = arrow_table({"A-from-tab-2": [1, 2]})
    schema = ParamDType.Dict(
        {
            "tab": ParamDType.Tab(),
            "columns": ParamDType.Multicolumn(tab_parameter="tab"),
        }
    )
    params = {"tab": "tab-2", "columns": ["A-from-tab-1", "A-from-tab-2"]}
    context = self._render_context(
        input_table=arrow_table({"A-from-tab-1": [1]}),
        tab_results={tab2: RenderResult(tab2_output_table)},
        params=params,
    )
    cleaned = clean_value(schema, params, context)
    # cleaned["tab"] is not what we're testing here. The columns must be
    # filtered against tab-2's output, not this tab's input table.
    self.assertEqual(cleaned["columns"], ["A-from-tab-2"])
def test_clean_file_no_uploaded_file(self):
    """An unknown file UUID cleans to None and leaves no temp file behind."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wfm = tab.wf_modules.create(module_id_name="uploadfile", order=0, slug="step-1")
    context = self._render_context(wf_module_id=wfm.id)
    result = clean_value(ParamDType.File(), str(uuid.uuid4()), context)
    self.assertIsNone(result)
    # If a temporary file was created to house the download, it must no
    # longer exist.
    self.assertListEqual(list(self.basedir.iterdir()), [])
def test_dict_prompting_error(self):
    """Errors from every Dict property are gathered into one PromptingError."""
    context = self._render_context(
        input_table=arrow_table({"A": ["a"], "B": ["b"]})
    )
    schema = ParamDType.Dict(
        {
            "col1": ParamDType.Column(column_types=frozenset({"number"})),
            "col2": ParamDType.Column(column_types=frozenset({"datetime"})),
        }
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, {"col1": "A", "col2": "B"}, context)
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(["B"], "text", frozenset({"datetime"})),
        ],
    )
def test_list_prompting_error_concatenate_different_type(self):
    """Differently-typed offenders yield one WrongColumnType per found type."""
    context = self._render_context(
        input_table=arrow_table(
            {"A": ["1"], "B": pa.array([datetime.now()], pa.timestamp("ns"))}
        )
    )
    schema = ParamDType.List(
        inner_dtype=ParamDType.Column(column_types=frozenset({"number"}))
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, ["A", "B"], context)
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(["B"], "datetime", frozenset({"number"})),
        ],
    )
def test_execute_cache_miss(self, fake_load_module):
    """With no cached render results, every step must be rendered."""
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    step2 = tab.wf_modules.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, ParamDType.Dict({}), {}),
            ExecuteStep(step2, ParamDType.Dict({}), {}),
        ],
    )

    expected = RenderResult(arrow_table({"B": [2]}))
    fake_load_module.return_value.render.return_value = expected

    with self._execute(workflow, tab_flow, {}) as result:
        assert_render_result_equals(result, expected)

    # Comment fix: both step1 and step2 render on a cache miss. (The old
    # inline note, "step2, not step1", was copy-pasted from the
    # partial-cache-hit test, where only one render happens.)
    self.assertEqual(fake_load_module.return_value.render.call_count, 2)
    self.assertRegex(
        # Output is to the correct file
        fake_load_module.return_value.render.call_args[1]["output_filename"],
        r"execute-tab-output.*\.arrow",
    )
def test_clean_column_prompting_error_convert_to_number(self):
    """A text column failing a number requirement raises PromptingError."""
    # NOTE(review): an earlier test in this file shares this method name;
    # if both live in the same TestCase class, Python silently keeps only
    # the later definition -- confirm they belong to different classes.
    input_shape = TableMetadata(3, [Column("A", ColumnType.Text())])
    schema = ParamDType.Column(column_types=frozenset({"number"}))
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, "A", input_shape)
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A"], "text", frozenset({"number"}))],
    )
def test_dict_prompting_error(self):
    """Each failing Dict property contributes its own WrongColumnType."""
    # NOTE(review): an earlier test in this file shares this method name;
    # if both live in the same TestCase class, Python silently keeps only
    # the later definition -- confirm they belong to different classes.
    input_shape = TableMetadata(
        3, [Column("A", ColumnType.Text()), Column("B", ColumnType.Text())]
    )
    schema = ParamDType.Dict(
        {
            "col1": ParamDType.Column(column_types=frozenset({"number"})),
            "col2": ParamDType.Column(column_types=frozenset({"datetime"})),
        }
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, {"col1": "A", "col2": "B"}, input_shape)
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(["B"], "text", frozenset({"datetime"})),
        ],
    )
def test_clean_multicolumn_prompting_error_convert_to_text(self):
    """Non-text columns in a text-only multicolumn prompt for a Quick Fix."""
    # TODO make this _automatic_ instead of quick-fix?
    # ... but for now: prompt for a Quick Fix.
    context = self._render_context(
        input_table=arrow_table({"A": [1], "B": [datetime.now()], "C": ["x"]})
    )
    with self.assertRaises(PromptingError) as cm:
        schema = ParamDType.Multicolumn(column_types=frozenset({"text"}))
        clean_value(schema, ["A", "B"], context)
    # Found-type None: the offending columns (number, datetime) differ.
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A", "B"], None, frozenset({"text"}))],
    )
def test_execute_cache_hit(self, fake_module):
    """When every step's cached result is fresh, nothing is rendered."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step1 = tab.wf_modules.create(
        order=0, slug="step-1", last_relevant_delta_id=workflow.last_delta_id
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    step2 = tab.wf_modules.create(
        order=1, slug="step-2", last_relevant_delta_id=workflow.last_delta_id
    )
    rendercache.cache_render_result(
        workflow,
        step2,
        workflow.last_delta_id,
        RenderResult(arrow_table({"B": [2]})),
    )
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, ParamDType.Dict({}), {}),
            ExecuteStep(step2, ParamDType.Dict({}), {}),
        ],
    )

    with self._execute(workflow, tab_flow, {}) as result:
        # The tab's output is the final step's cached result.
        assert_render_result_equals(
            result, RenderResult(arrow_table({"B": [2]}), [])
        )
    fake_module.assert_not_called()
def test_clean_tabs_happy_path(self):
    """Multitab cleans slugs into TabOutputs, one per rendered tab."""
    tab2 = Tab("tab-2", "Tab 2")
    tab2_output = arrow_table({"B": [1]})
    tab3 = Tab("tab-3", "Tab 3")
    tab3_output = arrow_table({"C": [1]})
    context = self._render_context(
        tab_results={
            tab2: RenderResult(tab2_output),
            tab3: RenderResult(tab3_output),
        }
    )
    cleaned = clean_value(ParamDType.Multitab(), ["tab-2", "tab-3"], context)
    self.assertEqual(
        cleaned, [TabOutput(tab2, tab2_output), TabOutput(tab3, tab3_output)]
    )