def test_execute_cache_hit(self):
    """When every step's cached result is fresh, nothing should render."""
    workflow = Workflow.objects.create()
    create_module_zipfile("mod")
    tab = workflow.tabs.create(position=0)
    delta = InitWorkflowCommand.create(workflow)
    step1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=delta.id,
    )
    cache_render_result(
        workflow, step1, delta.id, RenderResult(arrow_table({"A": [1]}))
    )
    step2 = tab.wf_modules.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=delta.id,
    )
    cache_render_result(
        workflow, step2, delta.id, RenderResult(arrow_table({"B": [2]}))
    )
    # Patch the kernel: if execute tried to render, the mock would record it.
    with patch.object(Kernel, "render", return_value=None):
        self._execute(workflow)
        Kernel.render.assert_not_called()
def test_execute_new_revision(self):
    """A step whose cached result is stale gets re-rendered and re-cached."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    create_module_zipfile(
        "mod",
        spec_kwargs={"loads_data": True},
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"B": [2]})',
    )
    step = tab.steps.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=1,
        module_id_name="mod",
    )
    # Seed the cache at delta 1, then bump the relevant delta so the cached
    # value becomes stale and must be re-rendered.
    cache_render_result(workflow, step, 1, RenderResult(arrow_table({"A": [1]})))
    step.last_relevant_delta_id = 2
    step.save(update_fields=["last_relevant_delta_id"])

    self._execute(workflow)

    step.refresh_from_db()
    with open_cached_render_result(step.cached_render_result) as actual:
        assert_render_result_equals(actual, RenderResult(arrow_table({"B": [2]})))
def test_execute_new_revision(self):
    """A newer delta invalidates the cache; execute must overwrite it."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    delta1 = workflow.last_delta
    create_module_zipfile(
        "mod",
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"B": [2]})',
    )
    step = tab.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=delta1.id,
        module_id_name="mod",
    )
    # Cache an old result at delta1 ...
    cache_render_result(
        workflow, step, delta1.id, RenderResult(arrow_table({"A": [1]}))
    )
    # ... then create delta2 so the cached result is stale.
    delta2 = InitWorkflowCommand.create(workflow)
    step.last_relevant_delta_id = delta2.id
    step.save(update_fields=["last_relevant_delta_id"])

    self._execute(workflow)

    step.refresh_from_db()
    with open_cached_render_result(step.cached_render_result) as actual:
        assert_render_result_equals(actual, RenderResult(arrow_table({"B": [2]})))
def test_email_no_delta_when_not_changed(self, email):
    """A re-render that produces identical output must not send an email."""
    workflow = Workflow.objects.create()
    tab = workflow.tabs.create(position=0)
    create_module_zipfile(
        "mod",
        spec_kwargs={"loads_data": True},
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [1]})',
    )
    step = tab.steps.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=1,
        module_id_name="mod",
        notifications=True,
    )
    cache_render_result(workflow, step, 1, RenderResult(arrow_table({"A": [1]})))
    # Make a new delta, so we need to re-render. Give it the same output.
    step.last_relevant_delta_id = 2
    step.save(update_fields=["last_relevant_delta_id"])

    self._execute(workflow)

    email.assert_not_called()
def test_email_no_delta_when_not_changed(self, email, fake_load_module):
    """No notification email when the re-render output equals the cached one.

    ``email`` and ``fake_load_module`` are mocks — presumably injected by
    @patch decorators on this method; verify against the class definition.
    """
    workflow = Workflow.objects.create()
    tab = workflow.tabs.create(position=0)
    delta1 = InitWorkflowCommand.create(workflow)
    ModuleVersion.create_or_replace_from_spec({
        "id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []
    })
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=delta1.id,
        module_id_name="mod",
        notifications=True,  # an email would be sent if output changed
    )
    cache_render_result(workflow, wf_module, delta1.id,
                        RenderResult(arrow_table({"A": [1]})))
    # Make a new delta, so we need to re-render. Give it the same output.
    delta2 = InitWorkflowCommand.create(workflow)
    wf_module.last_relevant_delta_id = delta2.id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    # Stub the module loader so render() returns the same table as cached.
    fake_loaded_module = Mock(LoadedModule)
    fake_load_module.return_value = fake_loaded_module
    fake_loaded_module.migrate_params.return_value = {}
    fake_loaded_module.render.return_value = RenderResult(
        arrow_table({"A": [1]}))
    self._execute(workflow)
    email.assert_not_called()
def test_email_no_delta_when_not_changed(self, email):
    """Identical re-render output must not trigger a notification email."""
    workflow = Workflow.objects.create()
    tab = workflow.tabs.create(position=0)
    delta1 = InitWorkflowCommand.create(workflow)
    create_module_zipfile(
        "mod",
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [1]})',
    )
    step = tab.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=delta1.id,
        module_id_name="mod",
        notifications=True,
    )
    cache_render_result(
        workflow, step, delta1.id, RenderResult(arrow_table({"A": [1]}))
    )
    # Make a new delta, so we need to re-render. Give it the same output.
    delta2 = InitWorkflowCommand.create(workflow)
    step.last_relevant_delta_id = delta2.id
    step.save(update_fields=["last_relevant_delta_id"])

    self._execute(workflow)

    email.assert_not_called()
def _execute_wfmodule_save( workflow: Workflow, wf_module: WfModule, result: RenderResult ) -> SaveResult: """ Call rendercache.cache_render_result() and build notifications.OutputDelta. All this runs synchronously within a database lock. (It's a separate function so that when we're done awaiting it, we can continue executing in a context that doesn't use a database thread.) Raise UnneededExecution if the WfModule has changed in the interim. """ # raises UnneededExecution with locked_wf_module(workflow, wf_module) as safe_wf_module: if safe_wf_module.notifications: stale_crr = safe_wf_module.get_stale_cached_render_result() if stale_crr is None: stale_result = None else: try: # Read entire old Parquet file, blocking with rendercache.open_cached_render_result( stale_crr ) as stale_result: pass # stale_result is deleted from disk but still mmapped except rendercache.CorruptCacheError: # No, let's not send an email. Corrupt cache probably means # we've been messing with our codebase. logger.exception( "Ignoring CorruptCacheError on workflow %d, wf_module %d because we are about to overwrite it", workflow.id, wf_module.id, ) stale_result = None else: stale_result = None rendercache.cache_render_result( workflow, safe_wf_module, wf_module.last_relevant_delta_id, result ) if ( safe_wf_module.notifications and stale_result is not None and result != stale_result ): safe_wf_module.has_unseen_notification = True safe_wf_module.save(update_fields=["has_unseen_notification"]) maybe_delta = notifications.OutputDelta( safe_wf_module.workflow.owner, safe_wf_module.workflow, safe_wf_module, stale_result, result, ) else: maybe_delta = None # nothing to email return SaveResult(safe_wf_module.cached_render_result, maybe_delta)
def test_execute_partial_cache_hit(self):
    """Only the step with a stale cache is rendered; the fresh one is reused."""
    module_zipfile = create_module_zipfile("mod")
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh. Should not render.
    step1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    # step2: cached result is stale, so must be re-rendered
    step2 = tab.wf_modules.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id - 1,
    )
    rendercache.cache_render_result(
        workflow,
        step2,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"B": [2]})),
    )
    step2.last_relevant_delta_id = workflow.last_delta_id
    step2.save(update_fields=["last_relevant_delta_id"])
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    # The mocked render returns {"B": [3]} -- distinct from both cached
    # values, so the final output proves step2 (and only step2) re-rendered.
    with patch.object(Kernel, "render", side_effect=mock_render({"B": [3]})):
        with self._execute(workflow, tab_flow, {}) as result:
            expected = RenderResult(arrow_table({"B": [3]}))
            assert_render_result_equals(result, expected)
        Kernel.render.assert_called_once()  # step2, not step1
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )
def test_resume_backtrack_on_corrupt_cache_error(self):
    """A corrupt cached result forces re-render of that step and successors."""
    module_zipfile = create_module_zipfile("mod")
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh -- but CORRUPT
    step1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    minio.put_bytes(
        # Write corrupted data -- will lead to CorruptCacheError
        rendercache.io.BUCKET,
        rendercache.io.crr_parquet_key(step1.cached_render_result),
        b"CORRUPT",
    )
    # step2: no cached result -- must re-render
    step2 = tab.wf_modules.create(order=1, slug="step-2", module_id_name="mod")
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    with patch.object(Kernel, "render", side_effect=mock_render({"B": [2]})):
        # expect_log_level=ERROR: the CorruptCacheError is logged, not raised.
        with self._execute(
            workflow, tab_flow, {}, expect_log_level=logging.ERROR
        ) as result:
            expected = RenderResult(arrow_table({"B": [2]}))
            assert_render_result_equals(result, expected)
        self.assertEqual(
            # called with step1, then step2
            Kernel.render.call_count,
            2,
        )
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )
def test_execute_partial_cache_hit(self, fake_load_module):
    """Only the stale step renders; ``fake_load_module`` is an injected mock."""
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh. Should not render.
    step1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    # step2: cached result is stale, so must be re-rendered
    step2 = tab.wf_modules.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id - 1,
    )
    rendercache.cache_render_result(
        workflow,
        step2,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"B": [2]})),
    )
    step2.last_relevant_delta_id = workflow.last_delta_id
    step2.save(update_fields=["last_relevant_delta_id"])
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, ParamDType.Dict({}), {}),
            ExecuteStep(step2, ParamDType.Dict({}), {}),
        ],
    )
    # Mocked render output is distinct from both cached tables, so the final
    # result proves exactly which step was re-rendered.
    expected = RenderResult(arrow_table({"B": [3]}))
    fake_load_module.return_value.render.return_value = expected
    with self._execute(workflow, tab_flow, {}) as result:
        assert_render_result_equals(result, expected)
    fake_load_module.return_value.render.assert_called_once()  # step2, not step1
    self.assertRegex(
        # Output is to the correct file
        fake_load_module.return_value.render.call_args[1]["output_filename"],
        r"execute-tab-output.*\.arrow",
    )
def test_resume_backtrack_on_corrupt_cache_error(self, fake_load_module):
    """Corrupt cache for step1 forces both steps to re-render (mock-based)."""
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh -- but CORRUPT
    step1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    minio.put_bytes(
        # Write corrupted data -- will lead to CorruptCacheError
        rendercache.io.BUCKET,
        rendercache.io.crr_parquet_key(step1.cached_render_result),
        b"CORRUPT",
    )
    # step2: no cached result -- must re-render
    step2 = tab.wf_modules.create(order=1, slug="step-2", module_id_name="mod")
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, ParamDType.Dict({}), {}),
            ExecuteStep(step2, ParamDType.Dict({}), {}),
        ],
    )
    expected = RenderResult(arrow_table({"B": [2]}))
    fake_load_module.return_value.render.return_value = expected
    # expect_log_level=ERROR: the CorruptCacheError is logged, not raised.
    with self._execute(
        workflow, tab_flow, {}, expect_log_level=logging.ERROR
    ) as result:
        assert_render_result_equals(result, expected)
    self.assertEqual(
        # called with step1, then step2
        fake_load_module.return_value.render.call_count,
        2,
    )
    self.assertRegex(
        # Output is to the correct file
        fake_load_module.return_value.render.call_args[1]["output_filename"],
        r"execute-tab-output.*\.arrow",
    )
def test_email_delta_when_errors_change(self, email_delta):
    """A re-render that yields a *different* error must send an email."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        step,
        workflow.last_delta_id - 1,
        RenderResult(errors=[
            RenderError(
                I18nMessage("py.renderer.execute.step.noModule", {}, None))
        ]),
    )
    step.last_relevant_delta_id = workflow.last_delta_id
    step.save(update_fields=["last_relevant_delta_id"])
    module_zipfile = create_module_zipfile(
        "x",
        spec_kwargs={"loads_data": True},
        # returns different error
        python_code='import pandas as pd\ndef render(table, params): return [{"id": "err"}]',
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_step(
                self.chroot_context,
                workflow,
                step,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            ))
    email_delta.assert_called()  # there's new data
def test_email_delta_when_stale_crr_is_unreachable(self, email_delta, read_cache):
    """An email is sent without reading the old cache when it has no table.

    ``email_delta`` and ``read_cache`` are mocks — presumably injected by
    @patch decorators on this method; verify against the class definition.
    """
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        step,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({})),  # does not write a Parquet file
    )
    step.last_relevant_delta_id = workflow.last_delta_id
    step.save(update_fields=["last_relevant_delta_id"])
    module_zipfile = create_module_zipfile(
        "x",
        spec_kwargs={"loads_data": True},
        # returns different data
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [2]})',
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_step(
                self.chroot_context,
                workflow,
                step,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            ))
    read_cache.assert_not_called()  # it would give CorruptCacheError
    email_delta.assert_called()  # there's new data
def test_email_delta(self, email_delta):
    """Changed render output sends an email carrying old and new results."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    rendercache.cache_render_result(
        workflow,
        step,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    step.last_relevant_delta_id = workflow.last_delta_id
    step.save(update_fields=["last_relevant_delta_id"])
    module_zipfile = create_module_zipfile(
        "x",
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [2]})',
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                step,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
    email_delta.assert_called()
    # Inspect the OutputDelta passed to the notifier.
    delta = email_delta.call_args[0][0]
    self.assertEqual(delta.user, workflow.owner)
    self.assertEqual(delta.workflow, workflow)
    self.assertEqual(delta.wf_module, step)
    self.assertEqual(delta.old_result, RenderResult(arrow_table({"A": [1]})))
    self.assertEqual(delta.new_result, RenderResult(arrow_table({"A": [2]})))
def test_email_delta_ignore_corrupt_cache_error(self, email_delta, read_cache):
    """CorruptCacheError while reading the old result suppresses the email."""
    # Simulate a corrupt cache on read.
    read_cache.side_effect = rendercache.CorruptCacheError
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    module_zipfile = create_module_zipfile(
        "x",
        # returns different data -- but CorruptCacheError means we won't care.
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [2]})',
    )
    with self.assertLogs(level=logging.ERROR):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            ))
    email_delta.assert_not_called()
def test_email_delta(self, email_delta):
    """Changed render output sends an email (stubbed-module variant)."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    # Keep table2 alive for the whole block: the render result (and the
    # OutputDelta built from it) refers to this Arrow table.
    with arrow_table_context({"A": [2]}) as table2:

        def render(*args, **kwargs):
            return RenderResult(table2)

        with self._stub_module(render):
            self.run_with_async_db(
                execute_wfmodule(
                    self.chroot_context,
                    workflow,
                    wf_module,
                    {},
                    Tab(tab.slug, tab.name),
                    RenderResult(),
                    {},
                    self.output_path,
                ))
        email_delta.assert_called()
        # Inspect the OutputDelta passed to the notifier.
        delta = email_delta.call_args[0][0]
        self.assertEqual(delta.user, workflow.owner)
        self.assertEqual(delta.workflow, workflow)
        self.assertEqual(delta.wf_module, wf_module)
        self.assertEqual(delta.old_result, RenderResult(arrow_table({"A": [1]})))
        self.assertEqual(delta.new_result, RenderResult(arrow_table({"A": [2]})))
def test_workflow_view_triggers_render_if_stale_cache(self):
    """Viewing a workflow whose cached result is stale queues a re-render."""
    step = self.tab1.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=self.delta.id,
        cached_render_result_delta_id=self.delta.id,  # stale
    )
    # Cache a result
    cache_render_result(
        self.workflow1,
        step,
        self.delta.id,
        RenderResult(arrow_table({"A": ["a"]})),
    )
    # Make the cached result stale. (The view will actually send the
    # stale-result metadata to the client. That's why we cached it.)
    stale_delta = InitWorkflowCommand.create(self.workflow1)
    step.last_relevant_delta_id = stale_delta.id
    step.save(update_fields=["last_relevant_delta_id"])

    self.client.force_login(self.user)
    self.client.get("/workflows/%d/" % self.workflow1.id)

    self.queue_render.assert_called_with(self.workflow1.id, stale_delta.id)
def test_execute_cache_hit(self, fake_module):
    """Fresh caches for both steps mean the (mocked) module never runs."""
    workflow = Workflow.objects.create()
    tab = workflow.tabs.create(position=0)
    delta = InitWorkflowCommand.create(workflow)
    first = tab.wf_modules.create(
        order=0, slug="step-1", last_relevant_delta_id=delta.id
    )
    cache_render_result(
        workflow, first, delta.id, RenderResult(arrow_table({"A": [1]}))
    )
    second = tab.wf_modules.create(
        order=1, slug="step-2", last_relevant_delta_id=delta.id
    )
    cache_render_result(
        workflow, second, delta.id, RenderResult(arrow_table({"B": [2]}))
    )

    self._execute(workflow)

    fake_module.assert_not_called()
def test_email_delta_ignore_corrupt_cache_error(self, email_delta, read_cache):
    """CorruptCacheError suppresses the email (stubbed-module variant)."""
    # Simulate a corrupt cache on read.
    read_cache.side_effect = rendercache.CorruptCacheError
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    # Keep table2 alive while execute_wfmodule uses it.
    with arrow_table_context({"A": [2]}) as table2:

        def render(*args, **kwargs):
            return RenderResult(table2)

        with self._stub_module(render):
            with self.assertLogs(level=logging.ERROR):
                self.run_with_async_db(
                    execute_wfmodule(
                        self.chroot_context,
                        workflow,
                        wf_module,
                        {},
                        Tab(tab.slug, tab.name),
                        RenderResult(),
                        {},
                        self.output_path,
                    ))
        email_delta.assert_not_called()
def test_load_input_cached_render_result(self):
    """load_database_objects returns the previous step's cached render result."""
    with arrow_table_context({"A": [1]}) as atable:
        input_render_result = RenderResult(atable)
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step1 = tab.steps.create(
            order=0, slug="step-1", last_relevant_delta_id=workflow.last_delta_id
        )
        step2 = tab.steps.create(order=1, slug="step-2")
        rendercache.cache_render_result(
            workflow, step1, workflow.last_delta_id, input_render_result
        )

        result = self.run_with_async_db(
            fetch.load_database_objects(workflow.id, step2.id)
        )

        input_crr = step1.cached_render_result
        assert input_crr is not None
        # The CRR is exposed both positionally and by attribute name.
        self.assertEqual(result[4], input_crr)
        self.assertEqual(result.input_cached_render_result, input_crr)
def test_resume_without_rerunning_unneeded_renders(self, fake_load_module):
    """Only the step without a fresh cached result is rendered (mock-based)."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    delta_id = workflow.last_delta_id
    ModuleVersion.create_or_replace_from_spec({
        "id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []
    })
    # wf_module1: has a valid, cached result
    wf_module1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=delta_id,
        module_id_name="mod",
    )
    cache_render_result(workflow, wf_module1, delta_id,
                        RenderResult(arrow_table({"A": [1]})))
    # wf_module2: has no cached result (must be rendered)
    wf_module2 = tab.wf_modules.create(
        order=1,
        slug="step-2",
        last_relevant_delta_id=delta_id,
        module_id_name="mod",
    )
    # Stub the module loader; its render() output should land in wf_module2.
    fake_loaded_module = Mock(LoadedModule)
    fake_loaded_module.migrate_params.return_value = {}
    fake_load_module.return_value = fake_loaded_module
    result2 = RenderResult(arrow_table({"A": [2]}))
    fake_loaded_module.render.return_value = result2
    self._execute(workflow)
    fake_loaded_module.render.assert_called_once()  # only with module2
    wf_module2.refresh_from_db()
    with open_cached_render_result(
            wf_module2.cached_render_result) as actual:
        assert_render_result_equals(actual, result2)
def test_workflow_view_triggers_render_if_stale_cache(self):
    """GET of the workflow page queues a render when the cache is stale."""
    stale_step = self.tab1.steps.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=1,
        cached_render_result_delta_id=1,
    )
    # Cache a result at delta 1 ...
    cache_render_result(
        self.workflow1,
        stale_step,
        1,
        RenderResult(arrow_table({"A": ["a"]})),
    )
    # ... then bump the relevant delta so the cached result is out of date.
    stale_step.last_relevant_delta_id = 2
    stale_step.save(update_fields=["last_relevant_delta_id"])

    self.client.force_login(self.user)
    self.client.get("/workflows/%d/" % self.workflow1.id)

    self.queue_render.assert_called_with(
        self.workflow1.id, self.workflow1.last_delta_id
    )
def test_email_no_delta_when_errors_stay_the_same(self, email_delta):
    """Re-render producing the *same* error must not send an email."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        step,
        workflow.last_delta_id - 1,
        RenderResult(errors=[
            RenderError(
                I18nMessage("py.renderer.execute.step.noModule", {}, None))
        ]),
    )
    step.last_relevant_delta_id = workflow.last_delta_id
    step.save(update_fields=["last_relevant_delta_id"])
    # module_zipfile=None reproduces the same "noModule" error on re-render.
    self.run_with_async_db(
        execute_step(
            self.chroot_context,
            workflow,
            step,
            None,  # module_zipfile
            {},
            Tab(tab.slug, tab.name),
            RenderResult(),
            {},
            self.output_path,
        ))
    email_delta.assert_not_called()  # error is the same error
def test_execute_cache_hit(self):
    """With fresh caches for both steps, the output is served from cache."""
    module_zipfile = create_module_zipfile("mod")
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    first = tab.wf_modules.create(
        order=0, slug="step-1", last_relevant_delta_id=workflow.last_delta_id
    )
    rendercache.cache_render_result(
        workflow,
        first,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    second = tab.wf_modules.create(
        order=1, slug="step-2", last_relevant_delta_id=workflow.last_delta_id
    )
    rendercache.cache_render_result(
        workflow,
        second,
        workflow.last_delta_id,
        RenderResult(arrow_table({"B": [2]})),
    )
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(first, module_zipfile, {}),
            ExecuteStep(second, module_zipfile, {}),
        ],
    )
    # If anything rendered, the mock would poison the output with "bad".
    with patch.object(Kernel, "render", side_effect=mock_render({"No": ["bad"]})):
        with self._execute(workflow, tab_flow, {}) as result:
            assert_render_result_equals(
                result, RenderResult(arrow_table({"B": [2]}), [])
            )
def test_resume_without_rerunning_unneeded_renders(self):
    """Only the step lacking a fresh cached result is rendered."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    delta_id = workflow.last_delta_id
    create_module_zipfile(
        # If this runs on step1, it'll return pd.DataFrame().
        # If this runs on step2, it'll return step1-output * 2.
        # ... step2's output depends on whether we run this on
        # step1.
        "mod",
        spec_kwargs={"loads_data": True},
        python_code="def render(table, params): return table * 2",
    )
    # step1: has a valid, cached result
    cached_step = tab.steps.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=1,
        module_id_name="mod",
    )
    cache_render_result(
        workflow, cached_step, 1, RenderResult(arrow_table({"A": [1]}))
    )
    # step2: has no cached result (must be rendered)
    uncached_step = tab.steps.create(
        order=1,
        slug="step-2",
        last_relevant_delta_id=1,
        module_id_name="mod",
    )

    self._execute(workflow)

    uncached_step.refresh_from_db()
    with open_cached_render_result(uncached_step.cached_render_result) as actual:
        assert_render_result_equals(actual, RenderResult(arrow_table({"A": [2]})))
def test_execute_new_revision(self, fake_load_module):
    """Stale cache is overwritten with the mocked module's new output."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    delta1 = workflow.last_delta
    ModuleVersion.create_or_replace_from_spec({
        "id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []
    })
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=delta1.id,
        module_id_name="mod",
    )
    # Cache an old result at delta1 ...
    result1 = RenderResult(arrow_table({"A": [1]}))
    cache_render_result(workflow, wf_module, delta1.id, result1)
    # ... then create delta2 so the cached result becomes stale.
    delta2 = InitWorkflowCommand.create(workflow)
    wf_module.last_relevant_delta_id = delta2.id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    # Stub the module loader so re-render produces result2.
    result2 = RenderResult(arrow_table({"B": [2]}))
    fake_module = Mock(LoadedModule)
    fake_module.migrate_params.return_value = {}
    fake_load_module.return_value = fake_module
    fake_module.render.return_value = result2
    self._execute(workflow)
    wf_module.refresh_from_db()
    with open_cached_render_result(
            wf_module.cached_render_result) as result:
        assert_render_result_equals(result, result2)
def test_execute_cache_hit(self, fake_module):
    """Fresh caches: result comes straight from cache; the module never runs."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    first = tab.wf_modules.create(
        order=0, slug="step-1", last_relevant_delta_id=workflow.last_delta_id
    )
    rendercache.cache_render_result(
        workflow,
        first,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    second = tab.wf_modules.create(
        order=1, slug="step-2", last_relevant_delta_id=workflow.last_delta_id
    )
    rendercache.cache_render_result(
        workflow,
        second,
        workflow.last_delta_id,
        RenderResult(arrow_table({"B": [2]})),
    )
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(first, ParamDType.Dict({}), {}),
            ExecuteStep(second, ParamDType.Dict({}), {}),
        ],
    )
    with self._execute(workflow, tab_flow, {}) as result:
        assert_render_result_equals(
            result, RenderResult(arrow_table({"B": [2]}), [])
        )
    fake_module.assert_not_called()
def _execute_step_save(
    workflow: Workflow, step: Step, result: LoadedRenderResult
) -> SaveResult:
    """Call rendercache.cache_render_result() and build notifications.OutputDelta.

    All this runs synchronously within a database lock. (It's a separate
    function so that when we're done awaiting it, we can continue executing in
    a context that doesn't use a database thread.)

    Return a SaveResult holding the freshly-written cached render result and,
    when the step has notifications enabled, an owner exists and the output
    actually changed, an OutputDelta to email; otherwise the delta is None.

    Raise UnneededExecution if the Step has changed in the interim.
    """
    # raises UnneededExecution
    with contextlib.ExitStack() as exit_stack:
        safe_step = exit_stack.enter_context(locked_step(workflow, step))
        if safe_step.notifications and workflow.owner_id is not None:
            # We'll need the old Parquet output to diff against the new one.
            stale_crr = safe_step.get_stale_cached_render_result()
            if stale_crr is None:
                stale_parquet_file = None
            elif stale_crr.status == "ok":
                try:
                    # enter_context keeps the download alive until the stack
                    # unwinds -- we compare against it after writing the cache.
                    stale_parquet_file = exit_stack.enter_context(
                        rendercache.downloaded_parquet_file(stale_crr)
                    )
                except rendercache.CorruptCacheError:
                    # No, let's not send an email. Corrupt cache probably means
                    # we've been messing with our codebase.
                    logger.exception(
                        "Ignoring CorruptCacheError on workflow %d, step %d because we are about to overwrite it",
                        workflow.id,
                        step.id,
                    )
                    stale_crr = None
                    stale_parquet_file = None
            else:
                # status is 'error'/'unreachable'. There's no Parquet file.
                stale_parquet_file = None
        else:
            stale_crr = None
            stale_parquet_file = None
        # Overwrite the cache with the new result (the actual "save").
        rendercache.cache_render_result(
            workflow, safe_step, step.last_relevant_delta_id, result
        )
        is_changed = False  # nothing to email, usually
        if stale_crr is not None:
            fresh_crr = safe_step.cached_render_result
            if (
                fresh_crr.status != stale_crr.status
                or fresh_crr.errors != stale_crr.errors
                or fresh_crr.json != stale_crr.json
                or fresh_crr.table_metadata != stale_crr.table_metadata
            ):
                # Output other than table data has changed (e.g., nRows)
                is_changed = True
            if not is_changed and fresh_crr.status == "ok":
                # Download the new parquet file and compare to the old one
                fresh_parquet_file = exit_stack.enter_context(
                    rendercache.downloaded_parquet_file(fresh_crr)
                )
                is_changed = not cjwparquet.are_files_equal(
                    stale_parquet_file, fresh_parquet_file
                )
        if is_changed:
            with connection.cursor() as cursor:
                # Don't import cjworkbench.models.userprofile: it relies on
                # settings.FREE_TIER_USAGE_LIMITS, but renderer doesn't set it.
                #
                # TODO nix django-ORM.
                cursor.execute(
                    """
                    SELECT locale_id
                    FROM cjworkbench_userprofile
                    WHERE user_id = %s
                    """,
                    [safe_step.workflow.owner_id],
                )
                locale_id = cursor.fetchone()[0]
            maybe_delta = notifications.OutputDelta(
                user=safe_step.workflow.owner,
                workflow=safe_step.workflow,
                step=safe_step,
                locale_id=locale_id,
            )
        else:
            maybe_delta = None
        return SaveResult(safe_step.cached_render_result, maybe_delta)