def test_load_and_dispatch(self):
    """Import a module from a fake GitHub clone and render it via dispatch.

    Covers three scenarios in sequence:

    1. A normal render with valid column parameters.
    2. A render where some column parameters name columns that do not
       exist in the input table.
    3. A render in which the imported module raises; the result should
       carry an error message that includes a line number.
    """
    test_dir = self.fake_github_clone()
    import_module_from_directory(
        'https://github.com/account/reponame',
        'reponame',
        '123456',
        test_dir
    )

    # Module and ModuleVersion should have loaded -- these will raise
    # an exception if they don't exist
    module = Module.objects.get(id_name=self.importable_id_name)
    module_version = ModuleVersion.objects.get(module=module)

    # Create a test workflow that uses this imported module
    workflow = add_new_workflow('Dynamic Dispatch Test Workflow')
    wfm = add_new_wf_module(workflow, module_version, order=1)

    # These will fail if we haven't correctly loaded the json describing
    # the parameters
    stringparam = get_param_by_id_name('test', wf_module=wfm)
    colparam = get_param_by_id_name('test_column', wf_module=wfm)
    multicolparam = get_param_by_id_name('test_multicolumn', wf_module=wfm)

    # Does it render right?
    #
    # The backslash in the 'english' row is written as an explicit `\\`
    # so the literal has the same value as before without relying on the
    # invalid escape sequence `\,`.
    # NOTE(review): confirm whether a plain `7,200` was intended here.
    test_csv = (
        'Class,M,F,Other\n'
        'math,10,12,100\n'
        'english,,7\\,200\n'
        'history,11,13,\n'
        'economics,20,20,20'
    )
    test_table = pd.read_csv(io.StringIO(test_csv), header=0,
                             skipinitialspace=True)
    test_table_out = test_table.copy()
    test_table_out['M'] *= 2
    test_table_out[['F', 'Other']] *= 3

    colparam.set_value('M')  # double this
    multicolparam.set_value('F,Other')  # triple these
    with self.assertLogs(dynamicdispatch.__name__):
        result = module_dispatch_render(module_version, wfm.get_params(),
                                        test_table, None)
    self.assertEqual(result, ProcessResult(test_table_out))

    # Test that bad column parameter values are removed
    colparam.set_value('missing_column_name')
    multicolparam.set_value('Other,junk_column_name')
    test_table_out = test_table.copy()
    # multicolumn parameter has only one valid col
    test_table_out[['Other']] *= 3
    result = module_dispatch_render(module_version, wfm.get_params(),
                                    test_table, None)
    self.assertEqual(result, ProcessResult(test_table_out))

    # if the module crashes, we should get an error with a line number
    stringparam.set_value('crashme')
    result = module_dispatch_render(module_version, wfm.get_params(),
                                    test_table, None)
    self.assertEqual(
        result,
        ProcessResult(
            error='ValueError: we crashed! at line 7 of importable.py'
        )
    )
def find_output_deltas_to_notify_from_fetched_tables(
    wf_module: 'WfModule',
    old_result: Optional[ProcessResult],
    new_result: ProcessResult
) -> List[OutputDelta]:
    """Compute a list of OutputDeltas to email to the owner.

    `wf_module` is the fetch module whose output just changed from
    `old_result` to `new_result`. `old_result` may be `None`, in which case
    a default `ProcessResult()` is used as the "old" input when rendering
    the modules that follow `wf_module`.

    Assumes `old_result` and `new_result` are different.

    Returns one `OutputDelta` per module with `.notifications` set, starting
    at `wf_module` and walking down the workflow, stopping at the first
    module whose old and new outputs compare equal.

    Must be called within a workflow.cooperative_lock().

    TODO make this easier to unit-test, and then unit-test it.
    """
    # Import here, to prevent recursive import
    from server.dispatch import module_dispatch_render

    output_deltas = []

    all_modules = list(wf_module.workflow.wf_modules.all())

    # Truncate all_modules: nix all after the last `.notifications` module
    while all_modules and not all_modules[-1].notifications:
        all_modules.pop()

    # Keep only the modules strictly _after_ `wf_module`. If `wf_module` is
    # not in the (truncated) list, nothing downstream needs checking.
    fetch_index = next(
        (i for i, module in enumerate(all_modules)
         if module.id == wf_module.id),
        None
    )
    if fetch_index is None:
        dependents = []
    else:
        dependents = all_modules[fetch_index + 1:]

    if wf_module.notifications:
        # Notify on wf_module itself (old_result may still be None here)
        output_deltas.append(OutputDelta(wf_module, old_result, new_result))

    if old_result is None:
        old_result = ProcessResult()

    # Now iterate through dependent modules: calculate tables and compare
    for dependent in dependents:
        old_result = module_dispatch_render(dependent, old_result.dataframe)
        new_result = module_dispatch_render(dependent, new_result.dataframe)

        if old_result == new_result:
            # From this point forward, tables will never diverge so we should
            # never notify the user.
            return output_deltas

        if dependent.notifications:
            output_deltas.append(
                OutputDelta(dependent, old_result, new_result)
            )

    return output_deltas
def execute_wfmodule(wf_module: WfModule) -> ProcessResult:
    """
    Render `wf_module` and return its result, rendering predecessors first.

    If a cached render result is available for `wf_module` it is returned
    without rendering. Otherwise the previous module in the stack (if any)
    is executed recursively to obtain the input, `wf_module` is rendered,
    and its result is cached and saved.

    You must call this within a workflow.cooperative_lock().
    """
    # Fast path: nothing to do when the cache already holds the answer.
    cache_hit = _get_render_cache(wf_module)
    if cache_hit:
        return cache_hit.result

    # Recurse upwards so each uncached module is rendered exactly once.
    upstream = wf_module.previous_in_stack()
    upstream_result = (
        execute_wfmodule(upstream) if upstream else ProcessResult()
    )

    rendered = dispatch.module_dispatch_render(wf_module,
                                               upstream_result.dataframe)
    wf_module.cache_render_result(wf_module.last_relevant_delta_id, rendered)
    wf_module.save()

    return rendered
def execute_wfmodule(wf_module: WfModule,
                     last_result: ProcessResult) -> CachedRenderResult:
    """
    Render a single WfModule; cache and return output.

    CONCURRENCY NOTES (these rely on `locked_wf_module`, which is defined
    elsewhere): this function is reasonably concurrency-friendly:

    * It locks the workflow, so two renders won't happen on the same
      workflow at the same time.
    * It returns a valid cache result immediately.
    * It checks with the database that `wf_module` hasn't been deleted from
      its workflow.
    * It checks with the database that `wf_module` hasn't been deleted from
      the database entirely.
    * It checks with the database that `wf_module` hasn't been modified.
      (It is very common for a user to request a module's output -- kicking
      off a sequence of `execute_wfmodule` -- and then change a param in a
      prior module, making all those calls obsolete.)
    * It runs in a transaction (obviously -- FOR UPDATE and all), which will
      stall `models.Delta` as it tries to write last_relevant_delta_id,
      effectively stalling users' update HTTP requests until after the
      `wf_module`'s render is complete.

    These guarantees mean:

    * It's relatively cheap to render twice.
    * Users who modify a WfModule while it's rendering will be stalled --
      for as short a duration as possible.
    * When a user changes a workflow significantly, all prior renders will
      end relatively cheaply.

    Raises `UnneededExecution` when the input WfModule should not be
    rendered.
    """
    with locked_wf_module(wf_module) as safe_wf_module:
        # NOTE(review): the cache check reads from `wf_module`, not from
        # `safe_wf_module` -- confirm that is intended.
        cached = wf_module.get_cached_render_result()

        # A cache entry stamped with the current delta is still good:
        # hand it back and skip the render() call entirely.
        if cached:
            if cached.delta_id == wf_module.last_relevant_delta_id:
                return cached

        rendered = dispatch.module_dispatch_render(safe_wf_module,
                                                   last_result.dataframe)
        fresh_cache = safe_wf_module.cache_render_result(
            safe_wf_module.last_relevant_delta_id,
            rendered
        )

        # Persist via safe_wf_module rather than wf_module: the row was
        # locked before it was fetched, so only the cached_render_result
        # columns have changed on it. `wf_module.save()` might overwrite
        # some newer values.
        safe_wf_module.save()

        return fresh_cache
def test_table_truncation(self):
    """Render a table one row over the limit and check it is cut down.

    Expects the rendered output to have exactly
    `settings.MAX_ROWS_PER_TABLE` rows and the module status to be ERROR.
    """
    nrows = settings.MAX_ROWS_PER_TABLE + 1
    bigtable = pd.DataFrame(
        np.random.randint(0, 100, size=(nrows, 4)),
        columns=list('ABCD')
    )
    wfm = load_and_add_module('editcells')  # it never changes row count
    out = module_dispatch_render(wfm, bigtable)
    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when the check fails.
    self.assertEqual(len(out), settings.MAX_ROWS_PER_TABLE)
    self.assertEqual(wfm.status, WfModule.ERROR)
def test_render_static_truncates_table(self):
    """A three-row input should come back as two rows plus an error text.

    NOTE(review): the expected message implies a row limit of 2; that limit
    is presumably configured outside this method -- confirm.
    """
    three_rows = pd.DataFrame({'a': [1, 2, 3]})
    wfm = load_and_add_module('editcells')  # it never changes row count

    result = module_dispatch_render(wfm, three_rows)

    expected = ProcessResult(
        dataframe=pd.DataFrame({'a': [1, 2]}),
        error='Truncated output from 3 rows to 2'
    )
    self.assertEqual(result, expected)
    wfm.refresh_from_db()
def execute_wfmodule(wfmodule):
    """Render the workflow from its first module down to `wfmodule`.

    Each module receives the previous module's output as input; the first
    one receives an empty DataFrame. Returns the last table produced, or an
    empty DataFrame if that table is None.
    """
    current = pd.DataFrame()

    for step in wfmodule.workflow.wf_modules.all():
        current = module_dispatch_render(step, current)
        if step == wfmodule:
            # Reached the requested module; nothing below it matters.
            break

    return pd.DataFrame() if current is None else current
def test_error_render(self):
    """Invalid user code should set an error status and leave data as-is."""
    # Force an error, ensure that it's returned and the output is a NOP
    wfm = load_and_add_module('pythoncode', workflow=self.workflow)
    code_pval = get_param_by_id_name('code')
    code_pval.set_value('not python code')

    out = module_dispatch_render(wfm, self.test_table)
    wfm.refresh_from_db()

    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when the check fails.
    self.assertEqual(wfm.status, WfModule.ERROR)
    self.assertEqual(wfm.error_msg,
                     'invalid syntax (<string>, line 2) at line 1')
    self.assertTrue(out.equals(self.test_table))
def test_error_render(self):
    """Invalid user code should produce a ProcessResult carrying an error."""
    # Feed the module something that is not Python and check that the
    # syntax error is reported in the result.
    wfm = load_and_add_module('pythoncode', workflow=self.workflow)
    get_param_by_id_name('code').set_value('not python code')

    result = module_dispatch_render(wfm, self.test_table)

    expected = ProcessResult(
        error='Line 1: invalid syntax (user input, line 1)',
        json={'output': ''}  # not part of this test
    )
    self.assertEqual(result, expected)
def execute_wfmodule(wfmodule, nocache=False):
    """Return the output table of `wfmodule` at the workflow's revision.

    Walks the workflow from the top. For each module, a cached table at the
    current revision is reused when found; otherwise the module is rendered
    and its output stored as the new cached table (after deleting that
    module's existing cached tables).

    `nocache=True` skips every cache lookup, so each module up to and
    including `wfmodule` is rendered. Rendered tables are still stored.

    Assumes it is not possible to have a later-revision cache after a module
    which has an earlier-revision cache (i.e. the module stack is always
    rendered in order).
    """
    workflow = wfmodule.workflow
    target_rev = workflow.revision()

    def lookup_cache(module):
        # `nocache` disables cache reads only.
        return None if nocache else get_render_cache(module, target_rev)

    # Do we already have what we need?
    cached = lookup_cache(wfmodule)
    if cached:
        return cached.get_table()

    # No: render from the top, shortcutting with cache whenever possible.
    table = pd.DataFrame()
    for wfm in workflow.wf_modules.all():
        cached = lookup_cache(wfm)

        if cached is not None:
            table = cached.get_table()
        else:
            # previous revisions are dead to us now (well, maybe good for
            # undo, but we can re-render)
            StoredObject.objects.filter(
                wf_module=wfm,
                type=StoredObject.CACHED_TABLE
            ).delete()
            table = module_dispatch_render(wfm, table)
            StoredObject.create_table(wfm, StoredObject.CACHED_TABLE, table,
                                      metadata=target_rev)

        # found the module we were looking for, all done
        if wfm == wfmodule:
            break

    return table
def test_internal_render(self):
    """Rendering the fixture table should yield `self.test_table_MF`."""
    rendered = module_dispatch_render(self.wfm, self.test_table)
    matches_expected = rendered.equals(self.test_table_MF)
    self.assertTrue(matches_expected)
def test_none_table_render(self):
    """Rendering an empty DataFrame should give a default ProcessResult."""
    empty_input = pd.DataFrame()
    result = module_dispatch_render(self.wfm, empty_input)
    expected = ProcessResult()
    self.assertEqual(result, expected)
def test_missing_module(self):
    """A WfModule created with `None` as its module renders to a default
    ProcessResult."""
    wfm = add_new_wf_module(add_new_workflow('Missing module'), None, 0)
    result = module_dispatch_render(wfm, mock_csv_table)
    expected = ProcessResult()
    self.assertEqual(result, expected)
def test_multicolumn_sanitize(self):
    """Render against `mock_csv_table`, which has no M or F columns."""
    result = module_dispatch_render(self.wfm, mock_csv_table)
    # Expected output: a frame built from two empty records.
    expected = ProcessResult(pd.DataFrame([{}, {}]))
    self.assertEqual(result, expected)
def test_internal_render(self):
    """Rendering the fixture table should wrap `self.test_table_MF`."""
    expected = ProcessResult(self.test_table_MF)
    result = module_dispatch_render(self.wfm, self.test_table)
    self.assertEqual(result, expected)
def test_missing_module(self):
    """A WfModule created with `None` as its module renders to an empty
    table."""
    wfm = add_new_wf_module(add_new_workflow('Missing module'), None, 0)
    rendered = module_dispatch_render(wfm, mock_csv_table)
    self.assertTrue(rendered.empty)
def test_none_table_render(self):
    """Rendering an empty DataFrame should give an empty table back."""
    empty_input = pd.DataFrame()
    rendered = module_dispatch_render(self.wfm, empty_input)
    self.assertTrue(rendered.empty)
def test_multicolumn_sanitize(self):
    """Render against `mock_csv_table`, which has no M or F columns, and
    expect an empty table."""
    # no M,F cols
    rendered = module_dispatch_render(self.wfm, mock_csv_table)
    self.assertTrue(rendered.empty)