def test_render(self):
    """Filter module: empty without columns; search, case-sensitivity, regex."""
    # Without any columns selected the module produces no output.
    rendered = execute_wfmodule(self.wf_module)
    self.assertTrue(rendered.empty)

    # Plain substring search over a single column.
    self.colnames_pval.set_value('Month')
    self.query_pval.set_value('Feb')
    rendered = execute_wfmodule(self.wf_module)
    self.assertEqual(str(rendered), ' Month Amount\n1 Feb 20')

    # With case sensitivity on, 'feb' no longer matches 'Feb'.
    self.case_pval.set_value(True)
    self.query_pval.set_value('feb')
    rendered = execute_wfmodule(self.wf_module)
    self.assertTrue(rendered.empty)

    # Regex alternation matches both months.
    self.regex_pval.set_value(True)
    self.query_pval.set_value('Jan|Feb')
    rendered = execute_wfmodule(self.wf_module)
    self.assertEqual(str(rendered),
                     ' Month Amount\n0 Jan 10\n1 Feb 20')
def test_render(self):
    """SelectColumns: single column, whitespace stripping, ordering, bad name."""
    # Select a single column.
    self.cols_pval.value = 'Month'
    self.cols_pval.save()
    out = execute_wfmodule(self.wf_module)
    table = mock_csv_table[['Month']]
    self.assertEqual(str(out), str(table))

    # Whitespace around a column name is stripped before matching.
    self.cols_pval.value = 'Month '
    self.cols_pval.save()
    out = execute_wfmodule(self.wf_module)
    self.assertEqual(str(out), str(table))

    # Listing columns in reverse order must NOT reorder the output: the
    # module preserves the input table's column order.
    # (Fix: removed a dead store -- the original assigned
    # mock_csv_table[['Month', 'Amount']] to `table` but never used it.)
    self.cols_pval.value = 'Amount,Month'
    self.cols_pval.save()
    out = execute_wfmodule(self.wf_module)
    self.assertEqual(str(out), str(mock_csv_table))

    # A nonexistent column name puts the module in the error state.
    self.cols_pval.value = 'Amountxxx,Month'
    self.cols_pval.save()
    out = execute_wfmodule(self.wf_module)
    self.wf_module.refresh_from_db()
    self.assertEqual(self.wf_module.status, WfModule.ERROR)
def test_execute_cache_hit(self):
    """A second execute of an unchanged module must be served from cache."""
    workflow = create_testdata_workflow(table_csv)
    wf_module2 = load_and_add_module('selectcolumns', workflow=workflow)

    # First execution renders and caches the result.
    expected = execute_wfmodule(wf_module2)

    # Second execution must not dispatch a render at all.
    with mock.patch('server.dispatch.module_dispatch_render') as mdr:
        result = execute_wfmodule(wf_module2)
        self.assertFalse(mdr.called)
        self.assertEqual(result, expected)
def test_bad_colname(self):
    """Empty column -> empty output; unknown column -> error status."""
    # An empty column parameter yields no output.
    set_string(self.col_pval, '')
    rendered = execute_wfmodule(self.wf_module)
    self.assertTrue(rendered.empty)

    # A column that does not exist flips the module into ERROR.
    set_string(self.col_pval, 'hilarious')
    execute_wfmodule(self.wf_module)
    self.wf_module.refresh_from_db()
    self.assertEqual(self.wf_module.status, WfModule.ERROR)
def test_bad_dates(self):
    """Columns that cannot be parsed as dates put the module in ERROR.

    'Amount' holds integers; 'Foo' holds weird strings -- the two cases
    exercise different error code paths in the module.
    """
    for colname in ('Amount', 'Foo'):
        set_string(self.col_pval, colname)
        execute_wfmodule(self.wf_module)
        self.wf_module.refresh_from_db()
        self.assertEqual(self.wf_module.status, WfModule.ERROR)
def test_render(self):
    """Count-by-date renders sorted by value or by frequency.

    Uses to_csv() rather than str() so row order is pinned by the index
    (str() output order is otherwise variable).
    """
    set_string(self.col_pval, 'Date')

    # Menu index 0 = sort by value.
    set_integer(self.sort_pval, 0)
    rendered = execute_wfmodule(self.wf_module)
    self.assertEqual(rendered.to_csv(index=False),
                     'date,count\n2011-07-25,1\n2016-01-10,2\n')

    # Menu index 1 = sort by frequency.
    set_integer(self.sort_pval, 1)
    rendered = execute_wfmodule(self.wf_module)
    self.assertEqual(rendered.to_csv(index=False),
                     'date,count\n2016-01-10,2\n2011-07-25,1\n')
def test_execute_revision_0(self):
    """A brand-new workflow (revision 0, nothing cached) renders cleanly."""
    workflow = create_testdata_workflow(table_csv)
    wf_module2 = load_and_add_module('selectcolumns', workflow=workflow)

    result = execute_wfmodule(wf_module2)

    self.assertEqual(result, ProcessResult(table_dataframe))
    self.assertEqual(cached_render_result_revision_list(workflow), [0, 0])
def test_rename(self):
    """Renaming maps old column names to new ones, leaving others alone."""
    mapping = {'name': 'name1', 'count': 'CNT'}
    self.entries_pval.value = json.dumps(mapping)
    self.entries_pval.save()

    result = execute_wfmodule(self.wf_module)

    expected_table = reference_table.copy()
    expected_table.columns = ['name1', 'date', 'CNT', 'float']
    self.assertEqual(result, ProcessResult(expected_table))
def test_reorder(self):
    """Apply a recorded sequence of column moves, in chronological order."""
    # Input columns: ['name', 'date', 'count', 'float']
    ops = [
        # -> ['count', 'name', 'date', 'float']
        {'column': 'count', 'from': 2, 'to': 0},
        # -> ['count', 'date', 'name', 'float']
        {'column': 'name', 'from': 1, 'to': 2},
        # -> ['count', 'float', 'date', 'name']
        {'column': 'float', 'from': 3, 'to': 1},
    ]
    self.history_pval.value = json.dumps(ops)
    self.history_pval.save()

    result = execute_wfmodule(self.wf_module)
    self.assertEqual(result,
                     reordered_result(['count', 'float', 'date', 'name']))
def test_scrape_table(self):
    """Fetching a table page stores a data version and creates a Delta."""
    url = 'http://test.com/tablepage.html'
    self.url_pval.set_value(url)
    self.url_pval.save()

    # Nothing fetched yet; the workflow has no Deltas.
    self.assertIsNone(self.wfmodule.get_fetched_data_version())
    self.assertIsNone(self.wfmodule.retrieve_fetched_table())
    self.assertIsNone(self.wfmodule.workflow.last_delta)

    with mock.patch('pandas.read_html') as readmock:
        readmock.return_value = [mock_csv_table]
        self.press_fetch_button()
        self.assertEqual(readmock.call_args,
                         mock.call(url, flavor='html5lib'))

    result = execute_wfmodule(self.wfmodule)
    self.assertEqual(result, ProcessResult(mock_csv_table))

    # The fetch must have stored a data version and recorded a Delta
    # representing the change.
    self.wfmodule.refresh_from_db()
    self.wfmodule.workflow.refresh_from_db()
    self.assertIsNotNone(self.wfmodule.get_fetched_data_version())
    self.assertIsNotNone(self.wfmodule.workflow.last_delta)
def test_scrape_list(self):
    """Scrape a newline-separated list of URLs, fixing missing schemes."""
    source_options = "List|Input column".split('|')
    source_pval = get_param_by_id_name('urlsource')
    source_pval.value = source_options.index('List')
    source_pval.save()

    url_lines = [
        'http://a.com/file',
        'https://b.com/file2',
        'c.com/file/dir',  # no 'http://': exercises the URL-fixing path
    ]
    get_param_by_id_name('urllist').set_value('\n'.join(url_lines))

    # Stand-in for the real scraper (tested elsewhere): copy the
    # pre-baked status/html columns into the table in place.
    async def mock_scrapeurls(urls, table):
        table['status'] = self.scraped_table['status']
        table['html'] = self.scraped_table['html']
        return

    with mock.patch('django.utils.timezone.now') as now:
        now.return_value = testnow
        with mock.patch('server.modules.urlscraper.scrape_urls') as scrape:
            scrape.side_effect = mock_scrapeurls
            self.press_fetch_button()

            result = execute_wfmodule(self.wfmodule)
            self.assertEqual(result, ProcessResult(self.scraped_table))
def wfmodule_columns(request, pk, format=None):
    """Return the module's output columns as JSON:
    [{'name': col, 'type': 'String'|'Number'|'Date'}, ...].
    """
    if request.method == 'GET':
        try:
            wf_module = WfModule.objects.get(pk=pk)
        except WfModule.DoesNotExist:
            return HttpResponseNotFound()

        if not wf_module.workflow.user_authorized_read(request.user):
            return HttpResponseForbidden()

        with wf_module.workflow.cooperative_lock():
            table = execute_wfmodule(wf_module)
            dtypes = table.dtypes.to_dict()

            ret_types = []
            for col, dtype in dtypes.items():
                # We are simplifying the data types here. More stuff can
                # be added to these lists if we run into anything new.
                if str(dtype) in ['int64', 'float64', 'bool']:
                    stype = "Number"
                elif str(dtype) in ['datetime64[ns]']:
                    stype = "Date"
                else:
                    stype = "String"
                # Bug fix: datetime columns were previously appended
                # twice -- once as a (col, "Date") tuple and once as the
                # dict below -- corrupting the response payload.
                ret_types.append({
                    "name": col,
                    "type": stype
                })
            return HttpResponse(json.dumps(ret_types),
                                content_type="application/json")
def test_missing_column(self):
    """Moves referencing missing columns or bad indices are no-ops.

    If an input column was removed upstream (e.g. via select columns),
    recorded reorders that refer to it must simply do nothing.
    """
    # Input columns: ['name', 'date', 'count', 'float']
    ops = [
        # -> ['count', 'name', 'date', 'float']
        {'column': 'count', 'from': 2, 'to': 0},
        # column does not exist: no-op
        {'column': 'nonexistent-name', 'from': 4, 'to': 1},
        # 'to' index out of range: no-op
        {'column': 'count', 'from': 0, 'to': 4},
        # -> ['count', 'name', 'float', 'date']
        {'column': 'float', 'from': 3, 'to': 2},
    ]
    self.history_pval.value = json.dumps(ops)
    self.history_pval.save()

    result = execute_wfmodule(self.wf_module)
    self.assertEqual(result,
                     reordered_result(['count', 'name', 'float', 'date']))
def execute_and_notify(wf_module):
    """
    Render (and cache) a WfModule; send websocket updates and return result.
    """
    workflow = wf_module.workflow
    with workflow.cooperative_lock():
        # Snapshot every module's client-visible render attributes...
        priors = {
            m.id: _client_attributes_that_change_on_render(m)
            for m in workflow.wf_modules.all()
        }

        result = execute.execute_wfmodule(wf_module)

        # ...then diff against the post-render state to find what changed.
        changes = {}
        for m in workflow.wf_modules.all():
            current = _client_attributes_that_change_on_render(m)
            if current != priors[m.id]:
                changes[str(m.id)] = current

        if changes:
            websockets.ws_client_send_delta_sync(wf_module.workflow_id, {
                'updateWfModules': changes
            })

        return result
def test_resume_without_rerunning_unneeded_renders(self):
    """Only the stale module is re-rendered; fresh caches are reused."""
    workflow = create_testdata_workflow(table_csv)
    wf_module1 = workflow.wf_modules.first()
    wf_module2 = load_and_add_module('selectcolumns', workflow=workflow,
                                     last_relevant_delta_id=1)
    wf_module1.last_relevant_delta_id = 1
    wf_module1.save()

    # Prime the caches.
    expected = execute_wfmodule(wf_module2)

    # Invalidate wf_module2's cache only.
    wf_module2.refresh_from_db()
    wf_module2.last_relevant_delta_id = 2
    wf_module2.save()

    with mock.patch('server.dispatch.module_dispatch_render') as mdr:
        mdr.return_value = expected
        result = execute_wfmodule(wf_module2)
        # wf_module1's cache was still fresh, so exactly one render ran.
        mdr.assert_called_once()
        self.assertEqual(result, expected)
def test_load_csv_bad_content_type(self):
    """Fall back to filename-based detection when Content-Type is generic.

    raw.githubusercontent.com serves CSVs as text/plain, so the loader
    must rely on the '.csv' extension in the URL.
    """
    url = 'https://raw.githubusercontent.com/user/repo/branch/the.csv'
    self.url_pval.set_value(url)
    self.url_pval.save()

    with patch('requests.get') as get:
        get.return_value = mock_text_response(mock_csv_text, 'text/plain')
        self.press_fetch_button()
        result = execute_wfmodule(self.wfmodule)
        self.assertEqual(result, ProcessResult(mock_csv_table))
def test_nop_with_initial_col_selection(self):
    """Selecting a URL column without fetching passes the input through."""
    source_options = "List of URLs|Load from column".split('|')
    source_pval = get_param_by_id_name('urlsource')
    source_pval.value = source_options.index('Load from column')
    source_pval.save()

    column_pval = get_param_by_id_name('urlcol')
    column_pval.value = 'url'
    column_pval.save()

    result = execute_wfmodule(self.wfmodule)
    self.assertEqual(result, self.expected_url_table_result)
def test_render(self):
    """User-supplied Python code replaces the module's output table."""
    # The code builds a 5x3 DataFrame of column-wise 0..4.
    code = (
        "columns = ['A','B', 'C']\n"
        "data = np.array([np.arange(5)]*3).T\n"
        "return pd.DataFrame(columns=columns, data=data)"
    )
    self.code_pval.string = code
    self.code_pval.save()

    rendered = execute_wfmodule(self.wf_module)
    self.assertEqual(
        str(rendered),
        " A B C\n0 0 0 0\n1 1 1 1\n2 2 2 2\n3 3 3 3\n4 4 4 4"
    )
def table_result(request, wf_module):
    """Render wf_module and return its rows as JSON.

    'startrow'/'endrow' query parameters bound the rows; when absent,
    all rows are returned.
    """
    try:
        startrow = int_or_none(request.GET.get('startrow'))
        endrow = int_or_none(request.GET.get('endrow'))
    except ValueError:
        return Response({'message': 'bad row number', 'status_code': 400},
                        status=status.HTTP_400_BAD_REQUEST)

    with wf_module.workflow.cooperative_lock():
        table = execute_wfmodule(wf_module)
        json_body = make_render_json(table, startrow, endrow)
        return HttpResponse(json_body, content_type="application/json")
def wfmodule_render(request, pk, format=None):
    """Return a WfModule's rendered output as a JSON list of records."""
    if request.method == 'GET':
        try:
            wf_module = WfModule.objects.get(pk=pk)
        except WfModule.DoesNotExist:
            return HttpResponseNotFound()

        if not wf_module.user_authorized(request.user):
            return HttpResponseForbidden()

        table = execute_wfmodule(wf_module)
        payload = table.to_json(orient='records')
        return HttpResponse(payload, content_type="application/json")
def test_load_xlsx(self):
    """Fetching an .xlsx URL parses it into a table."""
    url = 'http://test.com/the.xlsx'
    self.url_pval.set_value(url)
    self.url_pval.save()

    # Fix: close the fixture file (was open(...).read(), which leaked
    # the file handle).
    with open(mock_xlsx_path, "rb") as f:
        xlsx_bytes = f.read()
    xlsx_table = pd.read_excel(mock_xlsx_path)

    with patch('requests.get') as get:
        get.return_value = mock_bytes_response(xlsx_bytes, XLSX_MIME_TYPE)
        self.press_fetch_button()
        result = execute_wfmodule(self.wfmodule)
        self.assertEqual(result, ProcessResult(xlsx_table))
def event(wfm, **kwargs):
    # Fetch handler: assemble the list of URLs to scrape (either typed by
    # the user or read from a column of the input table), scrape them, and
    # commit the resulting url/date/status/html table to the WfModule.
    urls = []
    urlsource = wfm.get_param_menu_string('urlsource')
    if urlsource == 'List':
        # URLs typed by the user, one per line; blank lines are skipped.
        urllist_text = wfm.get_param_string('urllist')
        urllist_raw = urllist_text.split('\n')
        for url in urllist_raw:
            s_url = url.strip()
            if len(s_url) == 0:
                continue
            # Fix in case user adds an URL without http(s) prefix
            if not re.match('^https?://.*', s_url):
                urls.append('http://{}'.format(s_url))
            else:
                urls.append(s_url)
    elif urlsource == 'Input column':
        # get our list of URLs from a column in the input table
        urlcol = wfm.get_param_column('urlcol')
        if urlcol == '':
            # No column chosen yet: nothing to do.
            return
        # Local import -- presumably to avoid a circular import at module
        # load time; confirm before moving to the top of the file.
        from server.execute import execute_wfmodule
        prev_table = execute_wfmodule(wfm.previous_in_stack()).dataframe
        # column parameters are not sanitized here, could be missing
        # this col
        if urlcol in prev_table.columns:
            urls = prev_table[urlcol].tolist()

    if len(urls) > 0:
        # 'date' and 'html' columns start out empty (NaN); the scraper
        # fills 'html'/'status' in place, and 'date' is stamped below.
        table = pd.DataFrame({
            'url': urls,
            'status': ''
        }, columns=['url', 'date', 'status', 'html'])
        event_loop = get_thread_event_loop()
        event_loop.run_until_complete(scrape_urls(urls, table))
    else:
        table = pd.DataFrame()

    # Stamp every row with the fetch time, ISO-8601 with a 'Z' suffix.
    table['date'] = timezone.now().isoformat(timespec='seconds') \
        .replace('+00:00', 'Z')

    result = ProcessResult(dataframe=table)
    # No need to truncate: input is already truncated
    # No need to sanitize: we only added text+date+status
    ModuleImpl.commit_result(wfm, result)
def test_render(self):
    """Count-by-value: NOP without a column, two sort orders, bad column."""
    # No column selected: the input passes through (not empty).
    set_string(self.col_pval, '')
    rendered = execute_wfmodule(self.wf_module)
    self.assertFalse(rendered.empty)

    # Sort counts by value (menu index 0).
    set_string(self.col_pval, 'Amount')
    set_integer(self.sort_pval, 0)
    rendered = execute_wfmodule(self.wf_module)
    self.assertEqual(
        str(rendered), ' Amount count\n0 5 1\n1 10 2')

    # Sort counts by frequency (menu index 1).
    set_integer(self.sort_pval, 1)
    rendered = execute_wfmodule(self.wf_module)
    self.assertEqual(
        str(rendered), ' Amount count\n0 10 2\n1 5 1')

    # Unknown column -> module enters the error state.
    set_string(self.col_pval, 'hilarious')
    execute_wfmodule(self.wf_module)
    self.wf_module.refresh_from_db()
    self.assertEqual(self.wf_module.status, WfModule.ERROR)
def test_first_row_is_header(self):
    """With 'first row is header' on, row 0 becomes the column names."""
    url = 'http://test.com/tablepage.html'
    self.url_pval.set_value(url)
    self.url_pval.save()
    self.first_row_pval.set_value(True)
    self.first_row_pval.save()

    with mock.patch('pandas.read_html') as readmock:
        readmock.return_value = [mock_csv_table]
        self.press_fetch_button()
        self.assertEqual(readmock.call_args,
                         mock.call(url, flavor='html5lib'))

    result = execute_wfmodule(self.wfmodule)

    # Column names are the (stringified) first row of the source table...
    self.assertListEqual(list(result.dataframe.columns),
                         [str(x) for x in mock_csv_table.iloc[0, :]])
    # ...and the data is therefore one row shorter.
    self.assertEqual(len(result.dataframe), len(mock_csv_table) - 1)
def wfmodule_public_output(request, pk, type, format=None):
    """Serve a module's rendered output publicly, as JSON or CSV.

    Note: an unauthorized read returns 404 here (not 403).
    """
    try:
        wf_module = WfModule.objects.get(pk=pk)
    except WfModule.DoesNotExist:
        return HttpResponseNotFound()

    if not wf_module.user_authorized_read(request.user):
        return HttpResponseNotFound()

    table = execute_wfmodule(wf_module)

    if type == 'json':
        return HttpResponse(table.to_json(orient='records'),
                            content_type="application/json")
    if type == 'csv':
        return HttpResponse(table.to_csv(index=False),
                            content_type="text/csv")
    # Unknown output type.
    return HttpResponseNotFound()
def wfmodule_input(request, pk, format=None):
    """Return a WfModule's *input* (the previous module's output) as JSON."""
    if request.method == 'GET':
        try:
            wf_module = WfModule.objects.get(pk=pk)
        except WfModule.DoesNotExist:
            return HttpResponseNotFound()

        if not wf_module.user_authorized(request.user):
            return HttpResponseForbidden()

        # The input is the output of the closest earlier module in the
        # workflow, or an empty table if this module is first.
        prev_modules = WfModule.objects.filter(workflow=wf_module.workflow,
                                               order__lt=wf_module.order)
        if prev_modules:
            table = execute_wfmodule(prev_modules.last())
        else:
            table = pd.DataFrame()

        payload = table.to_json(orient='records')
        return HttpResponse(payload, content_type="application/json")
def test_execute_new_revision(self):
    """Executing after a new delta re-renders and caches at new revisions."""
    workflow = create_testdata_workflow(table_csv)
    wf_module2 = load_and_add_module('selectcolumns', workflow=workflow)

    # Change a parameter, creating a delta that bumps the revision.
    pval = get_param_by_id_name('colnames', wf_module=wf_module2)
    ChangeParameterCommand.create(pval, 'A')

    # Nothing cached yet.
    self.assertEqual(cached_render_result_revision_list(workflow),
                     [None, None])

    wf_module1 = workflow.wf_modules.first()
    wf_module1.last_relevant_delta_id = 1
    wf_module1.save()
    wf_module2.last_relevant_delta_id = 2
    wf_module2.save()

    result = execute_wfmodule(wf_module2)
    self.assertEqual(result, ProcessResult(table_dataframe[['A']]))
    self.assertEqual(cached_render_result_revision_list(workflow), [1, 2])
def test_load_json(self):
    """Fetching a JSON URL parses nested data into a table."""
    url = 'http://test.com/the.json'
    self.url_pval.set_value(url)
    self.url_pval.save()

    # Use a complex example with nested data.
    # Fix: close the fixture file (was open(...).read(), which leaked
    # the file handle).
    fname = os.path.join(settings.BASE_DIR,
                         'server/tests/test_data/sfpd.json')
    with open(fname) as f:
        sfpd_json = f.read()

    # OrderedDict otherwise cols get sorted
    sfpd_table = pd.DataFrame(
        json.loads(sfpd_json, object_pairs_hook=OrderedDict))
    expected = ProcessResult(sfpd_table)
    expected.sanitize_in_place()

    with patch('requests.get') as get:
        get.return_value = mock_text_response(sfpd_json,
                                              'application/json')
        self.press_fetch_button()
        result = execute_wfmodule(self.wfmodule)
        self.assertEqual(result, expected)
def wfmodule_histogram(request, pk, col, format=None):
    # Return value counts of column `col` in this module's *input* table
    # (the output of the closest earlier module), as render JSON.
    if request.method == 'GET':
        try:
            wf_module = WfModule.objects.get(pk=pk)
        except WfModule.DoesNotExist:
            return HttpResponseNotFound()

        if not wf_module.workflow.user_authorized_read(request.user):
            return HttpResponseForbidden()

        # Temporary column name for the counts; unlikely to collide with
        # a real column.
        INTERNAL_COUNT_COLNAME = '__internal_count_column__'

        # The histogram is computed over the previous module's output.
        prev_modules = WfModule.objects.filter(workflow=wf_module.workflow,
                                               order__lt=wf_module.order)
        if not prev_modules:
            # First module in the workflow: no input, empty histogram.
            return HttpResponse(make_render_json(pd.DataFrame()),
                                content_type="application/json")
        table = execute_wfmodule(prev_modules.last())

        if col not in table.columns:
            return Response({'message':
                             'Column does not exist in module input',
                             'status_code': 400},
                            status=status.HTTP_400_BAD_REQUEST)

        # Count rows per value; sort by count (desc), then value (asc).
        hist_table = table.groupby(col).size().reset_index()
        hist_table.columns = [col, INTERNAL_COUNT_COLNAME]
        hist_table = hist_table.sort_values(by=[INTERNAL_COUNT_COLNAME,
                                                col],
                                            ascending=[False, True])
        # Stringify values so the client renders them uniformly.
        hist_table[col] = hist_table[col].astype(str)

        return HttpResponse(make_render_json(hist_table),
                            content_type="application/json")
def wfmodule_output(request, pk, format=None):
    """Serve a module's embeddable HTML output with its data injected.

    A <script> tag defining `workbench` (the module's input table and
    parameters) is spliced in immediately after the HTML's <head> tag.
    """
    if request.method == 'GET':
        try:
            wf_module = WfModule.objects.get(pk=pk)
        except WfModule.DoesNotExist:
            return HttpResponseNotFound()

        if not wf_module.workflow.user_authorized_read(request.user):
            return HttpResponseForbidden()

        table = execute_wfmodule(wf_module)
        html, input_data, params = module_dispatch_output(wf_module, table,
                                                          request=request)

        input_data_json = make_render_json(input_data)
        init_data = json.dumps({
            'input': json.loads(input_data_json),
            'params': params
        })
        js = """
        <script>
        var workbench = %s
        </script>""" % init_data

        # Bug fix: the old pattern '<\\w*[H|h][E|e][A|a][D|d]\\w*>' put a
        # literal '|' inside each character class and matched any tag
        # whose name merely contained 'head' (e.g. <thread>). Match an
        # actual <head ...> tag, case-insensitively.
        head_tag_pattern = re.compile(r'<head[^>]*>', re.IGNORECASE)
        match = head_tag_pattern.search(html)
        if match:
            insert_at = match.end()
            modified_html = '%s %s %s' % (
                html[:insert_at], js, html[insert_at:]
            )
        else:
            # Bug fix: previously a missing <head> crashed with
            # AttributeError on None.end(). Serve the HTML unmodified.
            modified_html = html

        return HttpResponse(content=modified_html)