def parse_csv(
    path: Path,
    *,
    output_path: Path,
    encoding: Optional[str],
    delimiter: Optional[str],
    has_header: bool,
    autoconvert_text_to_numbers: bool,
) -> RenderResult:
    """Parse the CSV at `path` and write its table to `output_path`.

    Parsing itself is delegated to `_parse_csv`; this wrapper persists the
    resulting table in Arrow IPC "file" format and converts any parse
    warnings into RenderErrors on the returned RenderResult.
    """
    parsed = _parse_csv(
        path,
        encoding=encoding,
        delimiter=delimiter,
        has_header=has_header,
        autoconvert_text_to_numbers=autoconvert_text_to_numbers,
    )

    # Persist the parsed table to disk as an Arrow IPC file.
    with pyarrow.ipc.RecordBatchFileWriter(
        output_path.as_posix(), schema=parsed.table.schema
    ) as writer:
        writer.write_table(parsed.table)

    metadata = infer_table_metadata(parsed.table)
    if metadata.columns:
        arrow_table = ArrowTable(output_path, parsed.table, metadata)
    else:
        # Zero-column result: don't point the ArrowTable at the file.
        arrow_table = ArrowTable()

    # TODO when we support i18n, this will be even simpler....
    errors = []
    if parsed.warnings:
        en_message = "\n".join(str(warning) for warning in parsed.warnings)
        errors.append(RenderError(I18nMessage.TODO_i18n(en_message)))
    return RenderResult(arrow_table, errors)
def test_render_xlsx_bad_content(self):
    # Feed render_arrow an HTTP response whose body claims to be .xlsx but
    # is plain text; the parse failure must surface as a RenderError.
    with tempfile_context("fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", XLSX_MIME_TYPE)],
            # Body is UTF-8 prose, not an Excel workbook.
            io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8")),
        )
        result = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(http_path),
            self.output_path,
        )
        # Output table is empty; the error message embeds the first bytes
        # of the bogus file (UTF-8 encoded, repr-escaped).
        self.assertEqual(
            result,
            RenderResult(
                ArrowTable(),
                [
                    RenderError(
                        I18nMessage.TODO_i18n(
                            'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
                        )
                    )
                ],
            ),
        )
def test_metadata_comes_from_db_columns(self):
    # Cached-render-result metadata must be readable from the database
    # alone: deleting the Parquet file from storage must not matter.
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Datetime()),
        Column("C", ColumnType.Text()),
    ]
    result = RenderResult(
        arrow_table(
            {
                "A": [1],
                "B": pa.array([datetime.datetime.now()], pa.timestamp("ns")),
                "C": ["x"],
            },
            columns=columns,
        )
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    # Delete from disk entirely, to prove we did not read.
    minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))

    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
    cached_result = fresh_wf_module.cached_render_result

    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
def test_double_clear(self):
    """Clearing an already-cleared cached render result must not crash."""
    errors = [RenderError(I18nMessage("X", []), [])]
    cached = RenderResult(arrow_table({"A": [1]}), errors, {})
    cache_render_result(self.workflow, self.wf_module, self.delta.id, cached)
    clear_cached_render_result_for_wf_module(self.wf_module)
    clear_cached_render_result_for_wf_module(self.wf_module)  # don't crash
def test_fetch_result_happy_path(self):
    # When a StoredObject exists, execute_wfmodule must hand render() a
    # FetchResult whose errors come from wf_module.fetch_error and whose
    # path holds the stored Parquet data.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
        fetch_error="maybe an error",
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, wf_module.id, path)
        wf_module.stored_data_version = so.stored_at
        wf_module.save(update_fields=["stored_data_version"])

    def render(*args, fetch_result, **kwargs):
        # fetch_error is translated into a RenderError on the FetchResult.
        self.assertEqual(
            fetch_result.errors,
            [RenderError(I18nMessage.TODO_i18n("maybe an error"))],
        )
        # The FetchResult file must contain the stored Parquet table.
        assert_arrow_table_equals(
            pyarrow.parquet.read_table(str(fetch_result.path)), {"A": [1]}
        )
        return RenderResult()

    with self._stub_module(render):
        self.run_with_async_db(
            execute_wfmodule(
                workflow,
                wf_module,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                Path("/unused"),
            )
        )
def test_double_clear(self):
    """A second clear of the same step's render cache must be a no-op."""
    render_errors = [RenderError(I18nMessage("X", {}, None), [])]
    cached = RenderResult(arrow_table({"A": [1]}), render_errors, {})
    cache_render_result(self.workflow, self.step, 1, cached)
    clear_cached_render_result_for_step(self.step)
    clear_cached_render_result_for_step(self.step)  # don't crash
def open_cached_render_result(
        crr: CachedRenderResult) -> ContextManager[RenderResult]:
    """
    Yield a RenderResult equivalent to the one passed to `cache_render_result()`.

    Raise CorruptCacheError if the cached data does not match `crr`. That
    can mean:

    * The cached Parquet file is corrupt
    * The cached Parquet file is missing
    * `crr` is stale -- the cached result is for a different delta. This
      could be detected by a `Workflow.cooperative_lock()`, too, should the
      caller want to distinguish this error from the others.

    The returned RenderResult is backed by an mmapped file on disk, so it
    doesn't require much physical RAM.
    """
    # NOTE(review): this is a generator-style context manager -- presumably
    # decorated with @contextmanager at the definition site (not visible in
    # this chunk) -- TODO confirm.
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        yield RenderResult(
            ArrowTable.from_zero_column_metadata(
                TableMetadata(crr.table_metadata.n_rows, [])),
            crr.errors,
            crr.json,
        )
        return

    with tempfile_context(prefix="cached-render-result") as arrow_path:
        # raise CorruptCacheError (deleting `arrow_path` in the process)
        result = load_cached_render_result(crr, arrow_path)

        yield result
def test_render_empty_file_fetch_result_is_parquet(self):
    """A zero-byte fetched file must render as an empty table."""

    def render(*args, fetch_result):
        return fetch_result.dataframe

    with tempfile_context(dir=self.basedir) as empty_path:
        rendered = self._test_render(render, fetch_result=FetchResult(empty_path))
        assert_render_result_equals(rendered, RenderResult(arrow_table({})))
def test_fetch_result_no_stored_object_means_none(self):
    # With no StoredObject at all, the module's render() must receive
    # fetch_result=None.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    module_zipfile = create_module_zipfile(
        "x",
        python_code=textwrap.dedent(
            """
            import pandas as pd
            def render(table, params, *, fetch_result, **kwargs):
                assert fetch_result is None
                return pd.DataFrame()
            """
        ),
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
def test_execute_tempdir_not_in_tmpfs(self, fake_load_module):
    # /tmp is RAM; /var/tmp is disk. Assert big files go on disk.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    delta1 = workflow.last_delta
    ModuleVersion.create_or_replace_from_spec({
        "id_name": "mod",
        "name": "Mod",
        "category": "Clean",
        "parameters": []
    })
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        # Stale delta id -- presumably this forces a re-render so that
        # render() actually runs. TODO confirm against _execute().
        last_relevant_delta_id=delta1.id - 1,
        module_id_name="mod",
    )
    result2 = RenderResult(arrow_table({"B": [2]}))
    fake_load_module.return_value.migrate_params.return_value = {}
    fake_load_module.return_value.render.return_value = result2
    self._execute(workflow)
    # The basedir passed to render() must live under /var/tmp (disk).
    self.assertRegex(
        str(fake_load_module.return_value.render.call_args[1]["basedir"]),
        r"^/var/tmp/",
    )
def test_wf_module_render_clip_out_of_bounds(self):
    """Out-of-range startrow/endrow query params are clipped to the table."""
    cache_render_result(
        self.workflow,
        self.wf_module2,
        self.wf_module2.last_relevant_delta_id,
        RenderResult(arrow_table({"A": [0, 1]})),
    )
    # index out of bounds should clip
    url = "/api/wfmodules/%d/render?startrow=-1&endrow=500" % self.wf_module2.id
    response = self.client.get(url)
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    body = json.loads(response.content)
    self.assertEqual(
        body,
        {"start_row": 0, "end_row": 2, "rows": [{"A": 0}, {"A": 1}]},
    )
def test_duplicate_nonempty_rendered_tab(self, send_update, queue_render):
    send_update.side_effect = async_noop
    queue_render.side_effect = async_noop
    workflow = Workflow.create_and_init()
    init_delta_id = workflow.last_delta_id
    tab = workflow.tabs.first()
    # step1 and step2 have not yet been rendered. (But while we're
    # duplicating, conceivably a render could be running; so when we
    # duplicate them, we need to queue a render.)
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        params={"p": "s1"},
        last_relevant_delta_id=init_delta_id,
    )
    render_result = RenderResult(arrow_table({"A": [1]}))
    cache_render_result(workflow, step1, init_delta_id, render_result)

    cmd = self.run_with_async_db(
        commands.do(
            DuplicateTab,
            workflow_id=workflow.id,
            from_tab=tab,
            slug="tab-2",
            name="Tab 2",
        )
    )
    tab2 = workflow.tabs.last()
    self.assertNotEqual(tab2.id, tab.id)
    step2 = tab2.steps.last()
    # We need to render: render() in Steps in the second Tab will be called
    # with different `tab_name` than in the first Tab, meaning their output
    # may be different.
    self.assertIsNone(step2.cached_render_result)
    queue_render.assert_called_with(workflow.id, cmd.id)
def test_wf_module_render_null_datetime(self):
    # Ran into problems 2019-09-06, when switching to Arrow
    cache_render_result(
        self.workflow,
        self.wf_module2,
        self.wf_module2.last_relevant_delta_id,
        RenderResult(
            arrow_table({
                "A": pa.array(
                    [dt(2019, 1, 2, 3, 4, 5, 6007, None), None],
                    pa.timestamp("ns"),
                )
            })
        ),
    )
    response = self.client.get("/api/wfmodules/%d/render" % self.wf_module2.id)
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    # A null timestamp must serialize as JSON null; a non-null one as an
    # ISO-8601 string with trailing "Z".
    self.assertEqual(
        json.loads(response.content)["rows"],
        [{"A": "2019-01-02T03:04:05.006007Z"}, {"A": None}],
    )
def test_fetch_result_no_stored_object_means_none(self):
    """render() must receive fetch_result=None when nothing was fetched."""
    workflow = Workflow.create_and_init()
    first_tab = workflow.tabs.first()
    step = first_tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )

    def fake_render(*args, fetch_result, **kwargs):
        self.assertIsNone(fetch_result)
        return RenderResult()

    with self._stub_module(fake_render):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                step,
                {},
                Tab(first_tab.slug, first_tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
def test_fetch_result_no_bucket_or_key_stored_object_means_none(self):
    # A StoredObject row with empty bucket/key (a placeholder) must be
    # treated like no fetch at all: render() gets fetch_result=None.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
        stored_data_version=timezone.now(),
    )
    wf_module.stored_objects.create(
        stored_at=wf_module.stored_data_version,
        bucket="",
        key="",
        size=0,
        hash="whatever",
    )

    def render(*args, fetch_result, **kwargs):
        self.assertIsNone(fetch_result)
        return RenderResult()

    with self._stub_module(render):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
def test_fetch_result_deleted_file_means_none(self):
    # If the DB points at a StoredObject whose S3 file has been deleted,
    # render() must receive fetch_result=None rather than crashing.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, wf_module.id, path)
        wf_module.stored_data_version = so.stored_at
        wf_module.save(update_fields=["stored_data_version"])
    # Now delete the file on S3 -- but leave the DB pointing to it.
    minio.remove(so.bucket, so.key)

    def render(*args, fetch_result, **kwargs):
        self.assertIsNone(fetch_result)
        return RenderResult()

    with self._stub_module(render):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
def test_fetch_result_deleted_stored_object_means_none(self):
    # stored_data_version pointing at a nonexistent StoredObject must be
    # tolerated: render() gets fetch_result=None.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
        # wf_module.stored_data_version is buggy: it can point at a nonexistent
        # StoredObject. Let's do that.
        stored_data_version=timezone.now(),
    )

    def render(*args, fetch_result, **kwargs):
        self.assertIsNone(fetch_result)
        return RenderResult()

    with self._stub_module(render):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
def test_missing_cached_response_sends_503(self, queue_render):
    # A stale render cache (step's delta id moved past the cached one)
    # must yield a 503 with a self-reloading page, and queue a render.
    queue_render.side_effect = async_noop
    create_module_zipfile(
        "chart", spec_kwargs={"html_output": True}, html="hi", version="develop"
    )
    cache_render_result(
        self.workflow,
        self.step,
        1,
        RenderResult(arrow_table({"A": [1]}), json={}),
    )
    # Make the cached result (delta 1) stale relative to the step/workflow.
    self.step.last_relevant_delta_id = 2
    self.step.save(update_fields=["last_relevant_delta_id"])
    self.workflow.last_delta_id = 3
    self.workflow.save(update_fields=["last_delta_id"])
    with self.assertLogs("cjwstate.params", level="INFO"):  # migrate_params() from jsonize
        with self.assertLogs("django.request", level="ERROR"):  # 503
            response = self._request()
    self.assertEqual(response.status_code, status.SERVICE_UNAVAILABLE)
    # The 503 page asks the browser to retry.
    self.assertIn(b"window.location.reload()", response.content)
    queue_render.assert_called_with(self.workflow.id, 3)
def load_module_and_delete(module_version):
    """Stub loader that deletes the workflow, then returns a fake module.

    Relies on `workflow` from the enclosing scope.
    """
    Workflow.objects.filter(id=workflow.id).delete()
    stub = Mock(LoadedModule)
    stub.migrate_params.return_value = {}
    stub.render.return_value = RenderResult(arrow_table({"A": [1]}))
    return stub
def test_null_timestamp(self):
    # Ran into problems 2019-09-06, when switching to Arrow
    cache_render_result(
        self.workflow,
        self.step2,
        self.step2.last_relevant_delta_id,
        RenderResult(
            arrow_table({
                "A": pa.array(
                    [dt(2019, 1, 2, 3, 4, 5, 6007, None), None],
                    pa.timestamp("ns"),
                )
            })
        ),
    )
    response = self._request_step(self.step2)
    self.assertEqual(response.status_code, 200)
    # Null timestamp serializes as JSON null; non-null as ISO-8601 + "Z".
    self.assertEqual(
        read_streaming_json(response),
        [{"A": "2019-01-02T03:04:05.006007Z"}, {"A": None}],
    )
def test_email_no_delta_when_not_changed(self, email):
    # A re-render that produces identical output must not email the user,
    # even with notifications=True.
    workflow = Workflow.objects.create()
    tab = workflow.tabs.create(position=0)
    delta1 = InitWorkflowCommand.create(workflow)
    create_module_zipfile(
        "mod",
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [1]})',
    )
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=delta1.id,
        module_id_name="mod",
        notifications=True,
    )
    cache_render_result(
        workflow, wf_module, delta1.id, RenderResult(arrow_table({"A": [1]}))
    )
    # Make a new delta, so we need to re-render. Give it the same output.
    delta2 = InitWorkflowCommand.create(workflow)
    wf_module.last_relevant_delta_id = delta2.id
    wf_module.save(update_fields=["last_relevant_delta_id"])

    self._execute(workflow)

    email.assert_not_called()
def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress
    #
    # A module with no render() of its own: the default render must output
    # the fetched (Parquet) table, warnings included.
    with ExitStack() as ctx:
        input_arrow_table = ctx.enter_context(
            arrow_table_context({"A": [1]}, dir=self.basedir)
        )
        # FetchResult wants a filename relative to basedir.
        parquet_filename = Path(
            ctx.enter_context(parquet_file({"A": [2]}, dir=self.basedir)).name
        ).name
        out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
        thrift_result = module.render_thrift(
            ttypes.RenderRequest(
                str(self.basedir),
                input_arrow_table.to_thrift(),
                Params({}).to_thrift(),
                ttypes.Tab("tab-1", "Tab 1"),
                ttypes.FetchResult(
                    parquet_filename,
                    [RenderError(I18nMessage.TODO_i18n("A warning")).to_thrift()],
                ),
                out_filename,
            )
        )
        result = RenderResult.from_thrift(thrift_result, self.basedir)
        # Output is the *fetched* table ({"A": [2]}), not the input table,
        # and the fetch warning is carried through.
        assert_render_result_equals(
            result,
            RenderResult(
                arrow_table({"A": [2]}),
                [RenderError(I18nMessage.TODO_i18n("A warning"))],
            ),
        )
def test_clean_tabs_happy_path(self):
    """Multitab param values resolve to TabOutputs, in request order."""
    tab2 = Tab("tab-2", "Tab 2")
    tab3 = Tab("tab-3", "Tab 3")
    output2 = arrow_table({"B": [1]})
    output3 = arrow_table({"C": [1]})
    context = self._render_context(
        tab_results={tab2: RenderResult(output2), tab3: RenderResult(output3)}
    )
    cleaned = clean_value(ParamDType.Multitab(), ["tab-2", "tab-3"], context)
    self.assertEqual(
        cleaned, [TabOutput(tab2, output2), TabOutput(tab3, output3)]
    )
def test_email_delta(self, email_delta):
    # With notifications=True, a re-render that changes the output must
    # send an email delta carrying both old and new results.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    module_zipfile = create_module_zipfile(
        "x",
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [2]})',
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
    email_delta.assert_called()
    delta = email_delta.call_args[0][0]
    self.assertEqual(delta.user, workflow.owner)
    self.assertEqual(delta.workflow, workflow)
    self.assertEqual(delta.wf_module, wf_module)
    self.assertEqual(delta.old_result, RenderResult(arrow_table({"A": [1]})))
    self.assertEqual(delta.new_result, RenderResult(arrow_table({"A": [2]})))
def test_email_delta_when_stale_crr_is_unreachable(self, email_delta, read_cache):
    # A zero-column cached result writes no Parquet file, so the stale
    # cache can't be read for diffing -- but the notification must still
    # be sent because the module produced new data.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        step,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({})),  # does not write a Parquet file
    )
    step.last_relevant_delta_id = workflow.last_delta_id
    step.save(update_fields=["last_relevant_delta_id"])
    module_zipfile = create_module_zipfile(
        "x",
        spec_kwargs={"loads_data": True},
        # returns different data
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [2]})',
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_step(
                self.chroot_context,
                workflow,
                step,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
    read_cache.assert_not_called()  # it would give CorruptCacheError
    email_delta.assert_called()  # there's new data
def render(*args, fetch_result, **kwargs):
    """Stub render(): verify the FetchResult, then return an empty result.

    Closes over `self` from the enclosing test method.
    """
    expected_errors = [RenderError(I18nMessage.TODO_i18n("maybe an error"))]
    self.assertEqual(fetch_result.errors, expected_errors)
    fetched = pyarrow.parquet.read_table(str(fetch_result.path))
    assert_arrow_table_equals(fetched, {"A": [1]})
    return RenderResult()
def test_email_delta_ignore_corrupt_cache_error(self, email_delta, read_cache):
    # If reading the old cached result raises CorruptCacheError we can't
    # compute a delta, so no email is sent -- even though output changed.
    read_cache.side_effect = rendercache.CorruptCacheError
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    module_zipfile = create_module_zipfile(
        "x",
        # returns different data -- but CorruptCacheError means we won't care.
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [2]})',
    )
    with self.assertLogs(level=logging.ERROR):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
    email_delta.assert_not_called()
def test_render_json(self):
    """JSON returned by render_arrow_v1 lands in the RenderResult."""

    def render_arrow_v1(table, params, **kwargs):
        return ArrowRenderResult(make_table(), [], {"json": ["A-", 1]})

    with ModuleTestEnv(render_arrow_v1=render_arrow_v1) as env:
        outcome = env.call_render(make_table(), {})
        self.assertEqual(
            outcome.result, RenderResult([], {"json": ["A-", 1]})
        )
def test_render_with_parquet_fetch_result(self):
    """A render() that returns its FetchResult outputs the fetched table."""

    def render(*args, fetch_result):
        return fetch_result

    with parquet_file({"A": ["fetched"]}, dir=self.basedir) as pf:
        rendered = self._test_render(render, fetch_result=FetchResult(pf))
        assert_render_result_equals(
            rendered, RenderResult(arrow_table({"A": ["fetched"]}))
        )
def test_email_delta(self, email_delta):
    # A changed re-render with notifications=True must send an email delta
    # containing both the old and the new RenderResult.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    with arrow_table_context({"A": [2]}) as table2:

        def render(*args, **kwargs):
            return RenderResult(table2)

        with self._stub_module(render):
            self.run_with_async_db(
                execute_wfmodule(
                    self.chroot_context,
                    workflow,
                    wf_module,
                    {},
                    Tab(tab.slug, tab.name),
                    RenderResult(),
                    {},
                    self.output_path,
                )
            )
        email_delta.assert_called()
        delta = email_delta.call_args[0][0]
        self.assertEqual(delta.user, workflow.owner)
        self.assertEqual(delta.workflow, workflow)
        self.assertEqual(delta.wf_module, wf_module)
        self.assertEqual(delta.old_result, RenderResult(arrow_table({"A": [1]})))
        self.assertEqual(delta.new_result, RenderResult(arrow_table({"A": [2]})))