def test_render_using_tab_output(self):
    def render(table, params):
        self.assertEqual(params["tabparam"].name, "Tab 1")
        self.assertEqual(
            params["tabparam"].columns,
            {
                "X": ptypes.RenderColumn("X", "number", "{:,d}"),
                "Y": ptypes.RenderColumn("Y", "text", None),
            },
        )
        assert_frame_equal(
            params["tabparam"].dataframe, pd.DataFrame({"X": [1], "Y": ["y"]})
        )

    param_schema = ParamSchema.Dict({"tabparam": ParamSchema.Tab()})
    with ModuleTestEnv(param_schema=param_schema, render=render) as env:
        with arrow_table_context(
            make_column("X", [1], format="{:,d}"),
            make_column("Y", ["y"]),
            dir=env.basedir,
        ) as (path, _):
            env.call_render(
                make_table(),
                params={"tabparam": "tab-1"},
                tab_outputs={
                    "tab-1": TabOutput(tab_name="Tab 1", table_filename=path.name)
                },
            )
def test_render_using_tab_output(self):
    def render(table, params):
        self.assertEqual(params["tabparam"].slug, "tab-1")
        self.assertEqual(params["tabparam"].name, "Tab 1")
        self.assertEqual(
            params["tabparam"].columns,
            {
                "X": ptypes.RenderColumn("X", "number", "{:,d}"),
                "Y": ptypes.RenderColumn("Y", "text", None),
            },
        )
        assert_frame_equal(
            params["tabparam"].dataframe, pd.DataFrame({"X": [1], "Y": ["y"]})
        )

    with arrow_table_context(
        {"X": [1], "Y": ["y"]},
        columns=[
            Column("X", ColumnType.Number("{:,d}")),
            Column("Y", ColumnType.Text()),
        ],
        dir=self.basedir,
    ) as atable:
        self._test_render(
            render, params={"tabparam": TabOutput(Tab("tab-1", "Tab 1"), atable)}
        )
def test_render_exception(self):
    module = self.kernel.compile(
        MockPath(
            ["foo.py"],
            b"import os\ndef render(table, params): raise RuntimeError('fail')",
        ),
        "foo",
    )
    with self.assertRaises(ModuleExitedError) as cm:
        with arrow_table_context({"A": [1]}, dir=self.basedir) as input_table:
            input_table.path.chmod(0o644)
            with self.chroot_context.tempfile_context(
                prefix="output-", dir=self.basedir
            ) as output_path:
                self.kernel.render(
                    module,
                    self.chroot_context,
                    self.basedir,
                    input_table,
                    types.Params({"m": 2.5, "s": "XX"}),
                    types.Tab("tab-1", "Tab 1"),
                    None,
                    output_filename=output_path.name,
                )
    self.assertEqual(cm.exception.exit_code, 1)  # Python exit code
    self.assertRegex(cm.exception.log, r"\bRuntimeError\b")
    self.assertRegex(cm.exception.log, r"\bfail\b")
    # Regression test: [2019-10-02], the "pyspawner_main()->spawn_child()"
    # process would raise _another_ exception while exiting. It would try to
    # close an already-closed socket.
    self.assertNotRegex(cm.exception.log, r"Bad file descriptor")
def _test_render(
    self,
    render_fn,
    arrow_table_dict={},
    arrow_table=None,
    params={},
    tab=Tab("tab-1", "Tab 1"),
    fetch_result=None,
    output_filename=None,
):
    with ExitStack() as ctx:
        if arrow_table is None:
            arrow_table = ctx.enter_context(
                arrow_table_context(arrow_table_dict, dir=self.basedir)
            )
        ctx.enter_context(patch.object(module, "render", render_fn))
        out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
        thrift_result = module.render_thrift(
            ttypes.RenderRequest(
                str(self.basedir),
                arrow_arrow_table_to_thrift(arrow_table),
                arrow_params_to_thrift(Params(params)),
                arrow_tab_to_thrift(tab),
                arrow_fetch_result_to_thrift(fetch_result)
                if fetch_result is not None
                else None,
                out_filename,
            )
        )
        return thrift_render_result_to_arrow(thrift_result, self.basedir)
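# Hypothetical usage sketch (not part of the original suite): the
# _test_render() helper above lets a test exercise module.render_thrift()
# with just a render function and an input-table dict. The render body and
# assertion here are illustrative assumptions, modeled on the tests below.
def test_example_render_via_helper(self):
    def render(table, params):
        return pd.DataFrame({"A": [1]})

    result = self._test_render(render, arrow_table_dict={"A": [1]})
    self.assertEqual([c.name for c in result.table.metadata.columns], ["A"])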
def test_metadata_does_not_require_file_read(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Timestamp()),
        Column("C", ColumnType.Text()),
        Column("D", ColumnType.Date("month")),
    ]
    with arrow_table_context(
        make_column("A", [1], format="{:,.2f}"),
        make_column("B", [datetime.datetime(2021, 4, 13)]),
        make_column("C", ["c"]),
        make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
    ) as (path, table):
        result = LoadedRenderResult(
            path=path, table=table, columns=columns, errors=[], json={}
        )
        cache_render_result(self.workflow, self.step, 1, result)
    # Delete from disk entirely, to prove we did not read.
    s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_step = Step.objects.get(id=self.step.id)
    cached_result = fresh_step.cached_render_result

    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
def test_render_with_input_columns(self):
    def render(*args, input_columns):
        self.assertEqual(
            input_columns,
            {
                "A": ptypes.RenderColumn("A", "text", None),
                "B": ptypes.RenderColumn("B", "number", "{:,.3f}"),
                "C": ptypes.RenderColumn("C", "datetime", None),
            },
        )

    with arrow_table_context(
        {
            "A": ["x"],
            "B": [1],
            "C": pa.array([datetime.now()], pa.timestamp("ns")),
        },
        columns=[
            Column("A", ColumnType.Text()),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Datetime()),
        ],
        dir=self.basedir,
    ) as arrow_table:
        self._test_render(render, arrow_table=arrow_table)
def test_render_tab_outputs(self):
    def render_arrow_v1(table, params, *, tab_outputs, **kwargs):
        self.assertEqual(params["tab"], "tab-x")
        self.assertEqual(tab_outputs["tab-x"].tab_name, "Tab X")
        assert_arrow_table_equals(
            tab_outputs["tab-x"].table,
            make_table(
                make_column("X", [1], format="{:,d}"),
                make_column("Y", ["y"]),
            ),
        )
        return ArrowRenderResult(make_table())

    param_schema = ParamSchema.Dict({"tab": ParamSchema.Tab()})
    with ModuleTestEnv(
        param_schema=param_schema, render_arrow_v1=render_arrow_v1
    ) as env:
        with arrow_table_context(
            make_column("X", [1], format="{:,d}"),
            make_column("Y", ["y"]),
            dir=env.basedir,
        ) as (path, _):
            env.call_render(
                make_table(),
                params={"tab": "tab-x"},
                tab_outputs={
                    "tab-x": TabOutput(tab_name="Tab X", table_filename=path.name)
                },
            )
def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress
    with ExitStack() as ctx:
        input_arrow_table = ctx.enter_context(
            arrow_table_context({"A": [1]}, dir=self.basedir)
        )
        parquet_filename = Path(
            ctx.enter_context(parquet_file({"A": [2]}, dir=self.basedir)).name
        ).name
        out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
        thrift_result = module.render_thrift(
            ttypes.RenderRequest(
                str(self.basedir),
                input_arrow_table.to_thrift(),
                Params({}).to_thrift(),
                ttypes.Tab("tab-1", "Tab 1"),
                ttypes.FetchResult(
                    parquet_filename,
                    [RenderError(I18nMessage.TODO_i18n("A warning")).to_thrift()],
                ),
                out_filename,
            )
        )
        result = RenderResult.from_thrift(thrift_result, self.basedir)
        assert_render_result_equals(
            result,
            RenderResult(
                arrow_table({"A": [2]}),
                [RenderError(I18nMessage.TODO_i18n("A warning"))],
            ),
        )
def test_render_use_input_columns_as_try_fallback_columns(self):
    def render(*args, input_columns):
        return pd.DataFrame({"A": [1]})

    with arrow_table_context(
        {"A": [1]},
        [Column("A", ColumnType.Number("{:,.3f}"))],
        dir=self.basedir,
    ) as arrow_table:
        result = self._test_render(render, arrow_table=arrow_table)
        self.assertEqual(
            result.table.metadata.columns,
            [Column("A", ColumnType.Number("{:,.3f}"))],
        )
def inner(
    module_zipfile,
    *,
    chroot_context,
    basedir,
    input_table,
    params,
    tab,
    fetch_result,
    output_filename,
):
    output_path = basedir / output_filename
    with arrow_table_context(arrow_table_dict) as arrow_table:
        shutil.copy(arrow_table.path, output_path)
        return RenderResult(table=replace(arrow_table, path=output_path))
def test_invalid_parquet_is_corrupt_cache_error(self):
    with arrow_table_context(make_column("A", ["x"])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Text())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)
    crr = self.step.cached_render_result
    s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
    with tempfile_context() as arrow_path:
        with self.assertRaises(CorruptCacheError):
            with open_cached_render_result(crr) as loaded:
                pass
def call_render(
    self,
    table: pa.Table,
    params: Dict[str, Any],
    tab_name: str = "Tab 1",
    tab_outputs: Dict[str, TabOutput] = {},
    fetch_result: Optional[FetchResult] = None,
    uploaded_files: Dict[str, UploadedFile] = {},
) -> RenderOutcome:
    """Conveniently call the module's `render_thrift()`.

    The calling convention is designed for ease of testing.
    """
    # tempfile will be deleted in __exit__().
    fd, output_filename = mkstemp(prefix="out-", suffix=".arrow", dir=self.basedir)
    os.close(fd)
    output_path = Path(output_filename)
    with arrow_table_context(table, dir=self.basedir) as (input_path, _):
        old_cwd = os.getcwd()
        os.chdir(self.basedir)
        try:
            thrift_result = cjwkernel.pandas.module.render_thrift(
                ttypes.RenderRequest(
                    basedir=self.basedir,
                    input_filename=input_path.name,
                    params=pydict_to_thrift_json_object(params),
                    tab_name=tab_name,
                    tab_outputs={
                        k: arrow_tab_output_to_thrift(v)
                        for k, v in tab_outputs.items()
                    },
                    fetch_result=(
                        arrow_fetch_result_to_thrift(fetch_result)
                        if fetch_result is not None
                        else None
                    ),
                    uploaded_files={
                        k: arrow_uploaded_file_to_thrift(v)
                        for k, v in uploaded_files.items()
                    },
                    output_filename=output_path.name,
                )
            )
        finally:
            os.chdir(old_cwd)
    arrow_result = thrift_render_result_to_arrow(thrift_result)
    return RenderOutcome(arrow_result, output_path)
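# Hypothetical usage sketch (not in the original suite): inside a
# ModuleTestEnv, call_render() above hides the Thrift plumbing so a test only
# supplies a table and params. The echoing module body is an illustrative
# assumption, patterned on the ModuleTestEnv tests earlier in this file.
def render_arrow_v1(table, params, **kwargs):
    return ArrowRenderResult(table)  # echo the input table back

with ModuleTestEnv(render_arrow_v1=render_arrow_v1) as env:
    outcome = env.call_render(make_table(make_column("A", [1])), params={})
    # outcome is a RenderOutcome wrapping the Thrift result and output path.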
def test_read_cached_render_result_slice_as_text_timestamp(self):
    with arrow_table_context(
        make_column("A", [2134213412341232967, None], pa.timestamp("ns"))
    ) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Timestamp())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)
    crr = self.step.cached_render_result
    self.assertEqual(
        read_cached_render_result_slice_as_text(crr, "csv", range(2), range(3)),
        "A\n2037-08-18T13:03:32.341232967Z\n",
    )
def test_email_delta(self, email_delta):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])

    with arrow_table_context({"A": [2]}) as table2:

        def render(*args, **kwargs):
            return RenderResult(table2)

        with self._stub_module(render):
            self.run_with_async_db(
                execute_wfmodule(
                    self.chroot_context,
                    workflow,
                    wf_module,
                    {},
                    Tab(tab.slug, tab.name),
                    RenderResult(),
                    {},
                    self.output_path,
                )
            )

    email_delta.assert_called()
    delta = email_delta.call_args[0][0]
    self.assertEqual(delta.user, workflow.owner)
    self.assertEqual(delta.workflow, workflow)
    self.assertEqual(delta.wf_module, wf_module)
    self.assertEqual(delta.old_result, RenderResult(arrow_table({"A": [1]})))
    self.assertEqual(delta.new_result, RenderResult(arrow_table({"A": [2]})))
def inner(
    module_zipfile,
    *,
    chroot_context,
    basedir,
    input_filename,
    params,
    tab_name,
    tab_outputs,
    uploaded_files,
    fetch_result,
    output_filename,
):
    output_path = basedir / output_filename
    with arrow_table_context(arrow_table) as (table_path, table):
        shutil.copy(table_path, output_path)
        return RenderResult(errors=[])
def test_clear(self):
    with arrow_table_context(make_column("A", [1])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Number(format="{:,}"))],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)
    parquet_key = crr_parquet_key(self.step.cached_render_result)
    clear_cached_render_result_for_step(self.step)

    db_step = Step.objects.get(id=self.step.id)
    self.assertIsNone(db_step.cached_render_result)

    self.assertFalse(s3.exists(BUCKET, parquet_key))
def test_email_delta_ignore_corrupt_cache_error(self, email_delta, read_cache):
    read_cache.side_effect = rendercache.CorruptCacheError
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])

    with arrow_table_context({"A": [2]}) as table2:

        def render(*args, **kwargs):
            return RenderResult(table2)

        with self._stub_module(render):
            with self.assertLogs(level=logging.ERROR):
                self.run_with_async_db(
                    execute_wfmodule(
                        self.chroot_context,
                        workflow,
                        wf_module,
                        {},
                        Tab(tab.slug, tab.name),
                        RenderResult(),
                        {},
                        self.output_path,
                    )
                )

    email_delta.assert_not_called()
def test_cache_render_result(self):
    with arrow_table_context(make_column("A", [1])) as (table_path, table):
        result = LoadedRenderResult(
            path=table_path,
            table=table,
            columns=[Column("A", ColumnType.Number(format="{:,}"))],
            errors=[
                RenderError(
                    I18nMessage("e1", {"text": "hi"}, None),
                    [
                        QuickFix(
                            I18nMessage("q1", {"var": 2}, None),
                            QuickFixAction.PrependStep("filter", {"a": "x"}),
                        )
                    ],
                ),
                RenderError(I18nMessage("e2", {}, None), []),
            ],
            json={"foo": "bar"},
        )
        cache_render_result(self.workflow, self.step, 1, result)

    cached = self.step.cached_render_result
    self.assertEqual(cached.step_id, self.step.id)
    self.assertEqual(cached.delta_id, 1)
    self.assertEqual(
        crr_parquet_key(cached),
        f"wf-{self.workflow.id}/wfm-{self.step.id}/delta-1.dat",
    )

    # Reading completely freshly from the DB should give the same thing
    db_step = Step.objects.get(id=self.step.id)
    from_db = db_step.cached_render_result
    self.assertEqual(from_db, cached)

    with open_cached_render_result(from_db) as result2:
        assert_arrow_table_equals(
            result2.table, make_table(make_column("A", [1], format="{:,}"))
        )
        self.assertEqual(
            result2.columns, [Column("A", ColumnType.Number(format="{:,}"))]
        )
def test_load_input_cached_render_result(self):
    with arrow_table_context({"A": [1]}) as atable:
        input_render_result = RenderResult(atable)
        workflow = Workflow.create_and_init()
        step1 = workflow.tabs.first().steps.create(
            order=0, slug="step-1", last_relevant_delta_id=workflow.last_delta_id
        )
        step2 = workflow.tabs.first().steps.create(order=1, slug="step-2")
        rendercache.cache_render_result(
            workflow, step1, workflow.last_delta_id, input_render_result
        )
        result = self.run_with_async_db(
            fetch.load_database_objects(workflow.id, step2.id)
        )
        input_crr = step1.cached_render_result
        assert input_crr is not None
        self.assertEqual(result[4], input_crr)
        self.assertEqual(result.input_cached_render_result, input_crr)
def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress
    with ExitStack() as ctx:
        input_arrow_table = ctx.enter_context(
            arrow_table_context({"A": [1]}, dir=self.basedir)
        )
        parquet_filename = Path(
            ctx.enter_context(parquet_file({"A": [2]}, dir=self.basedir)).name
        ).name
        out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
        thrift_result = module.render_thrift(
            ttypes.RenderRequest(
                str(self.basedir),
                arrow_arrow_table_to_thrift(input_arrow_table),
                {},  # params
                ttypes.Tab("tab-1", "Tab 1"),
                ttypes.FetchResult(
                    parquet_filename,
                    [
                        ttypes.RenderError(
                            ttypes.I18nMessage(
                                "TODO_i18n",
                                {
                                    "text": ttypes.I18nArgument(
                                        string_value="A warning"
                                    )
                                },
                            ),
                            [],
                        )
                    ],
                ),
                out_filename,
            )
        )
        result = thrift_render_result_to_arrow(thrift_result, self.basedir)
        assert_render_result_equals(
            result,
            RenderResult(
                arrow_table({"A": [2]}),
                [RenderError(I18nMessage.TODO_i18n("A warning"))],
            ),
        )
def test_render_happy_path(self):
    module = self.kernel.compile(
        MockPath(
            ["foo.py"],
            b"import pandas as pd\ndef render(table, params): return pd.DataFrame({'A': table['A'] * params['m'], 'B': table['B'] + params['s']})",
        ),
        "foo",
    )
    with arrow_table_context(
        {"A": [1, 2, 3], "B": ["a", "b", "c"]},
        columns=[
            types.Column("A", types.ColumnType.Number("{:,d}")),
            types.Column("B", types.ColumnType.Text()),
        ],
        dir=self.basedir,
    ) as input_table:
        input_table.path.chmod(0o644)
        with self.chroot_context.tempfile_context(
            prefix="output-", dir=self.basedir
        ) as output_path:
            result = self.kernel.render(
                module,
                self.chroot_context,
                self.basedir,
                input_table,
                types.Params({"m": 2.5, "s": "XX"}),
                types.Tab("tab-1", "Tab 1"),
                None,
                output_filename=output_path.name,
            )
            self.assertEqual(
                result.table.table.to_pydict(),
                {"A": [2.5, 5.0, 7.5], "B": ["aXX", "bXX", "cXX"]},
            )
def test_render_kill_timeout(self):
    mod = _compile(
        "foo", "import time\ndef render(table, params):\n time.sleep(2)"
    )
    with patch.object(self.kernel, "render_timeout", 0.001):
        with self.assertRaises(ModuleTimeoutError):
            with arrow_table_context({"A": [1]}, dir=self.basedir) as input_table:
                input_table.path.chmod(0o644)
                with self.chroot_context.tempfile_context(
                    prefix="output-", dir=self.basedir
                ) as output_path:
                    self.kernel.render(
                        mod,
                        self.chroot_context,
                        self.basedir,
                        input_table,
                        types.Params({}),
                        types.Tab("tab-1", "Tab 1"),
                        None,
                        output_filename=output_path.name,
                    )
def test_render_killed_hard_out_of_memory(self):
    # This is similar to out-of-memory kill (but with different exit_code).
    # Testing out-of-memory is slow because we have to force the kernel to,
    # er, run out of memory. On a typical dev machine, that means filling
    # swap space -- gumming up the whole system. Not practical.
    #
    # In case of out-of-memory, the Linux out-of-memory killer will find
    # and kill a process using SIGKILL.
    #
    # So let's simulate that SIGKILL.
    module = self.kernel.compile(
        MockPath(
            ["foo.py"],
            b"import os\nimport time\ndef render(table, params): os.kill(os.getpid(), 9); time.sleep(1)",
        ),
        "foo",
    )
    with self.assertRaises(ModuleExitedError) as cm:
        with arrow_table_context({"A": [1]}, dir=self.basedir) as input_table:
            input_table.path.chmod(0o644)
            with tempfile_context(prefix="output-", dir=self.basedir) as output_path:
                self.kernel.render(
                    module,
                    self.basedir,
                    input_table,
                    types.Params({"m": 2.5, "s": "XX"}),
                    types.Tab("tab-1", "Tab 1"),
                    None,
                    output_filename=output_path.name,
                )
    self.assertEqual(cm.exception.exit_code, -9)  # SIGKILL
    self.assertEqual(cm.exception.log, "")
def test_load_dynamic(self):
    code = b"def render(table, params):\n return table * 2"
    minio.client.put_object(
        Bucket=minio.ExternalModulesBucket,
        Key="imported/abcdef/imported.py",
        Body=code,
        ContentLength=len(code),
    )
    with self.assertLogs("cjwstate.modules.loaded_module"):
        lm = LoadedModule.for_module_version(
            MockModuleVersion("imported", "abcdef", ParamDType.Dict({}), "now")
        )
    self.assertEqual(lm.name, "imported:abcdef")

    # This ends up being kind of an integration test.
    with ExitStack() as ctx:
        basedir = Path(ctx.enter_context(tempdir_context(prefix="test-basedir-")))
        basedir.chmod(0o755)
        input_table = ctx.enter_context(
            arrow_table_context({"A": [1]}, dir=basedir)
        )
        input_table.path.chmod(0o644)
        output_tf = ctx.enter_context(tempfile.NamedTemporaryFile(dir=basedir))
        ctx.enter_context(self.assertLogs("cjwstate.modules.loaded_module"))
        result = lm.render(
            basedir=basedir,
            input_table=input_table,
            params=Params({"col": "A"}),
            tab=Tab("tab-1", "Tab 1"),
            fetch_result=None,
            output_filename=Path(output_tf.name).name,
        )
        assert_render_result_equals(result, RenderResult(arrow_table({"A": [2]})))
def write_to_rendercache(
    workflow: Workflow,
    step: Step,
    delta_id: int,
    table: pa.Table,
    errors: List[RenderError] = [],
    json: Dict[str, Any] = {},
) -> None:
    with arrow_table_context(table) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=read_columns(table, full=False),
            errors=errors,
            json=json,
        )

        # use the caller-provided delta ID: no assertion
        old_last_relevant_delta_id = step.last_relevant_delta_id
        step.last_relevant_delta_id = delta_id
        try:
            cache_render_result(workflow, step, delta_id, result)
        finally:
            step.last_relevant_delta_id = old_last_relevant_delta_id
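# Hypothetical usage sketch (not in the original suite): write_to_rendercache()
# above lets a test seed the render cache without running a render. The
# self.workflow/self.step fixtures are assumptions, matching the cache tests
# earlier in this file.
write_to_rendercache(
    self.workflow,
    self.step,
    delta_id=self.step.last_relevant_delta_id,
    table=make_table(make_column("A", [1], format="{:,}")),
)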
def test_render_arrow_table_infer_output_column_formats_from_input(self):
    input_columns = [
        Column("A", ColumnType.Number("{:,.3f}")),
        Column("B", ColumnType.Number("{:,.3f}")),
        Column("C", ColumnType.Number("{:,.3f}")),
        Column("D", ColumnType.Datetime()),
        Column("E", ColumnType.Datetime()),
        Column("F", ColumnType.Datetime()),
        Column("G", ColumnType.Text()),
        Column("H", ColumnType.Text()),
        Column("I", ColumnType.Text()),
    ]

    # The param name "arrow_table" is a special case
    def render(arrow_table, params, output_path, *, columns, **kwargs):
        # Test the "columns" kwarg
        self.assertEqual(columns, input_columns)
        table = pa.table(
            {
                "A": [1],
                "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "C": ["a"],
                "D": [1],
                "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "F": ["a"],
                "G": [1],
                "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "I": ["a"],
            }
        )
        with pa.ipc.RecordBatchFileWriter(output_path, table.schema) as writer:
            writer.write_table(table)
        return []

    with arrow_table_context(
        {
            "A": [1],
            "B": [1],
            "C": [1],
            "D": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "F": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "G": ["a"],
            "H": ["a"],
            "I": ["a"],
        },
        columns=input_columns,
        dir=self.basedir,
    ) as arrow_table:
        result = self._test_render(render, arrow_table=arrow_table)
        self.assertEqual(
            result.table.metadata.columns,
            [
                Column("A", ColumnType.Number("{:,.3f}")),  # recalled
                Column("B", ColumnType.Datetime()),  # inferred
                Column("C", ColumnType.Text()),  # inferred
                Column("D", ColumnType.Number("{:,}")),  # inferred
                Column("E", ColumnType.Datetime()),  # recalled
                Column("F", ColumnType.Text()),  # inferred
                Column("G", ColumnType.Number("{:,}")),  # inferred
                Column("H", ColumnType.Datetime()),  # inferred
                Column("I", ColumnType.Text()),  # recalled
            ],
        )
def test_happy_path(self):
    with arrow_table_context(make_column("A", ["x"])) as (path, _):
        validate_arrow_file(path)  # do not raise
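# Hypothetical negative counterpart (not in the original suite): assuming
# validate_arrow_file() raises on bytes that are not a valid Arrow IPC file.
# The exact exception class is an assumption, so this sketch catches broadly.
def test_not_arrow_file_raises(self):
    with tempfile_context(suffix=".arrow") as path:
        path.write_bytes(b"this is not an Arrow file")
        with self.assertRaises(Exception):  # exact error type: assumption
            validate_arrow_file(path)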