def test_aggregate_numbers():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [2, 1, 2, 2], format="{:.2f}"),
                make_column("B", [1, 2, 5, 1], format="{:d}"),
            ),
            [Group("A", None)],
            [
                Aggregation(Operation.SIZE, "", "size"),
                Aggregation(Operation.NUNIQUE, "B", "nunique"),
                Aggregation(Operation.SUM, "B", "sum"),
                Aggregation(Operation.MEAN, "B", "mean"),
                Aggregation(Operation.MEDIAN, "B", "median"),
                Aggregation(Operation.MIN, "B", "min"),
                Aggregation(Operation.MAX, "B", "max"),
                Aggregation(Operation.FIRST, "B", "first"),
            ],
        ),
        make_table(
            make_column("A", [1, 2], format="{:.2f}"),  # format from A
            make_column("size", [1, 3], format="{:,d}"),  # int format
            make_column("nunique", [1, 2], format="{:,d}"),  # int format
            make_column("sum", [2, 7], format="{:d}"),  # format from B
            make_column("mean", [2, 7 / 3], format="{:,}"),  # default format
            make_column("median", [2.0, 1.0], format="{:,}"),  # default format
            make_column("min", [2, 1], format="{:d}"),  # format from B
            make_column("max", [2, 5], format="{:d}"),  # format from B
            make_column("first", [2, 1], format="{:d}"),  # format from B
        ),
    )

def test_execute_new_revision(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    create_module_zipfile(
        "mod",
        spec_kwargs={"loads_data": True},
        python_code=(
            "import pandas as pd\n"
            'def render(table, params): return pd.DataFrame({"B": [2]})'
        ),
    )
    step = tab.steps.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=2,
        module_id_name="mod",
    )
    # stale
    write_to_rendercache(workflow, step, 1, make_table(make_column("A", ["a"])))

    self._execute(workflow)

    step.refresh_from_db()
    with open_cached_render_result(step.cached_render_result) as result:
        assert_arrow_table_equals(result.table, make_table(make_column("B", [2])))

def test_fetch_integration(self, send_update, queue_render):
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)
    workflow = Workflow.create_and_init()
    create_module_zipfile(
        "mod",
        python_code=(
            "import pandas as pd\n"
            "def fetch(params): return pd.DataFrame({'A': [1]})\n"
            "def render(table, params): return table"
        ),
    )
    step = workflow.tabs.first().steps.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    cjwstate.modules.init_module_system()
    now = datetime.datetime.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(workflow_id=workflow.id, step_id=step.id, now=now)
        )
    step.refresh_from_db()
    so = step.stored_objects.get(stored_at=step.stored_data_version)
    with s3.temporarily_download(s3.StoredObjectsBucket, so.key) as parquet_path:
        # fetch results are stored without a schema. Let's hard-code a
        # schema simply so we can test that the table data is the same.
        table = read_parquet_as_arrow(
            parquet_path, [Column("A", ColumnType.Number())]
        )
        assert_arrow_table_equals(table, make_table(make_column("A", [1])))
    workflow.refresh_from_db()
    queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
    send_update.assert_called()

def test_to_arrow_normal_dataframe(self):
    fd, filename = tempfile.mkstemp()
    try:
        process_result = ProcessResult.coerce(pd.DataFrame({"A": [1, 2]}))
        result = process_result.to_arrow(Path(filename))
        self.assertEqual(result, atypes.RenderResult([], {}))
        with pa.ipc.open_file(filename) as reader:
            table = reader.read_all()
            assert_arrow_table_equals(
                table,
                make_table(
                    make_column(
                        # Whatever .format ProcessResult.coerce() gave
                        "A",
                        [1, 2],
                        format=process_result.columns[0].type.format,
                    )
                ),
            )
    finally:
        os.unlink(filename)

def test_duplicate_copies_fresh_cache(self):
    # The cache's filename depends on workflow_id and step_id.
    # Duplicating it would need more complex code :).
    table = make_table(make_column("A", [1], format="${:,.2f}"))
    write_to_rendercache(
        self.workflow,
        self.step,
        1,
        table=table,
        errors=[RenderError(I18nMessage("X", {}, None))],
        json={"foo": "bar"},
    )

    workflow2 = Workflow.objects.create()
    tab2 = workflow2.tabs.create(position=0)
    dup = self.step.duplicate_into_new_workflow(tab2)

    dup_cached_result = dup.cached_render_result
    self.assertEqual(
        dup_cached_result,
        replace(
            self.step.cached_render_result,
            workflow_id=workflow2.id,
            step_id=dup.id,
            delta_id=0,
        ),
    )
    with open_cached_render_result(dup_cached_result) as result2:
        assert_arrow_table_equals(result2.table, table)
        self.assertEqual(result2.errors, [RenderError(I18nMessage("X", {}, None))])
        self.assertEqual(result2.json, {"foo": "bar"})

def test_fetch_truncate(self):
    def fetch(params):
        return pd.DataFrame({"A": [1, 2, 3]})

    with tempfile_context(dir=self.basedir) as outfile:
        result = self._test_fetch(fetch, output_filename=outfile.name)
        self.assertEqual(
            result,
            FetchResult(
                outfile,
                errors=[
                    FetchError(
                        I18nMessage(
                            "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
                            {"old_number": 3, "new_number": 2},
                            None,
                        )
                    )
                ],
            ),
        )
        assert_arrow_table_equals(
            read_parquet_as_arrow(outfile, [Column("A", ColumnType.Number("{:,}"))]),
            make_table(make_column("A", [1, 2])),
        )

def test_do_not_multiply_categories():
    # Pandas default, when given categoricals, is to multiply them out:
    # in this example, we'd get four rows:
    #
    #     a, c
    #     a, d
    #     b, c
    #     b, d
    #
    # ... even though there is no value for (b, c).
    #
    # See https://github.com/pandas-dev/pandas/issues/17594. The solution
    # is .groupby(..., observed=True).
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", ["a", "b", "a"], dictionary=True),
                make_column("B", ["c", "d", "d"], dictionary=True),
                make_column("C", [1, 2, 3]),
            ),
            [Group("A", None), Group("B", None)],
            [Aggregation(Operation.SUM, "C", "X")],
        ),
        make_table(
            make_column("A", ["a", "a", "b"], dictionary=True),
            make_column("B", ["c", "d", "d"], dictionary=True),
            make_column("X", [1, 3, 2]),
        ),
    )

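# Illustrative sketch (not part of the suite): the Pandas behavior the comment
# in test_do_not_multiply_categories() refers to. With categorical group keys,
# pandas' historical default (observed=False) emits every combination of
# categories; observed=True keeps only the combinations that actually occur.
# The helper name is hypothetical and nothing calls it.
def _sketch_pandas_observed_groupby():
    import pandas as pd

    df = pd.DataFrame(
        {
            "A": pd.Categorical(["a", "b", "a"]),
            "B": pd.Categorical(["c", "d", "d"]),
            "C": [1, 2, 3],
        }
    )
    # All 2x2 category combinations, including the never-seen (b, c):
    assert len(df.groupby(["A", "B"], observed=False).sum()) == 4
    # Only the combinations present in the data: (a, c), (a, d), (b, d):
    assert len(df.groupby(["A", "B"], observed=True).sum()) == 3
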
def test_aggregate_text_category_values():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [1, 1, 1]),
                make_column("B", ["a", "b", "a"], dictionary=True),
            ),
            [Group("A", None)],
            [
                Aggregation(Operation.SIZE, "B", "size"),
                Aggregation(Operation.NUNIQUE, "B", "nunique"),
                Aggregation(Operation.MIN, "B", "min"),
                Aggregation(Operation.MAX, "B", "max"),
                Aggregation(Operation.FIRST, "B", "first"),
            ],
        ),
        make_table(
            make_column("A", [1]),
            make_column("size", [3], format="{:,d}"),
            make_column("nunique", [2], format="{:,d}"),
            make_column("min", ["a"], dictionary=True),
            make_column("max", ["b"], dictionary=True),
            make_column("first", ["a"], dictionary=True),
        ),
    )

def test_assert_arrow_table_equals_check_date_unit():
    table1 = make_table(make_column("A", [datetime.date(2021, 4, 1)], unit="day"))
    table2 = make_table(make_column("A", [datetime.date(2021, 4, 1)], unit="month"))
    with pytest.raises(
        AssertionError, match=r"-\{b'unit': b'month'\}\n\+\{b'unit': b'day'\}"
    ):
        assert_arrow_table_equals(table1, table2)

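# Note on the expected failure message above: the date "unit" travels as Arrow
# field metadata (b"unit": b"day" / b"month"), so assert_arrow_table_equals
# compares that metadata in addition to the cell values.
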
def test_no_colnames():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [1, 2])),
            [],
            [Aggregation(Operation.SUM, "A", "X")],
        ),
        make_table(make_column("X", [3])),
    )

def test_sum_float():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [1.0, None, 3.0], format="{:d}")),
            [],
            [Aggregation(Operation.SUM, "A", "sum")],
        ),
        make_table(make_column("sum", [4.0], format="{:d}")),
    )

def test_sum_int8_does_not_overflow():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [127, 1], pa.int8(), format="{:d}")),
            [],
            [Aggregation(Operation.SUM, "A", "sum")],
        ),
        make_table(make_column("sum", [128], format="{:d}")),
    )

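# Illustrative sketch (not part of the suite): why the test above matters.
# int8 holds -128..127, so 128 cannot be represented in the input type; the
# test expects the sum to come back in a wider integer type rather than
# wrapping around. NumPy shows the analogous widening behavior; the helper
# name is hypothetical and nothing calls it.
def _sketch_int8_sum_widening():
    import numpy as np

    assert np.iinfo(np.int8).max == 127  # 128 does not fit in int8
    # NumPy accumulates integer sums in a wider type by default:
    assert int(np.array([127, 1], dtype=np.int8).sum()) == 128
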
def test_size():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [1, 1, 2])),
            [Group("A", None)],
            [Aggregation(Operation.SIZE, "", "X")],
        ),
        make_table(
            make_column("A", [1, 2]),
            make_column("X", [2, 1], format="{:,d}"),
        ),
    )

def test_render_with_no_kwargs(self):
    def render(table, params):
        return table * params["n"]

    param_schema = ParamSchema.Dict({"n": ParamSchema.Float()})
    with ModuleTestEnv(param_schema=param_schema, render=render) as env:
        outcome = env.call_render(make_table(make_column("A", [1])), {"n": 2})
        assert_arrow_table_equals(
            outcome.read_table(), make_table(make_column("A", [2]))
        )

def test_render_with_parquet_fetch_result(self):
    def render(table, params, *, fetch_result):
        return fetch_result

    with ModuleTestEnv(render=render) as env:
        with parquet_file({"A": ["fetched"]}, dir=env.basedir) as pf:
            outcome = env.call_render(make_table(), {}, fetch_result=FetchResult(pf))
            assert_arrow_table_equals(
                outcome.read_table(), make_table(make_column("A", ["fetched"]))
            )

def render_arrow_v1(table, params, **kwargs):
    assert_arrow_table_equals(
        table,
        make_table(
            make_column("A", ["x"]),
            make_column("B", [1], format="{:,.3f}"),
            make_column("C", [now]),
            make_column("D", [date(2021, 4, 12)], unit="week"),
        ),
    )
    return ArrowRenderResult(make_table())

def render_arrow_v1(table, params, *, tab_outputs, **kwargs):
    self.assertEqual(params["tab"], "tab-x")
    self.assertEqual(tab_outputs["tab-x"].tab_name, "Tab X")
    assert_arrow_table_equals(
        tab_outputs["tab-x"].table,
        make_table(
            make_column("X", [1], format="{:,d}"),
            make_column("Y", ["y"]),
        ),
    )
    return ArrowRenderResult(make_table())

def test_assert_arrow_table_equals_check_timestamp_tz():
    table1 = pa.table(
        {"A": pa.array([1617889141123456000], pa.timestamp("ns", "UTC"))}
    )
    table2 = pa.table({"A": pa.array([1617889141123456000], pa.timestamp("ns"))})
    with pytest.raises(
        AssertionError,
        match=(
            r"-pyarrow.Field<A: timestamp\[ns\]>"
            r"\n\+pyarrow.Field<A: timestamp\[ns, tz=UTC\]>"
        ),
    ):
        assert_arrow_table_equals(table1, table2)

def _test_dataframe_to_arrow_table(
    self,
    dataframe: pd.DataFrame,
    columns: List[Column],
    expected_table: pa.Table,
) -> None:
    with tempfile_context() as path:
        dataframe_to_arrow_table(dataframe, columns, path)
        # "untrusted": more integration-test-ish
        result_table, result_columns = load_untrusted_arrow_file_with_columns(path)
        assert_arrow_table_equals(result_table, expected_table)
        self.assertEqual(result_columns, columns)  # testing the round trip

def test_aggregate_null_timestamp_by_quarter_DEPRECATED():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [None], pa.timestamp("ns"))),
            [Group("A", DateGranularity.QUARTER)],
            [Aggregation(Operation.SIZE, "", "size")],
        ),
        make_table(
            make_column("A", [], pa.timestamp("ns")),
            make_column("size", [], pa.int64(), format="{:,d}"),
        ),
    )

def test_execute_partial_cache_hit(self):
    module_zipfile = create_module_zipfile("mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh. Should not render.
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow, step1, workflow.last_delta_id, make_table(make_column("A", ["a"]))
    )
    # step2: cached result is stale, so must be re-rendered
    step2 = tab.steps.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow,
        step2,
        workflow.last_delta_id - 1,
        make_table(make_column("B", ["b"])),
    )

    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )

    new_table = make_table(make_column("C", ["c"]))
    with patch.object(Kernel, "render", side_effect=mock_render(new_table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("C", ColumnType.Text())])
            )
            assert_arrow_table_equals(load_trusted_arrow_file(path), new_table)

        Kernel.render.assert_called_once()  # step2, not step1
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )

def test_render_with_non_parquet_fetch_result(self):
    def render(table, params, *, fetch_result):
        return pd.DataFrame({"A": [fetch_result.path.read_text()]})

    with ModuleTestEnv(render=render) as env:
        with tempfile_context(dir=env.basedir) as tf:
            tf.write_bytes(b"abcd")
            outcome = env.call_render(make_table(), {}, fetch_result=FetchResult(tf))
            assert_arrow_table_equals(
                outcome.read_table(), make_table(make_column("A", ["abcd"]))
            )

def test_fetch_return_dataframe(self):
    async def fetch(params):
        return pd.DataFrame({"A": ["x", "y"]})

    with tempfile_context(dir=self.basedir) as outfile:
        result = self._test_fetch(fetch, output_filename=outfile.name)

        self.assertEqual(result.errors, [])
        arrow_table = read_parquet_as_arrow(
            outfile, [Column("A", ColumnType.Text())]
        )
        assert_arrow_table_equals(
            arrow_table, make_table(make_column("A", ["x", "y"]))
        )

def test_render_return_column_formats(self):
    def render(table, params):
        return {
            "dataframe": pd.DataFrame({"A": [1]}),
            "column_formats": {"A": "${:,d}"},
        }

    with ModuleTestEnv(render=render) as env:
        outcome = env.call_render(make_table(), {})
        assert_arrow_table_equals(
            outcome.read_table(), make_table(make_column("A", [1], format="${:,d}"))
        )

def test_first_in_category():
    # https://www.pivotaltracker.com/story/show/177964511
    # This crash finally inspired us, [2021-04-29], to ditch Pandas.
    #
    # The only shock is that we didn't ditch it after all the other crashes
    # that litter this test suite.
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", ["A", "A"], dictionary=True)),
            [],
            [Aggregation(Operation.FIRST, "A", "first")],
        ),
        make_table(make_column("first", ["A"], dictionary=True)),
    )

def test_aggregate_text_category_values_max():
    # https://github.com/pandas-dev/pandas/issues/28641
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [1997]),
                make_column("B", ["30-SEP-97"], dictionary=True),
            ),
            [Group("A", None)],
            [Aggregation(Operation.MAX, "B", "X")],
        ),
        make_table(
            make_column("A", [1997]),
            make_column("X", ["30-SEP-97"], dictionary=True),
        ),
    )

def test_render_arrow_table_infer_output_column_formats(self):
    def render(arrow_table, params, output_path, *, columns, **kwargs):
        out = pa.table({"A": [1], "B": [date(2021, 4, 1)]})
        with pa.ipc.RecordBatchFileWriter(output_path, out.schema) as writer:
            writer.write_table(out)

    with ModuleTestEnv(render=render) as env:
        outcome = env.call_render(make_table(), {})
        assert_arrow_table_equals(
            outcome.read_table(),
            make_table(
                make_column("A", [1], format="{:,}"),
                make_column("B", [date(2021, 4, 1)], unit="day"),
            ),
        )

def test_allow_duplicate_aggregations():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [1, 1, 2]),
                make_column("B", [1, 2, 3]),
            ),
            [Group("A", None)],
            [
                Aggregation(Operation.MIN, "B", "X"),
                Aggregation(Operation.MIN, "B", "Y"),
            ],
        ),
        make_table(
            make_column("A", [1, 2]),
            make_column("X", [1, 3]),
            make_column("Y", [1, 3]),
        ),
    )

def test_aggregate_timestamp_by_year_DEPRECATED():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [dt(2018, 1, 4), dt(2019, 2, 4), dt(2018, 3, 4)]),
            ),
            [Group("A", DateGranularity.YEAR)],
            [Aggregation(Operation.SIZE, "", "size")],
        ),
        make_table(
            make_column("A", [dt(2018, 1, 1), dt(2019, 1, 1)]),
            make_column("size", [2, 1], format="{:,d}"),
        ),
    )

def test_multilevel_with_na_remove_unused_category():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", ["a1", "a2", "a1", "a1"], dictionary=True),
                make_column("B", ["b1", None, "b2", "b3"], dictionary=True),
            ),
            [Group("A", None), Group("B", None)],
            [Aggregation(Operation.SIZE, "", "X")],
        ),
        make_table(
            make_column("A", ["a1", "a1", "a1"], dictionary=True),
            make_column("B", ["b1", "b2", "b3"]),
            make_column("X", [1, 1, 1], format="{:,d}"),
        ),
    )