def test_assert_result_equals_check_table():
    """Tables that differ only in schema metadata must not compare equal."""
    with_metadata = make_table(make_column("A", [1])).replace_schema_metadata(
        {"foo": "bar"}
    )
    without_metadata = make_table(make_column("A", [1]))
    # The diff message shows the metadata mismatch: None vs {...}
    with pytest.raises(AssertionError, match=r"-None\n\+\{"):
        assert_result_equals(
            ArrowRenderResult(with_metadata), ArrowRenderResult(without_metadata)
        )
def test_execute_new_revision(self):
    """A cache entry from an old delta is stale: execute must re-render."""
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    create_module_zipfile(
        "mod",
        spec_kwargs={"loads_data": True},
        python_code='import pandas as pd\ndef render(table, params): return pd.DataFrame({"B": [2]})',
    )
    step = tab.steps.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=2,
        module_id_name="mod",
    )
    # Cache written at delta 1, but the step is at delta 2 -- stale.
    write_to_rendercache(workflow, step, 1, make_table(make_column("A", ["a"]))),
    self._execute(workflow)
    step.refresh_from_db()
    # After execute, the cache must hold the module's fresh output, not "A".
    with open_cached_render_result(step.cached_render_result) as result:
        assert_arrow_table_equals(result.table, make_table(make_column("B", [2])))
def test_default_outnames():
    """Empty outnames fall back to auto-generated, human-readable titles."""
    input_table = make_table(
        make_column("A", ["x", "x"]),
        make_column("B", [1, 2], format="{:d}"),
    )
    params = P(
        groups=dict(colnames=["A"], group_dates=False, date_granularities={}),
        aggregations=[
            dict(operation="size", colname="", outname=""),
            dict(operation="nunique", colname="B", outname=""),
            dict(operation="sum", colname="B", outname=""),
            dict(operation="mean", colname="B", outname=""),
            dict(operation="median", colname="B", outname=""),
            dict(operation="min", colname="B", outname=""),
            dict(operation="max", colname="B", outname=""),
            dict(operation="first", colname="B", outname=""),
        ],
    )
    expected = ArrowRenderResult(
        make_table(
            make_column("A", ["x"]),
            make_column("Group Size", [2], format="{:,d}"),
            make_column("Unique count of B", [2], format="{:,d}"),
            make_column("Sum of B", [3], format="{:d}"),
            make_column("Average of B", [1.5], format="{:,}"),
            make_column("Median of B", [1.5], format="{:,}"),
            make_column("Minimum of B", [1], format="{:d}"),
            make_column("Maximum of B", [2], format="{:d}"),
            make_column("First of B", [1], format="{:d}"),
        )
    )
    assert_result_equals(render(input_table, params), expected)
def test_group_date_prompt_upgrade_timestamp_to_date():
    """A timestamp column with a deprecated granularity still renders, but
    prompts a quick fix that prepends a convert-to-date step."""
    result = render(
        make_table(make_column("A", [datetime.datetime(2021, 5, 5)])),
        P(
            groups=dict(
                colnames=["A"], group_dates=True, date_granularities={"A": "Y"}
            ),
            aggregations=[dict(operation="size", colname="", outname="size")],
        ),
    )
    expected_errors = [
        RenderError(
            i18n_message("group_dates.granularity_deprecated.need_dates"),
            [
                QuickFix(
                    i18n_message(
                        "group_dates.granularity_deprecated.quick_fix.convert_to_date"
                    ),
                    QuickFixAction.PrependStep(
                        "converttimestamptodate",
                        dict(colnames=["A"], unit="year"),
                    ),
                )
            ],
        )
    ]
    expected = ArrowRenderResult(
        make_table(
            # "Y" granularity truncates the timestamp to the year start.
            make_column("A", [datetime.datetime(2021, 1, 1)]),
            make_column("size", [1], format="{:,d}"),
        ),
        expected_errors,
    )
    assert_result_equals(result, expected)
def test_difference_nanoseconds():
    """Nanosecond-unit differences: None in either operand yields None."""
    a_values = [1237342345234234234, 1230000034123123423, None, None]
    b_values = [1237343345234134214, 1230080234113143429, 123, None]
    params = P(
        operation="difference",
        colname1="A",
        colname2="B",
        unit="nanosecond",
        outcolname="C",
    )
    result = render(
        make_table(make_column("A", a_values), make_column("B", b_values)),
        params,
    )
    expected = ArrowRenderResult(
        make_table(
            make_column("A", a_values),
            make_column("B", b_values),
            # B - A, per row; rows 3 and 4 have a None operand -> None
            make_column(
                "C", [999999899980, 80199990020006, None, None], format="{:,d}"
            ),
        ),
    )
    assert_result_equals(result, expected)
def test_render_replace_many_columns():
    """Output columns replace same-named inputs and are positioned after the
    source column, in the order they were configured."""
    input_table = make_table(
        make_column("A", ["a"]),
        make_column("B", ["b"]),
        make_column("C", [dt(2000, 2, 3, 4)]),
        make_column("D", ["d"]),
        make_column("E", ["e"]),
        make_column("F", ["f"]),
    )
    params = P(
        colname="C",
        outputs=[
            dict(outcolname="A", part="dateyear"),
            dict(outcolname="F", part="datemonth"),
            dict(outcolname="D", part="date"),
            dict(outcolname="G", part="time_minutes"),
        ],
    )
    expected = ArrowRenderResult(
        make_table(
            make_column("B", ["b"]),
            make_column("A", [datetime.date(2000, 1, 1)], unit="year"),
            make_column("F", [datetime.date(2000, 2, 1)], unit="month"),
            make_column("D", [datetime.date(2000, 2, 3)]),
            make_column("G", ["04:00"]),
            make_column("E", ["e"]),
        ),
    )
    assert_result_equals(render(input_table, params), expected)
def test_ignore_non_date_timestamps():
    """A granularity configured for a column that is no longer a date is ignored.

    Steps for the user to get here:
    1. Make a date column, 'A'
    2. Check "Group Dates". The column appears.
    3. Select column 'A', and select a date granularity for it
    4. Alter the input DataFrame such that 'A' is no longer datetime

    Expected results: you can't group it by date any more.
    """
    result = render(
        make_table(
            make_column("A", [1]),  # "used to be a datetime"
            # B is a real timestamp, so no quick fix is needed
            make_column("B", [datetime.datetime(2019, 1, 4)]),
        ),
        P(
            groups=dict(
                colnames=["A"], group_dates=True, date_granularities={"A": "T"}
            ),
            aggregations=[dict(operation="size", colname="", outname="size")],
        ),
    )
    expected = ArrowRenderResult(
        make_table(make_column("A", [1]), make_column("size", [1], format="{:,d}")),
        [RenderError(i18n_message("group_dates.select_date_columns"))],
    )
    assert_result_equals(result, expected)
def test_aggregate_numbers():
    """Each numeric aggregation picks the right output value and format."""
    result = groupby(
        make_table(
            make_column("A", [2, 1, 2, 2], format="{:.2f}"),
            make_column("B", [1, 2, 5, 1], format="{:d}"),
        ),
        [Group("A", None)],
        [
            Aggregation(Operation.SIZE, "", "size"),
            Aggregation(Operation.NUNIQUE, "B", "nunique"),
            Aggregation(Operation.SUM, "B", "sum"),
            Aggregation(Operation.MEAN, "B", "mean"),
            Aggregation(Operation.MEDIAN, "B", "median"),
            Aggregation(Operation.MIN, "B", "min"),
            Aggregation(Operation.MAX, "B", "max"),
            Aggregation(Operation.FIRST, "B", "first"),
        ],
    )
    expected = make_table(
        make_column("A", [1, 2], format="{:.2f}"),  # format from A
        make_column("size", [1, 3], format="{:,d}"),  # int format
        make_column("nunique", [1, 2], format="{:,d}"),  # int format
        make_column("sum", [2, 7], format="{:d}"),  # format from B
        make_column("mean", [2, 7 / 3], format="{:,}"),  # default format
        make_column("median", [2.0, 1.0], format="{:,}"),  # default format
        make_column("min", [2, 1], format="{:d}"),  # format from B
        make_column("max", [2, 5], format="{:d}"),  # format from B
        make_column("first", [2, 1], format="{:d}"),  # format from B
    )
    assert_arrow_table_equals(result, expected)
def test_group_date_prompt_all_is_well_when_date_column_present():
    """Grouping an actual date column emits only an informational message."""
    result = render(
        make_table(
            make_column("A", [datetime.date(2021, 5, 10)], unit="week"),
            make_column("B", [1]),
        ),
        P(
            groups=dict(
                colnames=["A", "B"], group_dates=True, date_granularities={}
            ),
            aggregations=[dict(operation="size", colname="", outname="size")],
        ),
    )
    expected = ArrowRenderResult(
        make_table(
            make_column("A", [datetime.date(2021, 5, 10)], unit="week"),
            make_column("B", [1]),
            make_column("size", [1], format="{:,d}"),
        ),
        [
            RenderError(
                i18n_message(
                    "group_dates.date_selected",
                    dict(columns=1, column0="A", unit0="week"),
                )
            )
        ],
    )
    assert_result_equals(result, expected)
def test_do_not_multiply_categories():
    """Only observed category combinations appear in the output.

    Pandas default, when given categoricals, is to multiply them out:
    in this example, we'd get four rows:

        a, c
        a, d
        b, c
        b, d

    ... even though there are no values for (a, d) or (b, c).

    See https://github.com/pandas-dev/pandas/issues/17594. The solution
    is .groupby(..., observed=True).
    """
    result = groupby(
        make_table(
            make_column("A", ["a", "b", "a"], dictionary=True),
            make_column("B", ["c", "d", "d"], dictionary=True),
            make_column("C", [1, 2, 3]),
        ),
        [Group("A", None), Group("B", None)],
        [Aggregation(Operation.SUM, "C", "X")],
    )
    expected = make_table(
        make_column("A", ["a", "a", "b"], dictionary=True),
        make_column("B", ["c", "d", "d"], dictionary=True),
        make_column("X", [1, 3, 2]),
    )
    assert_arrow_table_equals(result, expected)
def test_aggregate_text_category_values():
    """Text aggregations over dictionary columns stay dictionary-encoded;
    counting aggregations produce formatted integers."""
    result = groupby(
        make_table(
            make_column("A", [1, 1, 1]),
            make_column("B", ["a", "b", "a"], dictionary=True),
        ),
        [Group("A", None)],
        [
            Aggregation(Operation.SIZE, "B", "size"),
            Aggregation(Operation.NUNIQUE, "B", "nunique"),
            Aggregation(Operation.MIN, "B", "min"),
            Aggregation(Operation.MAX, "B", "max"),
            Aggregation(Operation.FIRST, "B", "first"),
        ],
    )
    expected = make_table(
        make_column("A", [1]),
        make_column("size", [3], format="{:,d}"),
        make_column("nunique", [2], format="{:,d}"),
        make_column("min", ["a"], dictionary=True),
        make_column("max", ["b"], dictionary=True),
        make_column("first", ["a"], dictionary=True),
    )
    assert_arrow_table_equals(result, expected)
def test_parquet_same_data_different_bytes(self):
    """Equality is by logical value, not file bytes: a plain-encoded and a
    dictionary-encoded column with the same values compare equal."""
    cjwparquet.write(self.old_path, make_table(make_column("A", ["a"])))
    cjwparquet.write(
        self.new_path, make_table(make_column("A", ["a"], dictionary=True))
    )
    self.assertTrue(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
def test_render_dict_disallow_rename_to_null():
    """Renaming a column to the empty string is ignored: output == input."""
    result = render(
        make_table(make_column("A", ["x"])),
        P(renames={"A": ""}),
        settings=Settings(),
    )
    expected = ArrowRenderResult(make_table(make_column("A", ["x"])))
    assert_result_equals(result, expected)
def test_assert_arrow_table_equals_check_date_unit():
    """Date columns with different 'unit' metadata must not compare equal."""
    day_table = make_table(
        make_column("A", [datetime.date(2021, 4, 1)], unit="day")
    )
    month_table = make_table(
        make_column("A", [datetime.date(2021, 4, 1)], unit="month")
    )
    # The diff message surfaces the mismatched field metadata.
    with pytest.raises(
        AssertionError, match=r"-\{b'unit': b'month'\}\n\+\{b'unit': b'day'\}"
    ):
        assert_arrow_table_equals(day_table, month_table)
def test_assert_result_equals_check_errors():
    """Results that differ only in their error lists must not compare equal."""
    clean = ArrowRenderResult(make_table())
    with_errors = ArrowRenderResult(
        make_table(), errors=[RenderError(I18nMessage("foo", {}, "module"))]
    )
    with pytest.raises(AssertionError, match=r"-\[Render.*\n\+\[\]"):
        assert_result_equals(clean, with_errors)
def test_render_no_outcolname_is_no_op():
    """An output with an empty outcolname leaves the table untouched."""
    params = P(colname="A", outputs=[dict(outcolname="", part="date")])
    result = render(make_table(make_column("A", [dt()])), params)
    assert_result_equals(
        result, ArrowRenderResult(make_table(make_column("A", [dt()])))
    )
def test_render_rename_custom_list_empty_is_no_op():
    """An empty custom rename list leaves the table untouched."""
    result = render(
        make_table(make_column("A", ["x"])),
        P(custom_list=True, list_string=""),
        settings=Settings(),
    )
    expected = ArrowRenderResult(make_table(make_column("A", ["x"])))
    assert_result_equals(result, expected)
def test_dictionary_no_op():
    """Selected dictionary-encoded columns pass through still encoded."""
    result = render(
        make_table(make_column("A", ["a", "b"], dictionary=True)),
        P(colnames=["A"]),
    )
    expected = ArrowRenderResult(
        make_table(make_column("A", ["a", "b"], dictionary=True))
    )
    assert_result_equals(result, expected)
def test_no_colnames():
    """Grouping by nothing aggregates the whole table into a single row."""
    result = groupby(
        make_table(make_column("A", [1, 2])),
        [],
        [Aggregation(Operation.SUM, "A", "X")],
    )
    assert_arrow_table_equals(result, make_table(make_column("X", [3])))
def test_sum_float():
    """Sum skips nulls and keeps the input column's number format."""
    result = groupby(
        make_table(make_column("A", [1.0, None, 3.0], format="{:d}")),
        [],
        [Aggregation(Operation.SUM, "A", "sum")],
    )
    expected = make_table(make_column("sum", [4.0], format="{:d}"))
    assert_arrow_table_equals(result, expected)
def test_sum_int8_does_not_overflow():
    """127 + 1 must widen beyond int8 instead of wrapping to -128."""
    result = groupby(
        make_table(make_column("A", [127, 1], pa.int8(), format="{:d}")),
        [],
        [Aggregation(Operation.SUM, "A", "sum")],
    )
    expected = make_table(make_column("sum", [128], format="{:d}"))
    assert_arrow_table_equals(result, expected)
def test_render_with_no_kwargs(self):
    """A module render() declaring only (table, params) still gets both."""

    def render(table, params):
        return table * params["n"]

    schema = ParamSchema.Dict({"n": ParamSchema.Float()})
    with ModuleTestEnv(param_schema=schema, render=render) as env:
        outcome = env.call_render(make_table(make_column("A", [1])), {"n": 2})
        assert_arrow_table_equals(
            outcome.read_table(), make_table(make_column("A", [2]))
        )
def test_maximum_no_outcolname():
    """With an empty outcolname, "maximum" leaves the table unchanged."""
    input_table = make_table(make_column("A", [1, 2, 3], pa.timestamp(unit="ns")))
    result = render(
        input_table, P(operation="maximum", colnames=["A"], outcolname="")
    )
    expected = ArrowRenderResult(
        make_table(make_column("A", [1, 2, 3], pa.timestamp(unit="ns")))
    )
    assert_result_equals(result, expected)
def test_render_empty_file_fetch_result_is_parquet(self):
    """A zero-byte fetch file is presented to render() as an empty DataFrame."""

    def render(table, params, *, fetch_result):
        # Module-side check: empty file -> empty DataFrame
        assert_frame_equal(fetch_result.dataframe, pd.DataFrame({}))
        return fetch_result.dataframe

    with ModuleTestEnv(render=render) as env:
        with tempfile_context(dir=env.basedir) as tf:
            outcome = env.call_render(make_table(), {}, fetch_result=FetchResult(tf))
            self.assertEqual(outcome.read_table(), make_table())
def test_size():
    """SIZE counts rows per group, formatted as a thousands-separated int."""
    result = groupby(
        make_table(make_column("A", [1, 1, 2])),
        [Group("A", None)],
        [Aggregation(Operation.SIZE, "", "X")],
    )
    expected = make_table(
        make_column("A", [1, 2]), make_column("X", [2, 1], format="{:,d}")
    )
    assert_arrow_table_equals(result, expected)
def render_arrow_v1(table, params, *, tab_outputs, **kwargs):
    # Module-side assertions: the tab param resolved to "tab-x", and its
    # rendered output (name + table) was delivered via tab_outputs.
    self.assertEqual(params["tab"], "tab-x")
    self.assertEqual(tab_outputs["tab-x"].tab_name, "Tab X")
    expected_tab_table = make_table(
        make_column("X", [1], format="{:,d}"),
        make_column("Y", ["y"]),
    )
    assert_arrow_table_equals(tab_outputs["tab-x"].table, expected_tab_table)
    return ArrowRenderResult(make_table())
def render_arrow_v1(table, params, **kwargs):
    # Module-side assertion: the input table arrives intact -- text, formatted
    # number, timestamp and date (with unit metadata) columns all preserved.
    expected_input = make_table(
        make_column("A", ["x"]),
        make_column("B", [1], format="{:,.3f}"),
        make_column("C", [now]),
        make_column("D", [date(2021, 4, 12)], unit="week"),
    )
    assert_arrow_table_equals(table, expected_input)
    return ArrowRenderResult(make_table())
def test_render_with_parquet_fetch_result(self):
    """render() may return its parquet FetchResult directly as its output."""

    def render(table, params, *, fetch_result):
        return fetch_result

    with ModuleTestEnv(render=render) as env:
        with parquet_file({"A": ["fetched"]}, dir=env.basedir) as pf:
            outcome = env.call_render(make_table(), {}, fetch_result=FetchResult(pf))
            assert_arrow_table_equals(
                outcome.read_table(), make_table(make_column("A", ["fetched"]))
            )
def test_render_use_input_columns_as_try_fallback_columns(self):
    """An output column with no format inherits the format of the
    same-named input column."""

    def render(table, params):
        return pd.DataFrame({"A": [2]})

    with ModuleTestEnv(render=render) as env:
        outcome = env.call_render(
            make_table(make_column("A", [1], format="{:,.3f}")), {}
        )
        self.assertEqual(
            outcome.read_table(),
            make_table(make_column("A", [2], format="{:,.3f}")),
        )
def test_render_arrow_table_empty_output_table_is_empty(self):
    """Writing a zero-column Arrow file yields an empty RenderResult."""

    # The param name "arrow_table" is a special case
    def render(arrow_table, params, output_path, **kwargs):
        empty = pa.table({})
        with pa.ipc.RecordBatchFileWriter(output_path, empty.schema) as writer:
            writer.write_table(empty)

    with ModuleTestEnv(render=render) as env:
        outcome = env.call_render(make_table(), {})
        self.assertEqual(outcome.result, RenderResult())
        self.assertEqual(outcome.read_table(), make_table())