Example #1
0
def test_assert_result_equals_check_table():
    """assert_result_equals() must raise when only schema metadata differs."""
    with_metadata = make_table(make_column("A", [1])).replace_schema_metadata(
        {"foo": "bar"}
    )
    without_metadata = make_table(make_column("A", [1]))
    expected_diff = r"-None\n\+\{"

    with pytest.raises(AssertionError, match=expected_diff):
        left = ArrowRenderResult(with_metadata)
        right = ArrowRenderResult(without_metadata)
        assert_result_equals(left, right)
Example #2
0
    def test_execute_new_revision(self):
        """Executing replaces a stale cached result with the module's output."""
        module_code = (
            "import pandas as pd\n"
            'def render(table, params): return pd.DataFrame({"B": [2]})'
        )

        workflow = Workflow.create_and_init()
        first_tab = workflow.tabs.first()
        create_module_zipfile(
            "mod",
            spec_kwargs={"loads_data": True},
            python_code=module_code,
        )
        step = first_tab.steps.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=2,
            module_id_name="mod",
        )

        # Seed a stale cache entry: it is written with 1, whereas the step's
        # last_relevant_delta_id is 2.
        stale_table = make_table(make_column("A", ["a"]))
        write_to_rendercache(workflow, step, 1, stale_table)

        self._execute(workflow)
        step.refresh_from_db()

        # The cached result must now hold column B from the module's render().
        with open_cached_render_result(step.cached_render_result) as result:
            expected_table = make_table(make_column("B", [2]))
            assert_arrow_table_equals(result.table, expected_table)
Example #3
0
def test_default_outnames():
    """Aggregations with a blank outname receive a default column name."""
    input_table = make_table(
        make_column("A", ["x", "x"]),
        make_column("B", [1, 2], format="{:d}"),
    )

    # "size" names no input column; every other operation reads column B.
    operations_on_b = ("nunique", "sum", "mean", "median", "min", "max", "first")
    aggregations = [dict(operation="size", colname="", outname="")] + [
        dict(operation=operation, colname="B", outname="")
        for operation in operations_on_b
    ]
    params = P(
        groups=dict(colnames=["A"], group_dates=False, date_granularities={}),
        aggregations=aggregations,
    )

    result = render(input_table, params)

    expected_columns = [
        make_column("A", ["x"]),
        make_column("Group Size", [2], format="{:,d}"),
        make_column("Unique count of B", [2], format="{:,d}"),
        make_column("Sum of B", [3], format="{:d}"),
        make_column("Average of B", [1.5], format="{:,}"),
        make_column("Median of B", [1.5], format="{:,}"),
        make_column("Minimum of B", [1], format="{:d}"),
        make_column("Maximum of B", [2], format="{:d}"),
        make_column("First of B", [1], format="{:d}"),
    ]
    expected = ArrowRenderResult(make_table(*expected_columns))
    assert_result_equals(result, expected)
Example #4
0
def test_group_date_prompt_upgrade_timestamp_to_date():
    """Grouping a timestamp by granularity still works but offers a quick fix.

    The expected result carries the truncated timestamp plus a RenderError
    whose QuickFix prepends a "converttimestamptodate" step.
    """
    input_table = make_table(make_column("A", [datetime.datetime(2021, 5, 5)]))
    params = P(
        groups=dict(colnames=["A"], group_dates=True, date_granularities={"A": "Y"}),
        aggregations=[dict(operation="size", colname="", outname="size")],
    )

    result = render(input_table, params)

    expected_table = make_table(
        make_column("A", [datetime.datetime(2021, 1, 1)]),
        make_column("size", [1], format="{:,d}"),
    )
    convert_to_date = QuickFix(
        i18n_message("group_dates.granularity_deprecated.quick_fix.convert_to_date"),
        QuickFixAction.PrependStep(
            "converttimestamptodate",
            dict(colnames=["A"], unit="year"),
        ),
    )
    expected_error = RenderError(
        i18n_message("group_dates.granularity_deprecated.need_dates"),
        [convert_to_date],
    )
    assert_result_equals(result, ArrowRenderResult(expected_table, [expected_error]))
Example #5
0
def test_difference_nanoseconds():
    """A "difference" in nanoseconds adds column C; nulls give a null result."""

    def source_columns():
        # Build fresh A and B columns each time; the output table is expected
        # to repeat them unchanged ahead of the new column.
        return [
            make_column("A", [1237342345234234234, 1230000034123123423, None, None]),
            make_column("B", [1237343345234134214, 1230080234113143429, 123, None]),
        ]

    input_table = make_table(*source_columns())
    params = P(
        operation="difference",
        colname1="A",
        colname2="B",
        unit="nanosecond",
        outcolname="C",
    )

    result = render(input_table, params)

    difference = make_column(
        "C", [999999899980, 80199990020006, None, None], format="{:,d}"
    )
    expected = ArrowRenderResult(make_table(*source_columns(), difference))
    assert_result_equals(result, expected)
def test_render_replace_many_columns():
    """Outputs named after existing columns replace them; new names are added.

    The expected table no longer contains the source column C, and the
    generated columns sit between B and E.
    """
    input_table = make_table(
        make_column("A", ["a"]),
        make_column("B", ["b"]),
        make_column("C", [dt(2000, 2, 3, 4)]),
        make_column("D", ["d"]),
        make_column("E", ["e"]),
        make_column("F", ["f"]),
    )
    requested_outputs = [
        dict(outcolname="A", part="dateyear"),
        dict(outcolname="F", part="datemonth"),
        dict(outcolname="D", part="date"),
        dict(outcolname="G", part="time_minutes"),
    ]
    params = P(colname="C", outputs=requested_outputs)

    result = render(input_table, params)

    expected_table = make_table(
        make_column("B", ["b"]),
        make_column("A", [datetime.date(2000, 1, 1)], unit="year"),
        make_column("F", [datetime.date(2000, 2, 1)], unit="month"),
        make_column("D", [datetime.date(2000, 2, 3)]),
        make_column("G", ["04:00"]),
        make_column("E", ["e"]),
    )
    assert_result_equals(result, ArrowRenderResult(expected_table))
Example #7
0
def test_ignore_non_date_timestamps():
    """A granularity left over for a column that is no longer a date is ignored."""
    # How a user reaches this state:
    # 1. Make a date column, 'A'
    # 2. Check "Group Dates". The column appears.
    # 3. Select column 'A', and select a date granularity for it
    # 4. Alter the input DataFrame such that 'A' is no longer datetime
    #
    # Expected outcome: 'A' cannot be grouped by date any more.
    formerly_datetime = make_column("A", [1])  # "used to be a datetime"
    # B is a timestamp so that no quickfix is needed.
    timestamp_column = make_column("B", [datetime.datetime(2019, 1, 4)])
    input_table = make_table(formerly_datetime, timestamp_column)

    params = P(
        groups=dict(colnames=["A"], group_dates=True, date_granularities={"A": "T"}),
        aggregations=[dict(operation="size", colname="", outname="size")],
    )

    result = render(input_table, params)

    expected_table = make_table(
        make_column("A", [1]),
        make_column("size", [1], format="{:,d}"),
    )
    expected_errors = [RenderError(i18n_message("group_dates.select_date_columns"))]
    assert_result_equals(result, ArrowRenderResult(expected_table, expected_errors))
Example #8
0
def test_aggregate_numbers():
    """Each numeric aggregation yields the expected values and number format."""
    input_table = make_table(
        make_column("A", [2, 1, 2, 2], format="{:.2f}"),
        make_column("B", [1, 2, 5, 1], format="{:d}"),
    )

    # SIZE names no input column; the remaining operations all read column B.
    operations_on_b = [
        (Operation.NUNIQUE, "nunique"),
        (Operation.SUM, "sum"),
        (Operation.MEAN, "mean"),
        (Operation.MEDIAN, "median"),
        (Operation.MIN, "min"),
        (Operation.MAX, "max"),
        (Operation.FIRST, "first"),
    ]
    aggregations = [Aggregation(Operation.SIZE, "", "size")]
    aggregations.extend(
        Aggregation(operation, "B", outname) for operation, outname in operations_on_b
    )

    actual = groupby(input_table, [Group("A", None)], aggregations)

    int_format = "{:,d}"
    default_format = "{:,}"
    format_of_b = "{:d}"
    expected = make_table(
        make_column("A", [1, 2], format="{:.2f}"),  # keeps A's format
        make_column("size", [1, 3], format=int_format),
        make_column("nunique", [1, 2], format=int_format),
        make_column("sum", [2, 7], format=format_of_b),
        make_column("mean", [2, 7 / 3], format=default_format),
        make_column("median", [2.0, 1.0], format=default_format),
        make_column("min", [2, 1], format=format_of_b),
        make_column("max", [2, 5], format=format_of_b),
        make_column("first", [2, 1], format=format_of_b),
    )
    assert_arrow_table_equals(actual, expected)
Example #9
0
def test_group_date_prompt_all_is_well_when_date_column_present():
    """Grouping by a date column reports "group_dates.date_selected"."""
    input_table = make_table(
        make_column("A", [datetime.date(2021, 5, 10)], unit="week"),
        make_column("B", [1]),
    )
    params = P(
        groups=dict(colnames=["A", "B"], group_dates=True, date_granularities={}),
        aggregations=[dict(operation="size", colname="", outname="size")],
    )

    result = render(input_table, params)

    expected_table = make_table(
        make_column("A", [datetime.date(2021, 5, 10)], unit="week"),
        make_column("B", [1]),
        make_column("size", [1], format="{:,d}"),
    )
    # The message names the single date column and its unit.
    date_selected = i18n_message(
        "group_dates.date_selected",
        dict(columns=1, column0="A", unit0="week"),
    )
    expected = ArrowRenderResult(expected_table, [RenderError(date_selected)])
    assert_result_equals(result, expected)
Example #10
0
def test_do_not_multiply_categories():
    """Only observed combinations of dictionary-encoded group keys are output."""
    # By default Pandas multiplies categoricals out when grouping; for this
    # input that would produce four rows:
    #
    #     a, c
    #     a, d
    #     b, c
    #     b, d
    #
    # ... although (a, d) and (b, c) never occur in the data.
    #
    # See https://github.com/pandas-dev/pandas/issues/17594. The solution
    # is .groupby(..., observed=True).
    input_table = make_table(
        make_column("A", ["a", "b", "a"], dictionary=True),
        make_column("B", ["c", "d", "d"], dictionary=True),
        make_column("C", [1, 2, 3]),
    )
    group_keys = [Group("A", None), Group("B", None)]
    sum_of_c = [Aggregation(Operation.SUM, "C", "X")]

    actual = groupby(input_table, group_keys, sum_of_c)

    expected = make_table(
        make_column("A", ["a", "a", "b"], dictionary=True),
        make_column("B", ["c", "d", "d"], dictionary=True),
        make_column("X", [1, 3, 2]),
    )
    assert_arrow_table_equals(actual, expected)
Example #11
0
def test_aggregate_text_category_values():
    """Aggregating a dictionary-encoded text column keeps it dictionary-encoded."""
    input_table = make_table(
        make_column("A", [1, 1, 1]),
        make_column("B", ["a", "b", "a"], dictionary=True),
    )
    requested = [
        (Operation.SIZE, "size"),
        (Operation.NUNIQUE, "nunique"),
        (Operation.MIN, "min"),
        (Operation.MAX, "max"),
        (Operation.FIRST, "first"),
    ]
    aggregations = [
        Aggregation(operation, "B", outname) for operation, outname in requested
    ]

    actual = groupby(input_table, [Group("A", None)], aggregations)

    expected = make_table(
        make_column("A", [1]),
        make_column("size", [3], format="{:,d}"),
        make_column("nunique", [2], format="{:,d}"),
        make_column("min", ["a"], dictionary=True),
        make_column("max", ["b"], dictionary=True),
        make_column("first", ["a"], dictionary=True),
    )
    assert_arrow_table_equals(actual, expected)
 def test_parquet_same_data_different_bytes(self):
     """Plain and dictionary-encoded files with the same value compare equal."""
     plain_table = make_table(make_column("A", ["a"]))
     cjwparquet.write(self.old_path, plain_table)

     dictionary_table = make_table(make_column("A", ["a"], dictionary=True))
     cjwparquet.write(self.new_path, dictionary_table)

     old_result = FetchResult(self.old_path)
     new_result = FetchResult(self.new_path)
     self.assertTrue(are_fetch_results_equal(old_result, new_result))
Example #13
0
def test_render_dict_disallow_rename_to_null():
    """Renaming a column to the empty string leaves the table unchanged."""
    input_table = make_table(make_column("A", ["x"]))
    params = P(renames={"A": ""})

    result = render(input_table, params, settings=Settings())

    unchanged = ArrowRenderResult(make_table(make_column("A", ["x"])))
    assert_result_equals(result, unchanged)
Example #14
0
def test_assert_arrow_table_equals_check_date_unit():
    """assert_arrow_table_equals() must raise when only the date unit differs."""
    same_date = datetime.date(2021, 4, 1)
    day_table = make_table(make_column("A", [same_date], unit="day"))
    month_table = make_table(make_column("A", [same_date], unit="month"))
    expected_diff = r"-\{b'unit': b'month'\}\n\+\{b'unit': b'day'\}"

    with pytest.raises(AssertionError, match=expected_diff):
        assert_arrow_table_equals(day_table, month_table)
Example #15
0
def test_assert_result_equals_check_errors():
    """assert_result_equals() must raise when only the error lists differ."""
    expected_diff = r"-\[Render.*\n\+\[\]"

    with pytest.raises(AssertionError, match=expected_diff):
        without_errors = ArrowRenderResult(make_table())
        with_error = ArrowRenderResult(
            make_table(),
            errors=[RenderError(I18nMessage("foo", {}, "module"))],
        )
        assert_result_equals(without_errors, with_error)
Example #16
0
def test_render_no_outcolname_is_no_op():
    """An output with an empty outcolname leaves the table unchanged."""
    input_table = make_table(make_column("A", [dt()]))
    params = P(colname="A", outputs=[dict(outcolname="", part="date")])

    result = render(input_table, params)

    unchanged = ArrowRenderResult(make_table(make_column("A", [dt()])))
    assert_result_equals(result, unchanged)
Example #17
0
def test_render_rename_custom_list_empty_is_no_op():
    """custom_list=True with an empty list_string leaves the table unchanged."""
    input_table = make_table(make_column("A", ["x"]))
    params = P(custom_list=True, list_string="")

    result = render(input_table, params, settings=Settings())

    unchanged = ArrowRenderResult(make_table(make_column("A", ["x"])))
    assert_result_equals(result, unchanged)
Example #18
0
def test_dictionary_no_op():
    """Rendering a dictionary-encoded column returns it unchanged."""
    input_table = make_table(make_column("A", ["a", "b"], dictionary=True))

    result = render(input_table, P(colnames=["A"]))

    unchanged = make_table(make_column("A", ["a", "b"], dictionary=True))
    assert_result_equals(result, ArrowRenderResult(unchanged))
Example #19
0
def test_no_colnames():
    """With no group columns, the aggregation covers the whole table."""
    input_table = make_table(make_column("A", [1, 2]))
    no_groups = []
    sum_of_a = [Aggregation(Operation.SUM, "A", "X")]

    actual = groupby(input_table, no_groups, sum_of_a)

    assert_arrow_table_equals(actual, make_table(make_column("X", [3])))
Example #20
0
def test_sum_float():
    """Summing floats skips the null and keeps the input column's format."""
    input_table = make_table(make_column("A", [1.0, None, 3.0], format="{:d}"))
    sum_of_a = [Aggregation(Operation.SUM, "A", "sum")]

    actual = groupby(input_table, [], sum_of_a)

    expected = make_table(make_column("sum", [4.0], format="{:d}"))
    assert_arrow_table_equals(actual, expected)
Example #21
0
def test_sum_int8_does_not_overflow():
    """Summing int8 values 127 and 1 must give 128, beyond the int8 range."""
    int8_column = make_column("A", [127, 1], pa.int8(), format="{:d}")
    input_table = make_table(int8_column)
    sum_of_a = [Aggregation(Operation.SUM, "A", "sum")]

    actual = groupby(input_table, [], sum_of_a)

    expected = make_table(make_column("sum", [128], format="{:d}"))
    assert_arrow_table_equals(actual, expected)
Example #22
0
    def test_render_with_no_kwargs(self):
        """A render(table, params) function with no keyword arguments is callable."""

        def render(table, params):
            return table * params["n"]

        schema = ParamSchema.Dict({"n": ParamSchema.Float()})
        with ModuleTestEnv(param_schema=schema, render=render) as env:
            input_table = make_table(make_column("A", [1]))
            outcome = env.call_render(input_table, {"n": 2})

            actual = outcome.read_table()
            expected = make_table(make_column("A", [2]))
            assert_arrow_table_equals(actual, expected)
Example #23
0
def test_maximum_no_outcolname():
    """A "maximum" operation with an empty outcolname leaves the table unchanged."""
    input_table = make_table(make_column("A", [1, 2, 3], pa.timestamp(unit="ns")))
    params = P(operation="maximum", colnames=["A"], outcolname="")

    result = render(input_table, params)

    unchanged = make_table(make_column("A", [1, 2, 3], pa.timestamp(unit="ns")))
    assert_result_equals(result, ArrowRenderResult(unchanged))
Example #24
0
    def test_render_empty_file_fetch_result_is_parquet(self):
        """An empty fetched file reaches render() as an empty DataFrame."""

        def render(table, params, *, fetch_result):
            assert_frame_equal(fetch_result.dataframe, pd.DataFrame({}))
            return fetch_result.dataframe

        # A single `with` acquires the environment and then the temp file
        # created inside its basedir, exactly as two nested blocks would.
        with ModuleTestEnv(render=render) as env, tempfile_context(
            dir=env.basedir
        ) as empty_file:
            fetched = FetchResult(empty_file)
            outcome = env.call_render(make_table(), {}, fetch_result=fetched)

            actual = outcome.read_table()
            self.assertEqual(actual, make_table())
Example #25
0
def test_size():
    """SIZE counts the rows of each group and uses the "{:,d}" format."""
    input_table = make_table(make_column("A", [1, 1, 2]))
    group_by_a = [Group("A", None)]
    count_rows = [Aggregation(Operation.SIZE, "", "X")]

    actual = groupby(input_table, group_by_a, count_rows)

    expected = make_table(
        make_column("A", [1, 2]),
        make_column("X", [2, 1], format="{:,d}"),
    )
    assert_arrow_table_equals(actual, expected)
 def render_arrow_v1(table, params, *, tab_outputs, **kwargs):
     """Check the "tab-x" param and tab output, then return an empty result.

     Uses `self` from the enclosing scope for its assertions.
     """
     self.assertEqual(params["tab"], "tab-x")

     tab_x = tab_outputs["tab-x"]
     self.assertEqual(tab_x.tab_name, "Tab X")

     actual_table = tab_x.table
     expected_table = make_table(
         make_column("X", [1], format="{:,d}"),
         make_column("Y", ["y"]),
     )
     assert_arrow_table_equals(actual_table, expected_table)

     return ArrowRenderResult(make_table())
 def render_arrow_v1(table, params, **kwargs):
     """Check that the input table has the expected columns; return an empty result.

     Reads `now` from the enclosing scope for column C's expected value.
     """
     expected_columns = [
         make_column("A", ["x"]),
         make_column("B", [1], format="{:,.3f}"),
         make_column("C", [now]),
         make_column("D", [date(2021, 4, 12)], unit="week"),
     ]
     assert_arrow_table_equals(table, make_table(*expected_columns))
     return ArrowRenderResult(make_table())
Example #28
0
    def test_render_with_parquet_fetch_result(self):
        """Returning a Parquet-backed fetch_result from render() outputs its data."""

        def render(table, params, *, fetch_result):
            return fetch_result

        # A single `with` acquires the environment and then the Parquet file
        # created inside its basedir, exactly as two nested blocks would.
        with ModuleTestEnv(render=render) as env, parquet_file(
            {"A": ["fetched"]}, dir=env.basedir
        ) as fetched_path:
            fetched = FetchResult(fetched_path)
            outcome = env.call_render(make_table(), {}, fetch_result=fetched)

            actual = outcome.read_table()
            expected = make_table(make_column("A", ["fetched"]))
            assert_arrow_table_equals(actual, expected)
Example #29
0
    def test_render_use_input_columns_as_try_fallback_columns(self):
        """Output column A is expected to carry the input column's format."""

        def render(table, params):
            return pd.DataFrame({"A": [2]})

        with ModuleTestEnv(render=render) as env:
            input_table = make_table(make_column("A", [1], format="{:,.3f}"))
            outcome = env.call_render(input_table, {})

            actual = outcome.read_table()
            expected = make_table(make_column("A", [2], format="{:,.3f}"))
            # NOTE(review): assertEqual relies on the table's own equality,
            # which presumably ignores field metadata such as `format`. If so,
            # this does not verify the fallback format -- confirm whether
            # assert_arrow_table_equals should be used here instead.
            self.assertEqual(actual, expected)
Example #30
0
    def test_render_arrow_table_empty_output_table_is_empty(self):
        """Writing a zero-column table gives an empty result and an empty table."""

        # The param name "arrow_table" is a special case
        def render(arrow_table, params, output_path, **kwargs):
            out = pa.table({})
            with pa.ipc.RecordBatchFileWriter(output_path, out.schema) as writer:
                writer.write_table(out)

        with ModuleTestEnv(render=render) as env:
            empty_input = make_table()
            outcome = env.call_render(empty_input, {})

            self.assertEqual(outcome.result, RenderResult())

            actual_table = outcome.read_table()
            self.assertEqual(actual_table, make_table())