Example #1
def test_aggregate_numbers():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [2, 1, 2, 2], format="{:.2f}"),
                make_column("B", [1, 2, 5, 1], format="{:d}"),
            ),
            [Group("A", None)],
            [
                Aggregation(Operation.SIZE, "", "size"),
                Aggregation(Operation.NUNIQUE, "B", "nunique"),
                Aggregation(Operation.SUM, "B", "sum"),
                Aggregation(Operation.MEAN, "B", "mean"),
                Aggregation(Operation.MEDIAN, "B", "median"),
                Aggregation(Operation.MIN, "B", "min"),
                Aggregation(Operation.MAX, "B", "max"),
                Aggregation(Operation.FIRST, "B", "first"),
            ],
        ),
        make_table(
            make_column("A", [1, 2], format="{:.2f}"),  # format from A
            make_column("size", [1, 3], format="{:,d}"),  # int format
            make_column("nunique", [1, 2], format="{:,d}"),  # int format
            make_column("sum", [2, 7], format="{:d}"),  # format from B
            make_column("mean", [2, 7 / 3], format="{:,}"),  # default format
            make_column("median", [2.0, 1.0], format="{:,}"),  # default format
            make_column("min", [2, 1], format="{:d}"),  # format from B
            make_column("max", [2, 5], format="{:d}"),  # format from B
            make_column("first", [2, 1], format="{:d}"),  # format from B
        ),
    )
Example #2
    def test_execute_new_revision(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        create_module_zipfile(
            "mod",
            spec_kwargs={"loads_data": True},
            python_code=(
                "import pandas as pd\n"
                'def render(table, params): return pd.DataFrame({"B": [2]})'
            ),
        )
        step = tab.steps.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=2,
            module_id_name="mod",
        )
        # stale: cached at delta 1, but last_relevant_delta_id is 2
        write_to_rendercache(workflow, step, 1,
                             make_table(make_column("A", ["a"])))

        self._execute(workflow)

        step.refresh_from_db()

        with open_cached_render_result(step.cached_render_result) as result:
            assert_arrow_table_equals(result.table,
                                      make_table(make_column("B", [2])))
Example #3
    def test_fetch_integration(self, send_update, queue_render):
        queue_render.side_effect = async_value(None)
        send_update.side_effect = async_value(None)
        workflow = Workflow.create_and_init()
        create_module_zipfile(
            "mod",
            python_code=(
                "import pandas as pd\n"
                "def fetch(params): return pd.DataFrame({'A': [1]})\n"
                "def render(table, params): return table"
            ),
        )
        step = workflow.tabs.first().steps.create(order=0,
                                                  slug="step-1",
                                                  module_id_name="mod")
        cjwstate.modules.init_module_system()
        now = datetime.datetime.now()
        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                fetch.fetch(workflow_id=workflow.id, step_id=step.id, now=now))
        step.refresh_from_db()
        so = step.stored_objects.get(stored_at=step.stored_data_version)
        with s3.temporarily_download(s3.StoredObjectsBucket,
                                     so.key) as parquet_path:
            # fetch results are stored without a schema. Let's hard-code a
            # schema simply so we can test that the table data is the same.
            table = read_parquet_as_arrow(parquet_path,
                                          [Column("A", ColumnType.Number())])
            assert_arrow_table_equals(table, make_table(make_column("A", [1])))

        workflow.refresh_from_db()
        queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
        send_update.assert_called()
Example #4
    def test_to_arrow_normal_dataframe(self):
        fd, filename = tempfile.mkstemp()
        try:
            process_result = ProcessResult.coerce(pd.DataFrame({"A": [1, 2]}))
            result = process_result.to_arrow(Path(filename))
            self.assertEqual(
                result,
                atypes.RenderResult(
                    [],
                    {},
                ),
            )
            with pa.ipc.open_file(filename) as reader:
                table = reader.read_all()
            assert_arrow_table_equals(
                table,
                make_table(
                    make_column(
                        # Whatever .format ProcessResult.coerce() gave
                        "A",
                        [1, 2],
                        format=process_result.columns[0].type.format,
                    )
                ),
            )
        finally:
            os.unlink(filename)
Example #5
    def test_duplicate_copies_fresh_cache(self):
        # The cache's filename depends on workflow_id and step_id.
        # Duplicating it would need more complex code :).
        table = make_table(make_column("A", [1], format="${:,.2f}"))
        write_to_rendercache(
            self.workflow,
            self.step,
            1,
            table=table,
            errors=[RenderError(I18nMessage("X", {}, None))],
            json={"foo": "bar"},
        )

        workflow2 = Workflow.objects.create()
        tab2 = workflow2.tabs.create(position=0)
        dup = self.step.duplicate_into_new_workflow(tab2)

        dup_cached_result = dup.cached_render_result
        self.assertEqual(
            dup_cached_result,
            replace(
                self.step.cached_render_result,
                workflow_id=workflow2.id,
                step_id=dup.id,
                delta_id=0,
            ),
        )
        with open_cached_render_result(dup_cached_result) as result2:
            assert_arrow_table_equals(result2.table, table)
            self.assertEqual(result2.errors,
                             [RenderError(I18nMessage("X", {}, None))])
            self.assertEqual(result2.json, {"foo": "bar"})
Example #6
    def test_fetch_truncate(self):
        def fetch(params):
            return pd.DataFrame({"A": [1, 2, 3]})

        with tempfile_context(dir=self.basedir) as outfile:
            result = self._test_fetch(fetch, output_filename=outfile.name)
            self.assertEqual(
                result,
                FetchResult(
                    outfile,
                    errors=[
                        FetchError(
                            I18nMessage(
                                "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
                                {
                                    "old_number": 3,
                                    "new_number": 2
                                },
                                None,
                            ))
                    ],
                ),
            )
            assert_arrow_table_equals(
                read_parquet_as_arrow(
                    outfile, [Column("A", ColumnType.Number("{:,}"))]),
                make_table(make_column("A", [1, 2])),
            )
Example #7
def test_do_not_multiply_categories():
    # Pandas default, when given categoricals, is to multiply them out:
    # in this example, we'd get four rows:
    #
    #     a, c
    #     a, d
    #     b, c
    #     b, d
    #
    # ... even though there are no values for (a, d) or (b, c).
    #
    # See https://github.com/pandas-dev/pandas/issues/17594. The solution
    # is .groupby(..., observed=True).
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", ["a", "b", "a"], dictionary=True),
                make_column("B", ["c", "d", "d"], dictionary=True),
                make_column("C", [1, 2, 3]),
            ),
            [Group("A", None), Group("B", None)],
            [Aggregation(Operation.SUM, "C", "X")],
        ),
        make_table(
            make_column("A", ["a", "a", "b"], dictionary=True),
            make_column("B", ["c", "d", "d"], dictionary=True),
            make_column("X", [1, 3, 2]),
        ),
    )
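
The pitfall described in the comments is easy to reproduce with pandas alone. The following standalone sketch (plain pandas, independent of this module's groupby) shows the multiplied-out default and the observed=True fix:

import pandas as pd

df = pd.DataFrame(
    {
        "A": pd.Categorical(["a", "b", "a"]),
        "B": pd.Categorical(["c", "d", "d"]),
        "C": [1, 2, 3],
    }
)

# Default: every category combination appears, including (b, c), which
# never occurs in the data and sums to 0.
print(df.groupby(["A", "B"], observed=False)["C"].sum())

# observed=True keeps only the combinations actually present in the data:
# (a, c), (a, d), and (b, d), matching the expected table above.
print(df.groupby(["A", "B"], observed=True)["C"].sum())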
Example #8
def test_aggregate_text_category_values():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [1, 1, 1]),
                make_column("B", ["a", "b", "a"], dictionary=True),
            ),
            [Group("A", None)],
            [
                Aggregation(Operation.SIZE, "B", "size"),
                Aggregation(Operation.NUNIQUE, "B", "nunique"),
                Aggregation(Operation.MIN, "B", "min"),
                Aggregation(Operation.MAX, "B", "max"),
                Aggregation(Operation.FIRST, "B", "first"),
            ],
        ),
        make_table(
            make_column("A", [1]),
            make_column("size", [3], format="{:,d}"),
            make_column("nunique", [2], format="{:,d}"),
            make_column("min", ["a"], dictionary=True),
            make_column("max", ["b"], dictionary=True),
            make_column("first", ["a"], dictionary=True),
        ),
    )
Example #9
def test_assert_arrow_table_equals_check_date_unit():
    table1 = make_table(
        make_column("A", [datetime.date(2021, 4, 1)], unit="day"))
    table2 = make_table(
        make_column("A", [datetime.date(2021, 4, 1)], unit="month"))
    with pytest.raises(AssertionError,
                       match=r"-\{b'unit': b'month'\}\n\+\{b'unit': b'day'\}"):
        assert_arrow_table_equals(table1, table2)
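
For context, the byte-string dicts in the expected error message are Arrow field metadata, whose keys and values are always bytes. A minimal pyarrow sketch (assuming, as the assertion diff suggests, that make_column records the date unit as field metadata):

import pyarrow as pa

field = pa.field("A", pa.date32(), metadata={"unit": "day"})
print(field.metadata)  # {b'unit': b'day'}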
Example #10
def test_no_colnames():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [1, 2])),
            [],
            [Aggregation(Operation.SUM, "A", "X")],
        ),
        make_table(make_column("X", [3])),
    )
Example #11
def test_sum_float():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [1.0, None, 3.0], format="{:d}")),
            [],
            [Aggregation(Operation.SUM, "A", "sum")],
        ),
        make_table(make_column("sum", [4.0], format="{:d}")),
    )
Example #12
def test_sum_int8_does_not_overflow():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [127, 1], pa.int8(), format="{:d}")),
            [],
            [Aggregation(Operation.SUM, "A", "sum")],
        ),
        make_table(make_column("sum", [128], format="{:d}")),
    )
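
The no-overflow behavior is consistent with Arrow's own aggregation kernels, which widen narrow integer inputs to 64 bits. A quick pyarrow sketch (whether this module's sum is backed by this kernel is an assumption):

import pyarrow as pa
import pyarrow.compute as pc

# pc.sum returns an int64 scalar for signed-integer input, so 127 + 1
# yields 128 rather than wrapping to -128 as raw int8 arithmetic would.
print(pc.sum(pa.array([127, 1], pa.int8())))  # <pyarrow.Int64Scalar: 128>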
Example #13
def test_size():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [1, 1, 2])),
            [Group("A", None)],
            [Aggregation(Operation.SIZE, "", "X")],
        ),
        make_table(make_column("A", [1, 2]),
                   make_column("X", [2, 1], format="{:,d}")),
    )
Example #14
    def test_render_with_no_kwargs(self):
        def render(table, params):
            return table * params["n"]

        param_schema = ParamSchema.Dict({"n": ParamSchema.Float()})
        with ModuleTestEnv(param_schema=param_schema, render=render) as env:
            outcome = env.call_render(make_table(make_column("A", [1])),
                                      {"n": 2})
            assert_arrow_table_equals(outcome.read_table(),
                                      make_table(make_column("A", [2])))
Example #15
    def test_render_with_parquet_fetch_result(self):
        def render(table, params, *, fetch_result):
            return fetch_result

        with ModuleTestEnv(render=render) as env:
            with parquet_file({"A": ["fetched"]}, dir=env.basedir) as pf:
                outcome = env.call_render(make_table(), {},
                                          fetch_result=FetchResult(pf))
                assert_arrow_table_equals(
                    outcome.read_table(),
                    make_table(make_column("A", ["fetched"])))
Example #16
    def render_arrow_v1(table, params, **kwargs):
        assert_arrow_table_equals(
            table,
            make_table(
                make_column("A", ["x"]),
                make_column("B", [1], format="{:,.3f}"),
                make_column("C", [now]),
                make_column("D", [date(2021, 4, 12)], unit="week"),
            ),
        )
        return ArrowRenderResult(make_table())
Example #17
    def render_arrow_v1(table, params, *, tab_outputs, **kwargs):
        self.assertEqual(params["tab"], "tab-x")
        self.assertEqual(tab_outputs["tab-x"].tab_name, "Tab X")
        assert_arrow_table_equals(
            tab_outputs["tab-x"].table,
            make_table(
                make_column("X", [1], format="{:,d}"),
                make_column("Y", ["y"]),
            ),
        )
        return ArrowRenderResult(make_table())
Example #18
def test_assert_arrow_table_equals_check_timestamp_tz():
    table1 = pa.table(
        {"A": pa.array([1617889141123456000], pa.timestamp("ns", "UTC"))})
    table2 = pa.table(
        {"A": pa.array([1617889141123456000], pa.timestamp("ns"))})
    with pytest.raises(
            AssertionError,
            match=r"-pyarrow.Field<A: timestamp\[ns\]>"
            r"\n\+pyarrow.Field<A: timestamp\[ns, tz=UTC\]>",
    ):
        assert_arrow_table_equals(table1, table2)
Example #19
    def _test_dataframe_to_arrow_table(
        self,
        dataframe: pd.DataFrame,
        columns: List[Column],
        expected_table: pa.Table,
    ) -> None:
        with tempfile_context() as path:
            dataframe_to_arrow_table(dataframe, columns, path)
            # "untrusted": more integration-test-ish
            result_table, result_columns = load_untrusted_arrow_file_with_columns(path)
            assert_arrow_table_equals(result_table, expected_table)
            self.assertEqual(result_columns, columns)  # testing the round trip
Example #20
def test_aggregate_null_timestamp_by_quarter_DEPRECATED():
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", [None], pa.timestamp("ns"))),
            [Group("A", DateGranularity.QUARTER)],
            [Aggregation(Operation.SIZE, "", "size")],
        ),
        make_table(
            make_column("A", [], pa.timestamp("ns")),
            make_column("size", [], pa.int64(), format="{:,d}"),
        ),
    )
Example #21
    def test_execute_partial_cache_hit(self):
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        # step1: cached result is fresh. Should not render.
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(workflow, step1, workflow.last_delta_id,
                             make_table(make_column("A", ["a"])))
        # step2: cached result is stale, so must be re-rendered
        step2 = tab.steps.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(
            workflow,
            step2,
            workflow.last_delta_id - 1,
            make_table(make_column("B", ["b"])),
        )

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        new_table = make_table(make_column("C", ["c"]))

        with patch.object(Kernel, "render",
                          side_effect=mock_render(new_table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result, StepResult(path, [Column("C", ColumnType.Text())]))
                assert_arrow_table_equals(load_trusted_arrow_file(path),
                                          new_table)

            Kernel.render.assert_called_once()  # step2, not step1

            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Example #22
    def test_render_with_non_parquet_fetch_result(self):
        def render(table, params, *, fetch_result):
            return pd.DataFrame({"A": [fetch_result.path.read_text()]})

        with ModuleTestEnv(render=render) as env:
            with tempfile_context(dir=env.basedir) as tf:
                tf.write_bytes(b"abcd")
                outcome = env.call_render(make_table(), {},
                                          fetch_result=FetchResult(tf))
                assert_arrow_table_equals(
                    outcome.read_table(),
                    make_table(make_column("A", ["abcd"])))
Example #23
    def test_fetch_return_dataframe(self):
        async def fetch(params):
            return pd.DataFrame({"A": ["x", "y"]})

        with tempfile_context(dir=self.basedir) as outfile:
            result = self._test_fetch(fetch, output_filename=outfile.name)

            self.assertEqual(result.errors, [])
            arrow_table = read_parquet_as_arrow(
                outfile, [Column("A", ColumnType.Text())])
            assert_arrow_table_equals(arrow_table,
                                      make_table(make_column("A", ["x", "y"])))
Example #24
    def test_render_return_column_formats(self):
        def render(table, params):
            return {
                "dataframe": pd.DataFrame({"A": [1]}),
                "column_formats": {
                    "A": "${:,d}"
                },
            }

        with ModuleTestEnv(render=render) as env:
            outcome = env.call_render(make_table(), {})
            assert_arrow_table_equals(
                outcome.read_table(),
                make_table(make_column("A", [1], format="${:,d}")))
Example #25
def test_first_in_category():
    # https://www.pivotaltracker.com/story/show/177964511
    # This crash finally inspired us, [2021-04-29], to ditch Pandas.
    #
    # The only shock is that we didn't ditch it after all the other crashes
    # that litter this test suite.
    assert_arrow_table_equals(
        groupby(
            make_table(make_column("A", ["A", "A"], dictionary=True), ),
            [],
            [Aggregation(Operation.FIRST, "A", "first")],
        ),
        make_table(make_column("first", ["A"], dictionary=True)),
    )
Example #26
def test_aggregate_text_category_values_max():
    # https://github.com/pandas-dev/pandas/issues/28641
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [1997]),
                make_column("B", ["30-SEP-97"], dictionary=True),
            ),
            [Group("A", None)],
            [Aggregation(Operation.MAX, "B", "X")],
        ),
        make_table(make_column("A", [1997]),
                   make_column("X", ["30-SEP-97"], dictionary=True)),
    )
Example #27
    def test_render_arrow_table_infer_output_column_formats(self):
        def render(arrow_table, params, output_path, *, columns, **kwargs):
            out = pa.table({"A": [1], "B": [date(2021, 4, 1)]})
            with pa.ipc.RecordBatchFileWriter(output_path, out.schema) as writer:
                writer.write_table(out)

        with ModuleTestEnv(render=render) as env:
            outcome = env.call_render(make_table(), {})
            assert_arrow_table_equals(
                outcome.read_table(),
                make_table(
                    make_column("A", [1], format="{:,}"),
                    make_column("B", [date(2021, 4, 1)], unit="day"),
                ),
            )
Example #28
def test_allow_duplicate_aggregations():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", [1, 1, 2]),
                make_column("B", [1, 2, 3]),
            ),
            [Group("A", None)],
            [
                Aggregation(Operation.MIN, "B", "X"),
                Aggregation(Operation.MIN, "B", "Y"),
            ],
        ),
        make_table(make_column("A", [1, 2]), make_column("X", [1, 3]),
                   make_column("Y", [1, 3])),
    )
Example #29
def test_aggregate_timestamp_by_year_DEPRECATED():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A",
                            [dt(2018, 1, 4),
                             dt(2019, 2, 4),
                             dt(2018, 3, 4)]), ),
            [Group("A", DateGranularity.YEAR)],
            [Aggregation(Operation.SIZE, "", "size")],
        ),
        make_table(
            make_column("A", [dt(2018, 1, 1), dt(2019, 1, 1)]),
            make_column("size", [2, 1], format="{:,d}"),
        ),
    )
Example #30
def test_multilevel_with_na_remove_unused_category():
    assert_arrow_table_equals(
        groupby(
            make_table(
                make_column("A", ["a1", "a2", "a1", "a1"], dictionary=True),
                make_column("B", ["b1", None, "b2", "b3"], dictionary=True),
            ),
            [Group("A", None), Group("B", None)],
            [Aggregation(Operation.SIZE, "", "X")],
        ),
        make_table(
            make_column("A", ["a1", "a1", "a1"], dictionary=True),
            make_column("B", ["b1", "b2", "b3"]),
            make_column("X", [1, 1, 1], format="{:,d}"),
        ),
    )