Esempio n. 1
0
    def test_only_nan_in_group(self):
        """Check aggregation output when one group holds only missing values.

        Both rows with ``A == 1`` have NaN in column ``B``, so every
        aggregation of ``B`` for that group runs on missing data only.
        """
        widget = self.widget
        domain = Domain([ContinuousVariable("A"), ContinuousVariable("B")])
        values = np.array([[1, np.nan], [2, 1], [1, np.nan], [2, 1]])
        self.send_signal(widget.Inputs.data, Table(domain, values))

        def set_all_aggregations(rows, checked):
            # Select the given rows of the aggregation table, then click
            # every checkbox until it reports the requested state.
            self.select_table_rows(widget.agg_table_view, rows)
            for box in widget.agg_checkboxes.values():
                while box.isChecked() != checked:
                    box.click()

        # feature A is the group-by attribute
        self._set_selection(widget.gb_attrs_view, [0])
        # every aggregation switched on for feature B ...
        set_all_aggregations([1], True)
        # ... and every aggregation switched off for attr A
        set_all_aggregations([0], False)

        nan = np.nan
        # (column name, value for group A == 1, value for group A == 2)
        expected = [
            ("B - Mean", nan, 1),
            ("B - Median", nan, 1),
            ("B - Mode", nan, 1),
            ("B - Standard deviation", nan, 0),
            ("B - Variance", nan, 0),
            ("B - Sum", 0, 2),
            ("B - Min. value", nan, 1),
            ("B - Max. value", nan, 1),
            ("B - Span", nan, 0),
            ("B - First value", nan, 1),
            ("B - Last value", nan, 1),
            ("B - Random value", nan, 1),
            ("B - Count defined", 0, 2),
            ("B - Count", 2, 2),
            ("B - Proportion defined", 0, 1),
            ("B - Concatenate", "", "1.0 1.0"),
            ("A", 1, 2),
        ]
        column_names, nan_group, defined_group = zip(*expected)
        expected_df = pd.DataFrame(
            [list(nan_group), list(defined_group)],
            columns=list(column_names),
        )

        output = self.get_output(widget.Outputs.data)
        output_df = table_to_frame(output, include_metas=True)
        pd.testing.assert_frame_equal(
            output_df,
            expected_df,
            check_dtype=False,
            check_column_type=False,
            check_categorical=False,
        )
Esempio n. 2
0
    def test_only_nan_in_group(self):
        """Check aggregation output when one group holds only missing values.

        Both rows with ``A == 1`` have NaN in column ``B``; the expected
        column names below are the localized aggregation labels.
        """
        domain = Domain([ContinuousVariable("A"), ContinuousVariable("B")])
        table = Table(
            domain,
            np.array([[1, np.nan], [2, 1], [1, np.nan], [2, 1]]),
        )
        self.send_signal(self.widget.Inputs.data, table)

        def force_checkboxes(table_rows, target_state):
            # Select rows in the aggregation table and click each checkbox
            # until its checked state equals ``target_state``.
            self.select_table_rows(self.widget.agg_table_view, table_rows)
            for checkbox in self.widget.agg_checkboxes.values():
                while checkbox.isChecked() != target_state:
                    checkbox.click()

        # select feature A as group-by
        self._set_selection(self.widget.gb_attrs_view, [0])
        # select all aggregations for feature B
        force_checkboxes([1], True)
        # unselect all aggregations for attr A
        force_checkboxes([0], False)

        column_names = [
            "B - 平均值",
            "B - 中位数",
            "B - 取模",
            "B - 标准差",
            "B - 方差",
            "B - 和",
            "B - 最小值",
            "B - 最大值",
            "B - 跨度",
            "B - 首值",
            "B - 末值",
            "B - 随机值",
            "B - 非缺失数量",
            "B - 数量",
            "B - 非缺失占比",
            "B - 串接(Concatenate)",
            "A",
        ]
        nan = np.nan
        # Group A == 1: five NaN statistics, the sum, six more NaN
        # statistics, then the counts, proportion, concatenation and key.
        all_missing_row = [nan] * 5 + [0] + [nan] * 6 + [0, 2, 0, "", 1]
        # Group A == 2: both values of B are 1.
        all_defined_row = [
            1, 1, 1, 0, 0, 2, 1, 1, 0, 1, 1, 1, 2, 2, 1, "1.0 1.0", 2,
        ]
        expected_df = pd.DataFrame(
            [all_missing_row, all_defined_row], columns=column_names
        )

        result = self.get_output(self.widget.Outputs.data)
        output_df = table_to_frame(result, include_metas=True)
        pd.testing.assert_frame_equal(
            output_df,
            expected_df,
            check_dtype=False,
            check_column_type=False,
            check_categorical=False,
        )
Esempio n. 3
0
    def __init__(self, table: Table, by: List[Variable]):
        """Group ``table`` by the variables listed in ``by``.

        Args:
            table: data to be grouped; kept on the instance as ``table``.
            by: variables whose names are used as the pandas group keys.
        """
        self.table = table

        frame = table_to_frame(table, include_metas=True)
        key_names = [variable.name for variable in by]
        # observed=True keeps only groups with at least one instance
        self.group_by = frame.groupby(key_names, observed=True)

        # Wrapping the bound method (instead of decorating the function on
        # the class) gives every instance its own lru_cache.
        cached = lru_cache()
        self.compute_aggregation = cached(self._compute_aggregation)
Esempio n. 4
0
    def test_aggregation(self):
        """Group by ``a`` and ``b`` and compare the aggregated table.

        Aggregations are given as names (``"mean"``), as numpy callables
        (``np.mean``) and as a plain Python callable (``"".join``).
        """
        domain = self.data.domain
        grouped = self.data.groupby([domain["a"], domain["b"]])
        aggregations = {
            domain["cvar"]: [
                ("Mean", "mean"),
                ("Median", "median"),
                ("Mean1", np.mean),
            ],
            domain["dvar"]: [("Count defined", "count"), ("Count", "size")],
            domain["svar"]: [("Concatenate", "".join)],
        }
        result = grouped.aggregate(aggregations)

        column_names = [
            "cvar - Mean",
            "cvar - Median",
            "cvar - Mean1",
            "dvar - Count defined",
            "dvar - Count",
            "svar - Concatenate",
            "a",  # groupby variables are last two in metas
            "b",
        ]
        expected_rows = [
            [0.15, 0.15, 0.15, 2, 2, "sval1sval2", 1, 1],
            [0.3, 0.3, 0.3, 1, 2, "sval2", 1, 2],
            [0.433, 0.4, 0.433, 3, 3, "sval1sval2sval1", 1, 3],
            [1.5, 1.5, 1.5, 2, 2, "sval2sval1", 2, 1],
            [-0.5, -0.5, -0.5, 2, 2, "sval2sval1", 2, 2],
            [5, 5, 5, 2, 2, "sval2sval1", 2, 3],
        ]
        expected_frame = pd.DataFrame(expected_rows, columns=column_names)

        result_frame = table_to_frame(result, include_metas=True)

        # atol matches the three decimals the expected values are given in
        pd.testing.assert_frame_equal(
            result_frame,
            expected_frame,
            check_dtype=False,
            check_column_type=False,
            check_categorical=False,
            atol=1e-3,
        )
Esempio n. 5
0
    def test_aggregation(self):
        """Check the widget output for default and for all aggregations.

        First the output produced right after sending the data is compared,
        then the group-by attributes are changed, every aggregation is
        enabled for the remaining features and the full result is compared.
        """
        widget = self.widget
        self.send_signal(widget.Inputs.data, self.data)
        initial = self.get_output(widget.Outputs.data)

        expected_x = [[1, 2.143, 0.317, 0], [2, 2, 2, 0]]
        np.testing.assert_array_almost_equal(initial.X, expected_x, decimal=3)
        expected_metas = np.array(
            [
                ["sval1 sval2 sval2 sval1 sval2 sval1", 1.0],
                ["sval2 sval1 sval2 sval1 sval2 sval1", 2.0],
            ],
            dtype=object,
        )
        np.testing.assert_array_equal(initial.metas, expected_metas)

        # group by the attributes at positions 1 and 2 of the group-by view
        # (presumably a and b, which end up last in expected_columns)
        self._set_selection(widget.gb_attrs_view, [1, 2])

        # select all aggregations for all features except a and b
        self.select_table_rows(widget.agg_table_view, [2, 3, 4])
        for box in widget.agg_checkboxes.values():
            # The unconditional first click is part of the original test and
            # is kept as is; NOTE(review): presumably it moves the checkbox
            # out of a partially checked state -- confirm.
            box.click()
            while not box.isChecked():
                box.click()

        # unselect all aggregations for attr a and b
        self.select_table_rows(widget.agg_table_view, [0, 1])
        for box in widget.agg_checkboxes.values():
            while box.isChecked():
                box.click()

        expected_columns = [
            "cvar - Mean",
            "cvar - Median",
            "cvar - Mode",
            "cvar - Standard deviation",
            "cvar - Variance",
            "cvar - Sum",
            "cvar - Min. value",
            "cvar - Max. value",
            "cvar - Span",
            "cvar - First value",
            "cvar - Last value",
            "cvar - Count defined",
            "cvar - Count",
            "cvar - Proportion defined",
            "dvar - Mode",
            "dvar - First value",
            "dvar - Last value",
            "dvar - Count defined",
            "dvar - Count",
            "dvar - Proportion defined",
            "svar - First value",
            "svar - Last value",
            "svar - Count defined",
            "svar - Count",
            "svar - Proportion defined",
            "cvar - Concatenate",
            "dvar - Concatenate",
            "svar - Concatenate",
            "a",  # groupby variables are last two in metas
            "b",
        ]

        nan = np.nan
        # Each row is laid out as: 14 cvar statistics, 6 dvar statistics,
        # 5 svar statistics, 3 concatenations, 2 group-by values.
        # fmt: off
        expected_rows = [
            [
                0.15, 0.15, 0.1, 0.07, 0.005, 0.3, 0.1, 0.2, 0.1, 0.1, 0.2, 2, 2, 1,
                "val1", "val1", "val2", 2, 2, 1,
                "sval1", "sval2", 2, 2, 1,
                "0.1 0.2", "val1 val2", "sval1 sval2",
                1, 1,
            ],
            [
                0.3, 0.3, 0.3, nan, nan, 0.3, 0.3, 0.3, 0, 0.3, 0.3, 1, 2, 0.5,
                "val2", "val2", "val2", 1, 2, 0.5,
                "", "sval2", 2, 2, 1,
                "0.3", "val2", "sval2",
                1, 2,
            ],
            [
                0.433, 0.4, 0.3, 0.153, 0.023, 1.3, 0.3, 0.6, 0.3, 0.3, 0.6, 3, 3, 1,
                "val1", "val1", "val1", 3, 3, 1,
                "sval1", "sval1", 3, 3, 1,
                "0.3 0.4 0.6", "val1 val2 val1", "sval1 sval2 sval1",
                1, 3,
            ],
            [
                1.5, 1.5, 1, 0.707, 0.5, 3, 1, 2, 1, 1, 2, 2, 2, 1,
                "val1", "val2", "val1", 2, 2, 1,
                "sval2", "sval1", 2, 2, 1,
                "1.0 2.0", "val2 val1", "sval2 sval1",
                2, 1,
            ],
            [
                -0.5, -0.5, -4, 4.95, 24.5, -1, -4, 3, 7, 3, -4, 2, 2, 1,
                "val1", "val2", "val1", 2, 2, 1,
                "sval2", "sval1", 2, 2, 1,
                "3.0 -4.0", "val2 val1", "sval2 sval1",
                2, 2,
            ],
            [
                5, 5, 5, 0, 0, 10, 5, 5, 0, 5, 5, 2, 2, 1,
                "val1", "val2", "val1", 2, 2, 1,
                "sval2", "sval1", 2, 2, 1,
                "5.0 5.0", "val2 val1", "sval2 sval1",
                2, 3,
            ],
        ]
        # fmt: on
        expected_df = pd.DataFrame(expected_rows, columns=expected_columns)

        final = self.get_output(widget.Outputs.data)
        output_df = table_to_frame(final, include_metas=True)
        # remove random since it is not possible to test
        is_random = output_df.columns.str.endswith("Random value")
        output_df = output_df.loc[:, ~is_random]

        pd.testing.assert_frame_equal(
            output_df,
            expected_df,
            check_dtype=False,
            check_column_type=False,
            check_categorical=False,
            atol=1e-3,
        )