Ejemplo n.º 1
0
    def __init__(self,
                 table: Table,
                 agg_funs: Iterable[Functions],
                 row_var: Variable,
                 col_var: Variable = None,
                 val_var: Variable = None):
        """Store the pivot inputs and compute the row/column groups.

        Args:
            table: Input data. When falsy, only the empty table containers,
                the variable references and ``renamed`` are set and the
                constructor returns early.
            agg_funs: Aggregation functions; stored sorted and also passed
                on to ``_initialize``.
            row_var: Row variable; must be primitive.
            col_var: Column variable; ``row_var`` is used when it is not
                given. Must be discrete.
            val_var: Passed on to ``_initialize`` together with ``agg_funs``.

        Raises:
            TypeError: If the row variable is not primitive or the column
                variable is not discrete.
        """
        self._group_tables = self.Tables()
        self._pivot_tables = self.Tables()
        self._table = table
        self._row_var = row_var
        self._col_var = col_var if col_var else row_var
        self.renamed = []

        if not table:
            return
        if not self._row_var.is_primitive():
            raise TypeError("Row variable should be DiscreteVariable"
                            " or ContinuousVariable")
        if self._col_var and not self._col_var.is_discrete:
            raise TypeError("Column variable should be DiscreteVariable")

        # Use the builtin ``float``: the ``np.float`` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        self._row_var_col = table.get_column_view(row_var)[0].astype(float)
        self._col_var_col = table.get_column_view(self._col_var)[0].astype(
            float)
        self._row_var_groups = nanunique(self._row_var_col)
        self._col_var_groups = nanunique(self._col_var_col)

        self._total_var = DiscreteVariable("Total", values=("total", ))
        self._current_agg_functions = sorted(agg_funs)
        self._indepen_agg_done = {}  # type: Dict[Functions, int]
        self._depen_agg_done = {}  # type: Dict[Functions, Dict[Variable, int]]

        self._initialize(agg_funs, val_var)
Ejemplo n.º 2
0
    def test_nanunique_ignores_nans_in_counts(self, array):
        """The counts returned by nanunique must not include NaN entries."""
        # pylint: disable=bad-whitespace
        rows = [[-1., 1., 0., 2., 3., np.nan],
                [0., 0., 0., 3., 5., np.nan],
                [-1., 0., 0., 1., 7., 6.]]
        x = array(rows)
        # One count per distinct non-NaN value, in ascending value order.
        expected = [2, 6, 2, 1, 2, 1, 1, 1]

        result = nanunique(x, return_counts=True)
        np.testing.assert_equal(result[1], expected)
Ejemplo n.º 3
0
    def test_nanunique_ignores_nans_in_values(self, array):
        """The unique values returned by nanunique must not contain NaN."""
        # pylint: disable=bad-whitespace
        rows = [[-1., 1., 0., 2., 3., np.nan],
                [0., 0., 0., 3., 5., np.nan],
                [-1., 0., 0., 1., 7., 6.]]
        x = array(rows)
        expected = [-1, 0, 1, 2, 3, 5, 6, 7]

        unique_values = nanunique(x, return_counts=False)
        np.testing.assert_equal(unique_values, expected)
Ejemplo n.º 4
0
def remove_unused_values(var, data):
    """Return ``var`` reduced to the values that actually occur in ``data``.

    Args:
        var: Discrete variable whose ``values`` are to be pruned.
        data: Table from which the column of ``var`` is extracted.

    Returns:
        ``var`` itself when the number of distinct non-NaN values in the
        column equals ``len(var.values)``; otherwise a new
        ``DiscreteVariable`` with the same name that keeps only the used
        values and maps the old value indices to the new ones through a
        ``Lookup`` compute value.
    """
    column_data = Table.from_table(
        Domain([var]),
        data
    )
    unique = nanunique(column_data.X).astype(int)
    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # Old value index -> new value index; unused values stay NaN.
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    translation_table = np.array([np.nan] * len(var.values))
    translation_table[unique] = range(len(used_values))

    base_value = -1
    # Only a real index into ``var.values`` can be translated. The previous
    # check ``0 >= var.base_value`` was inverted: it let -1 (no base value)
    # through, indexing the last table entry, and rejected every valid
    # positive index.
    if 0 <= var.base_value < len(var.values):
        base = translation_table[var.base_value]
        # The old base value may itself be unused, in which case it is NaN.
        if np.isfinite(base):
            base_value = int(base)

    return DiscreteVariable("{}".format(var.name),
                            values=used_values,
                            base_value=base_value,
                            compute_value=Lookup(var, translation_table),
                            sparse=var.sparse,
                            )
Ejemplo n.º 5
0
    def test_nanunique_ignores_nans_in_counts(self, array):
        """NaN cells must not contribute to the counts from nanunique."""
        # pylint: disable=bad-whitespace
        nan = np.nan
        x = array([
            [-1., 1., 0., 2., 3., nan],
            [ 0., 0., 0., 3., 5., nan],
            [-1., 0., 0., 1., 7.,  6.],
        ])
        # Occurrences of -1, 0, 1, 2, 3, 5, 6 and 7 respectively.
        expected = [2, 6, 2, 1, 2, 1, 1, 1]

        outcome = nanunique(x, return_counts=True)
        counts = outcome[1]
        np.testing.assert_equal(counts, expected)
Ejemplo n.º 6
0
    def test_nanunique_ignores_nans_in_values(self, array):
        """NaN must not appear among the values returned by nanunique."""
        # pylint: disable=bad-whitespace
        nan = np.nan
        x = array([
            [-1., 1., 0., 2., 3., nan],
            [ 0., 0., 0., 3., 5., nan],
            [-1., 0., 0., 1., 7.,  6.],
        ])
        expected = [-1, 0, 1, 2, 3, 5, 6, 7]

        values = nanunique(x, return_counts=False)
        np.testing.assert_equal(values, expected)
Ejemplo n.º 7
0
    def commit(self):
        """Refresh warnings, (re)build the pivot if needed and send outputs.

        When no pivot is cached, the inputs are validated first; on any
        failed check the matching warning is shown, ``None`` is sent on all
        three outputs and the method returns without building a pivot.
        """
        def emit(pivot_table, filtered_data, grouped_data):
            # Non-empty outputs take over the name of the input data.
            if self.data:
                for output in (grouped_data, pivot_table, filtered_data):
                    if output:
                        output.name = self.data.name
            self.Outputs.grouped_data.send(grouped_data)
            self.Outputs.pivot_table.send(pivot_table)
            self.Outputs.filtered_data.send(filtered_data)

        self.Warning.renamed_vars.clear()
        self.Warning.too_many_values.clear()
        self.Warning.cannot_aggregate.clear()
        self.Warning.no_col_feature.clear()

        self.table_view.clear()

        if self.pivot is None:
            if self.data and not self.data_has_primitives:
                self.Warning.no_variables()
                emit(None, None, None)
                return

            if self.no_col_feature:
                self.Warning.no_col_feature()
                emit(None, None, None)
                return

            if self.data:
                # The row feature doubles as the column feature when no
                # column feature is selected.
                col_var = self.col_feature or self.row_feature
                column = self.data.get_column_view(col_var)[0].astype(float)
                n_distinct = len(nanunique(column))
                if n_distinct >= self.MAX_VALUES:
                    self.table_view.clear()
                    self.Warning.too_many_values()
                    emit(None, None, None)
                    return

            self.pivot = Pivot(
                self.data,
                self.sel_agg_functions,
                self.row_feature,
                self.col_feature,
                self.val_feature,
            )

        if self.skipped_aggs:
            self.Warning.cannot_aggregate(self.skipped_aggs)
        self._update_graph()

        emit(self.pivot.pivot_table,
             self.get_filtered_data(),
             self.pivot.group_table)

        if self.pivot.renamed:
            self.Warning.renamed_vars(self.pivot.renamed)
0
def remove_unused_values(var, data):
    """Return ``var`` reduced to the values that actually occur in ``data``.

    Args:
        var: Discrete variable whose ``values`` are to be pruned.
        data: Table providing the column of ``var`` via ``get_column_view``.

    Returns:
        ``var`` itself when the number of distinct non-NaN values in the
        column equals ``len(var.values)``; otherwise a new
        ``DiscreteVariable`` with the same name that keeps only the used
        values and maps the old value indices to the new ones through a
        ``Lookup`` compute value.
    """
    unique = nanunique(data.get_column_view(var)[0].astype(float)).astype(int)
    if len(unique) == len(var.values):
        return var
    used_values = [var.values[i] for i in unique]
    # Old value index -> new value index; unused values stay NaN.
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    translation_table = np.array([np.nan] * len(var.values))
    translation_table[unique] = range(len(used_values))
    return DiscreteVariable(var.name,
                            values=used_values,
                            sparse=var.sparse,
                            compute_value=Lookup(var, translation_table))
Ejemplo n.º 9
0
def remove_unused_values(var, data):
    """Return ``var`` reduced to the values that actually occur in ``data``.

    Args:
        var: Discrete variable whose ``values`` are to be pruned.
        data: Table from which the column of ``var`` is extracted.

    Returns:
        ``var`` itself when the number of distinct non-NaN values in the
        column equals ``len(var.values)``; otherwise a new
        ``DiscreteVariable`` with the same name that keeps only the used
        values and maps the old value indices to the new ones through a
        ``Lookup`` compute value.
    """
    column_data = Table.from_table(Domain([var]), data)
    unique = nanunique(column_data.X).astype(int)
    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # Old value index -> new value index; unused values stay NaN.
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    translation_table = np.array([np.nan] * len(var.values))
    translation_table[unique] = range(len(used_values))

    return DiscreteVariable("{}".format(var.name),
                            values=used_values,
                            compute_value=Lookup(var, translation_table),
                            sparse=var.sparse)
Ejemplo n.º 10
0
    def __init__(self,
                 data,
                 variable,
                 parent=None,
                 height=200,
                 width=300,
                 side_padding=5,
                 top_padding=20,
                 bottom_padding=0,
                 bar_spacing=4,
                 border=0,
                 border_color=None,
                 color_attribute=None,
                 n_bins=10):
        """Build the histogram item for one variable of ``data``.

        Args:
            data: Table-like object; it is indexed through ``data.domain`` and
                read with ``data.get_column_view``.
            variable: Key into ``data.domain`` selecting the plotted attribute.
            parent: Forwarded to the base-class constructor.
            height: Total item height.
            width: Total item width.
            side_padding: Padding applied on both horizontal sides.
            top_padding: Padding above the plot area.
            bottom_padding: Padding below the plot area.
            bar_spacing: Spacing set on the horizontal layout.
            border: Either a single width used for all four sides or a
                4-tuple unpacked as ``t, r, b, l`` (presumably top, right,
                bottom, left - confirm with callers).
            border_color: Colour of the border lines; ``'#000'`` when None.
            color_attribute: Optional key into ``data.domain``; its column is
                stored in ``self.y``.
            n_bins: Upper limit on the number of bins for a continuous
                attribute; ignored for discrete attributes.
        """
        super().__init__(parent)
        self.height, self.width = height, width
        self.padding = side_padding
        self.bar_spacing = bar_spacing

        self.data = data
        self.attribute = data.domain[variable]

        # Keep the NaN mask: the same rows are dropped from ``self.y`` below.
        self.x = data.get_column_view(self.attribute)[0].astype(np.float64)
        self.x_nans = np.isnan(self.x)
        self.x = self.x[~self.x_nans]

        # NOTE(review): ``self.n_bins`` is only assigned for discrete or
        # continuous attributes, yet it is read unconditionally further down.
        if self.attribute.is_discrete:
            self.n_bins = len(self.attribute.values)
        elif self.attribute.is_continuous:
            # If the attribute is continuous but contains fewer values than the
            # bins, it is better to assign each their own bin. We will require
            # at least 2 bins so that the histogram still visually makes sense
            # except if there is only a single value, then we use 3 bins for
            # symmetry
            num_unique = ut.nanunique(self.x).shape[0]
            if num_unique == 1:
                self.n_bins = 3
            else:
                self.n_bins = min(max(2, num_unique), n_bins)

        # Handle target variable index
        self.color_attribute = color_attribute
        if self.color_attribute is not None:
            self.target_var = data.domain[color_attribute]
            self.y = data.get_column_view(color_attribute)[0]
            # Drop the rows where the plotted attribute is NaN.
            self.y = self.y[~self.x_nans]
            if not np.issubdtype(self.y.dtype, np.number):
                self.y = self.y.astype(np.float64)
        else:
            self.target_var, self.y = None, None

        # Borders
        self.border_color = border_color if border_color is not None else '#000'
        if isinstance(border, tuple):
            assert len(border) == 4, 'Border tuple must be of size 4.'
            self.border = border
        else:
            self.border = (border, border, border, border)
        t, r, b, l = self.border

        def _draw_border(point_1, point_2, border_width, parent):
            # Create a line item between the two points as a child of ``parent``.
            pen = QPen(QColor(self.border_color))
            pen.setCosmetic(True)
            pen.setWidth(border_width)
            line = QGraphicsLineItem(QLineF(point_1, point_2), parent)
            line.setPen(pen)
            return line

        top_left = QPointF(0, 0)
        bottom_left = QPointF(0, self.height)
        top_right = QPointF(self.width, 0)
        bottom_right = QPointF(self.width, self.height)

        # A side with a falsy width gets no line item at all.
        self.border_top = _draw_border(top_left, top_right, t,
                                       self) if t else None
        self.border_bottom = _draw_border(bottom_left, bottom_right, b,
                                          self) if b else None
        self.border_left = _draw_border(top_left, bottom_left, l,
                                        self) if l else None
        self.border_right = _draw_border(top_right, bottom_right, r,
                                         self) if r else None

        # _plot_`dim` accounts for all the paddings and spacings
        self._plot_height = self.height
        self._plot_height -= top_padding + bottom_padding
        self._plot_height -= t / 4 + b / 4

        self._plot_width = self.width
        self._plot_width -= 2 * side_padding
        self._plot_width -= (self.n_bins - 2) * bar_spacing
        self._plot_width -= l / 4 + r / 4

        # NOTE(review): Qt's setContentsMargins takes (left, top, right,
        # bottom), but the first argument uses the right border width ``r`` and
        # the third the left width ``l`` - confirm whether this is intended.
        self.__layout = QGraphicsLinearLayout(Qt.Horizontal, self)
        self.__layout.setContentsMargins(side_padding + r / 2,
                                         top_padding + t / 2,
                                         side_padding + l / 2,
                                         bottom_padding + b / 2)
        self.__layout.setSpacing(bar_spacing)

        # If the data contains any non-NaN values, we can draw a histogram
        if self.x.size > 0:
            self.edges, self.distributions = self._histogram()
            self._draw_histogram()
Ejemplo n.º 11
0
 def test_nanunique(self):
     """nanunique on a sparse matrix returns the distinct non-NaN values."""
     sparse_input = csr_matrix(np.array([0, 1, 1, np.nan]))
     expected = np.array([0, 1])
     unique_values = nanunique(sparse_input)
     np.testing.assert_array_equal(unique_values, expected)
Ejemplo n.º 12
0
    def __init__(self, data, variable, parent=None, height=200,
                 width=300, side_padding=5, top_padding=20, bar_spacing=4,
                 border=0, border_color=None, color_attribute=None, n_bins=10):
        """Build the histogram item for one variable of ``data``.

        Args:
            data: Table-like object; it is indexed through ``data.domain`` and
                read with ``data.get_column_view``.
            variable: Key into ``data.domain`` selecting the plotted attribute.
            parent: Forwarded to the base-class constructor.
            height: Total item height.
            width: Total item width.
            side_padding: Padding applied on both horizontal sides.
            top_padding: Padding above the plot area.
            bar_spacing: Spacing set on the horizontal layout.
            border: Either a single width used for all four sides or a
                4-tuple unpacked as ``t, r, b, l`` (presumably top, right,
                bottom, left - confirm with callers).
            border_color: Colour of the border lines; ``'#000'`` when None.
            color_attribute: Optional key into ``data.domain``; its column is
                stored in ``self.y``.
            n_bins: Upper limit on the number of bins for a continuous
                attribute; ignored for discrete attributes.
        """
        super().__init__(parent)
        self.height, self.width = height, width
        self.padding = side_padding
        self.bar_spacing = bar_spacing

        self.data = data
        self.attribute = data.domain[variable]

        # Keep the NaN mask: the same rows are dropped from ``self.y`` below.
        self.x = data.get_column_view(self.attribute)[0].astype(np.float64)
        self.x_nans = np.isnan(self.x)
        self.x = self.x[~self.x_nans]

        # NOTE(review): ``self.n_bins`` is only assigned for discrete or
        # continuous attributes, yet it is read unconditionally further down.
        if self.attribute.is_discrete:
            self.n_bins = len(self.attribute.values)
        elif self.attribute.is_continuous:
            # If the attribute is continuous but contains fewer values than the
            # bins, it is better to assign each their own bin. We will require
            # at least 2 bins so that the histogram still visually makes sense
            # except if there is only a single value, then we use 3 bins for
            # symmetry
            num_unique = ut.nanunique(self.x).shape[0]
            if num_unique == 1:
                self.n_bins = 3
            else:
                self.n_bins = min(max(2, num_unique), n_bins)

        # Handle target variable index
        self.color_attribute = color_attribute
        if self.color_attribute is not None:
            self.target_var = data.domain[color_attribute]
            self.y = data.get_column_view(color_attribute)[0]
            # Drop the rows where the plotted attribute is NaN.
            self.y = self.y[~self.x_nans]
            if not np.issubdtype(self.y.dtype, np.number):
                self.y = self.y.astype(np.float64)
        else:
            self.target_var, self.y = None, None

        # Borders
        self.border_color = border_color if border_color is not None else '#000'
        if isinstance(border, tuple):
            assert len(border) == 4, 'Border tuple must be of size 4.'
            self.border = border
        else:
            self.border = (border, border, border, border)
        t, r, b, l = self.border

        def _draw_border(point_1, point_2, border_width, parent):
            # Create a line item between the two points as a child of ``parent``.
            pen = QPen(QColor(self.border_color))
            pen.setCosmetic(True)
            pen.setWidth(border_width)
            line = QGraphicsLineItem(QLineF(point_1, point_2), parent)
            line.setPen(pen)
            return line

        top_left = QPointF(0, 0)
        bottom_left = QPointF(0, self.height)
        top_right = QPointF(self.width, 0)
        bottom_right = QPointF(self.width, self.height)

        # A side with a falsy width gets no line item at all.
        self.border_top = _draw_border(top_left, top_right, t, self) if t else None
        self.border_bottom = _draw_border(bottom_left, bottom_right, b, self) if b else None
        self.border_left = _draw_border(top_left, bottom_left, l, self) if l else None
        self.border_right = _draw_border(top_right, bottom_right, r, self) if r else None

        # _plot_`dim` accounts for all the paddings and spacings
        self._plot_height = self.height
        self._plot_height -= top_padding
        self._plot_height -= t / 4 + b / 4

        self._plot_width = self.width
        self._plot_width -= 2 * side_padding
        self._plot_width -= (self.n_bins - 2) * bar_spacing
        self._plot_width -= l / 4 + r / 4

        # NOTE(review): Qt's setContentsMargins takes (left, top, right,
        # bottom), but the first argument uses the right border width ``r`` and
        # the third the left width ``l`` - confirm whether this is intended.
        self.__layout = QGraphicsLinearLayout(Qt.Horizontal, self)
        self.__layout.setContentsMargins(
            side_padding + r / 2,
            top_padding + t / 2,
            side_padding + l / 2,
            b / 2
        )
        self.__layout.setSpacing(bar_spacing)

        # If the data contains any non-NaN values, we can draw a histogram
        if self.x.size > 0:
            self.edges, self.distributions = self._histogram()
            self._draw_histogram()
Ejemplo n.º 13
0
 def test_nanunique(self):
     """The NaN entry of a sparse input is left out of the unique values."""
     dense = np.array([0, 1, 1, np.nan])
     x = csr_matrix(dense)
     np.testing.assert_array_equal(nanunique(x), np.array([0, 1]))
Ejemplo n.º 14
0
def remove_unused_values(var, data):
    """Return ``var`` reduced to the values that actually occur in ``data``.

    ``var`` itself is returned when the number of distinct non-NaN values in
    its column equals ``len(var.values)``; otherwise a new
    ``DiscreteVariable`` with the same name and ``sparse`` flag is built from
    the used values only.
    """
    column = data.get_column_view(var)[0]
    present = nanunique(column.astype(float)).astype(int)
    if len(present) != len(var.values):
        kept_values = [var.values[index] for index in present]
        return DiscreteVariable(var.name, values=kept_values,
                                sparse=var.sparse)
    return var