def __init__(self, table: Table, agg_funs: Iterable[Functions],
             row_var: Variable, col_var: Variable = None,
             val_var: Variable = None):
    """Store the pivot configuration and run the initial aggregation.

    Parameters
    ----------
    table : Table
        Source data. When it is falsy only the bookkeeping attributes
        are set and the constructor returns early.
    agg_funs : Iterable[Functions]
        Aggregation functions; kept sorted in ``_current_agg_functions``
        and forwarded to ``_initialize``.
    row_var : Variable
        Variable used for rows; must be primitive.
    col_var : Variable, optional
        Variable used for columns; must be discrete. Falls back to
        ``row_var`` when omitted (or falsy).
    val_var : Variable, optional
        Forwarded unchanged to ``_initialize``.

    Raises
    ------
    TypeError
        If ``row_var`` is not primitive, or if the effective column
        variable (``col_var`` or, by fallback, ``row_var``) is not
        discrete.
    """
    self._group_tables = self.Tables()
    self._pivot_tables = self.Tables()
    self._table = table
    self._row_var = row_var
    self._col_var = col_var if col_var else row_var
    self.renamed = []

    if not table:
        return
    if not self._row_var.is_primitive():
        raise TypeError("Row variable should be DiscreteVariable"
                        " or ContinuousVariable")
    if self._col_var and not self._col_var.is_discrete:
        raise TypeError("Column variable should be DiscreteVariable")

    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
    self._row_var_col = table.get_column_view(row_var)[0].astype(float)
    self._col_var_col = table.get_column_view(self._col_var)[0].astype(
        float)
    self._row_var_groups = nanunique(self._row_var_col)
    self._col_var_groups = nanunique(self._col_var_col)

    self._total_var = DiscreteVariable("Total", values=("total", ))
    self._current_agg_functions = sorted(agg_funs)
    self._indepen_agg_done = {}  # type: Dict[Functions, int]
    self._depen_agg_done = {}  # type: Dict[Functions, Dict[Variable, int]]

    self._initialize(agg_funs, val_var)
def test_nanunique_ignores_nans_in_counts(self, array):
    """Counts returned by ``nanunique`` must not include the NaN cells."""
    # pylint: disable=bad-whitespace
    nan = np.nan
    data = array([[-1., 1., 0., 2., 3., nan],
                  [0., 0., 0., 3., 5., nan],
                  [-1., 0., 0., 1., 7., 6.]])
    counts = nanunique(data, return_counts=True)[1]
    # Occurrences of -1, 0, 1, 2, 3, 5, 6, 7 respectively.
    np.testing.assert_equal(counts, [2, 6, 2, 1, 2, 1, 1, 1])
def test_nanunique_ignores_nans_in_values(self, array):
    """Values returned by ``nanunique`` must not include NaN."""
    # pylint: disable=bad-whitespace
    nan = np.nan
    data = array([[-1., 1., 0., 2., 3., nan],
                  [0., 0., 0., 3., 5., nan],
                  [-1., 0., 0., 1., 7., 6.]])
    values = nanunique(data, return_counts=False)
    np.testing.assert_equal(values, [-1, 0, 1, 2, 3, 5, 6, 7])
def remove_unused_values(var, data):
    """Return ``var`` reduced to the values that actually occur in ``data``.

    Parameters
    ----------
    var : discrete variable
        Variable providing ``values``, ``base_value``, ``name`` and
        ``sparse``.
    data : Table
        Data from which the column of ``var`` is extracted.

    Returns
    -------
    ``var`` itself when every value occurs in ``data``; otherwise a new
    ``DiscreteVariable`` with only the used values, whose ``compute_value``
    is a ``Lookup`` mapping old value indices to new ones (unused values
    map to NaN).
    """
    column_data = Table.from_table(Domain([var]), data)
    unique = nanunique(column_data.X).astype(int)
    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # Old index -> new index; slots of values that never occur stay NaN.
    # (``np.NaN`` was removed in NumPy 2.0; ``np.nan`` is the same object.)
    translation_table = np.array([np.nan] * len(var.values))
    translation_table[unique] = range(len(used_values))

    base_value = -1
    # The original test read ``0 >= var.base_value < ...``, which let -1
    # through (indexing the *last* table entry) and rejected every positive
    # base value. A valid base value is an index into ``var.values``.
    if 0 <= var.base_value < len(var.values):
        base = translation_table[var.base_value]
        # Keep the base value only if it survived the reduction.
        if np.isfinite(base):
            base_value = int(base)

    return DiscreteVariable(
        "{}".format(var.name),
        values=used_values,
        base_value=base_value,
        compute_value=Lookup(var, translation_table),
        sparse=var.sparse,
    )
def test_nanunique_ignores_nans_in_counts(self, array):
    """Counts returned by ``nanunique`` must not include the NaN cells."""
    # pylint: disable=bad-whitespace
    nan = np.nan
    data = array([[-1., 1., 0., 2., 3., nan],
                  [ 0., 0., 0., 3., 5., nan],
                  [-1., 0., 0., 1., 7., 6.]])
    counts = nanunique(data, return_counts=True)[1]
    # Occurrences of -1, 0, 1, 2, 3, 5, 6, 7 respectively.
    np.testing.assert_equal(counts, [2, 6, 2, 1, 2, 1, 1, 1])
def test_nanunique_ignores_nans_in_values(self, array):
    """Values returned by ``nanunique`` must not include NaN."""
    # pylint: disable=bad-whitespace
    nan = np.nan
    data = array([[-1., 1., 0., 2., 3., nan],
                  [ 0., 0., 0., 3., 5., nan],
                  [-1., 0., 0., 1., 7., 6.]])
    values = nanunique(data, return_counts=False)
    np.testing.assert_equal(values, [-1, 0, 1, 2, 3, 5, 6, 7])
def commit(self):
    """Reset warnings and the table view, build the pivot if there is
    none yet, and send the pivot, filtered and grouped outputs.

    When the pivot cannot be built (no primitive variables, no column
    feature, or too many distinct column values) the matching warning is
    raised and ``None`` is sent on all three outputs.
    """
    def _emit(pivot_table, filtered_data, grouped_data):
        # Non-empty outputs inherit the name of the input data.
        if self.data:
            for output in (grouped_data, pivot_table, filtered_data):
                if output:
                    output.name = self.data.name
        self.Outputs.grouped_data.send(grouped_data)
        self.Outputs.pivot_table.send(pivot_table)
        self.Outputs.filtered_data.send(filtered_data)

    self.Warning.renamed_vars.clear()
    self.Warning.too_many_values.clear()
    self.Warning.cannot_aggregate.clear()
    self.Warning.no_col_feature.clear()

    self.table_view.clear()

    if self.pivot is None:
        if self.data and not self.data_has_primitives:
            self.Warning.no_variables()
            _emit(None, None, None)
            return

        if self.no_col_feature:
            self.Warning.no_col_feature()
            _emit(None, None, None)
            return

        if self.data:
            # The row feature doubles as the column feature when none is set.
            column_var = self.col_feature or self.row_feature
            column = self.data.get_column_view(column_var)[0].astype(float)
            distinct = nanunique(column)
            if len(distinct) >= self.MAX_VALUES:
                self.table_view.clear()
                self.Warning.too_many_values()
                _emit(None, None, None)
                return

        self.pivot = Pivot(
            self.data,
            self.sel_agg_functions,
            self.row_feature,
            self.col_feature,
            self.val_feature,
        )

    if self.skipped_aggs:
        self.Warning.cannot_aggregate(self.skipped_aggs)
    self._update_graph()

    # Evaluated in the same order as the original call's arguments.
    pivot_table = self.pivot.pivot_table
    filtered = self.get_filtered_data()
    grouped = self.pivot.group_table
    _emit(pivot_table, filtered, grouped)

    if self.pivot.renamed:
        self.Warning.renamed_vars(self.pivot.renamed)
def remove_unused_values(var, data):
    """Return ``var`` reduced to the values that actually occur in ``data``.

    Parameters
    ----------
    var : discrete variable
        Variable providing ``values``, ``name`` and ``sparse``.
    data : Table
        Data whose column for ``var`` is inspected via ``get_column_view``.

    Returns
    -------
    ``var`` itself when every value occurs in ``data``; otherwise a new
    ``DiscreteVariable`` with only the used values, whose ``compute_value``
    is a ``Lookup`` mapping old value indices to new ones (unused values
    map to NaN).
    """
    column = data.get_column_view(var)[0].astype(float)
    unique = nanunique(column).astype(int)
    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # Old index -> new index; slots of values that never occur stay NaN.
    # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` is the supported name.
    translation_table = np.array([np.nan] * len(var.values))
    translation_table[unique] = range(len(used_values))
    return DiscreteVariable(var.name, values=used_values, sparse=var.sparse,
                            compute_value=Lookup(var, translation_table))
def remove_unused_values(var, data):
    """Return ``var`` reduced to the values that actually occur in ``data``.

    Parameters
    ----------
    var : discrete variable
        Variable providing ``values``, ``name`` and ``sparse``.
    data : Table
        Data from which the column of ``var`` is extracted.

    Returns
    -------
    ``var`` itself when every value occurs in ``data``; otherwise a new
    ``DiscreteVariable`` with only the used values, whose ``compute_value``
    is a ``Lookup`` mapping old value indices to new ones (unused values
    map to NaN).
    """
    column_data = Table.from_table(Domain([var]), data)
    unique = nanunique(column_data.X).astype(int)
    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # Old index -> new index; slots of values that never occur stay NaN.
    # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` is the supported name.
    translation_table = np.array([np.nan] * len(var.values))
    translation_table[unique] = range(len(used_values))
    return DiscreteVariable("{}".format(var.name), values=used_values,
                            compute_value=Lookup(var, translation_table),
                            sparse=var.sparse)
def __init__(self, data, variable, parent=None, height=200, width=300,
             side_padding=5, top_padding=20, bottom_padding=0, bar_spacing=4,
             border=0, border_color=None, color_attribute=None, n_bins=10):
    """Build a histogram graphics item for one variable of ``data``.

    Parameters
    ----------
    data : table-like
        Must provide ``domain[...]`` and ``get_column_view(...)``.
    variable
        Key used to look the attribute up in ``data.domain``.
    parent : optional
        Passed to the base class constructor.
    height, width : number
        Outer size of the item.
    side_padding, top_padding, bottom_padding : number
        Padding subtracted from the plot area and used as layout margins.
    bar_spacing : number
        Spacing between bars in the horizontal layout.
    border : int or 4-tuple
        A single width applied to all sides, or a tuple unpacked as
        (top, right, bottom, left). A side with width 0 gets no border.
    border_color : optional
        Colour passed to ``QColor``; defaults to ``'#000'``.
    color_attribute : optional
        When given, its column is stored in ``self.y`` (rows where the
        main column is NaN are dropped) and its variable in
        ``self.target_var``.
    n_bins : int
        Upper bound on the number of bins for continuous attributes.
    """
    super().__init__(parent)
    self.height, self.width = height, width
    self.padding = side_padding
    self.bar_spacing = bar_spacing

    self.data = data
    self.attribute = data.domain[variable]

    # Keep the NaN mask so the colour column can be filtered the same way.
    self.x = data.get_column_view(self.attribute)[0].astype(np.float64)
    self.x_nans = np.isnan(self.x)
    self.x = self.x[~self.x_nans]

    if self.attribute.is_discrete:
        self.n_bins = len(self.attribute.values)
    elif self.attribute.is_continuous:
        # If the attribute is continuous but contains fewer values than the
        # bins, it is better to assign each their own bin. We will require
        # at least 2 bins so that the histogram still visually makes sense
        # except if there is only a single value, then we use 3 bins for
        # symmetry
        num_unique = ut.nanunique(self.x).shape[0]
        if num_unique == 1:
            self.n_bins = 3
        else:
            self.n_bins = min(max(2, num_unique), n_bins)

    # Handle target variable index
    self.color_attribute = color_attribute
    if self.color_attribute is not None:
        self.target_var = data.domain[color_attribute]
        self.y = data.get_column_view(color_attribute)[0]
        self.y = self.y[~self.x_nans]
        if not np.issubdtype(self.y.dtype, np.number):
            self.y = self.y.astype(np.float64)
    else:
        self.target_var, self.y = None, None

    # Borders
    self.border_color = border_color if border_color is not None else '#000'
    if isinstance(border, tuple):
        assert len(border) == 4, 'Border tuple must be of size 4.'
        self.border = border
    else:
        self.border = (border, border, border, border)
    top_w, right_w, bottom_w, left_w = self.border

    def _draw_border(point_1, point_2, border_width, parent):
        # One cosmetic line from point_1 to point_2 owned by ``parent``.
        pen = QPen(QColor(self.border_color))
        pen.setCosmetic(True)
        pen.setWidth(border_width)
        line = QGraphicsLineItem(QLineF(point_1, point_2), parent)
        line.setPen(pen)
        return line

    top_left = QPointF(0, 0)
    bottom_left = QPointF(0, self.height)
    top_right = QPointF(self.width, 0)
    bottom_right = QPointF(self.width, self.height)

    self.border_top = (
        _draw_border(top_left, top_right, top_w, self) if top_w else None)
    self.border_bottom = (
        _draw_border(bottom_left, bottom_right, bottom_w, self)
        if bottom_w else None)
    self.border_left = (
        _draw_border(top_left, bottom_left, left_w, self)
        if left_w else None)
    self.border_right = (
        _draw_border(top_right, bottom_right, right_w, self)
        if right_w else None)

    # _plot_`dim` accounts for all the paddings and spacings
    self._plot_height = self.height
    self._plot_height -= top_padding + bottom_padding
    self._plot_height -= top_w / 4 + bottom_w / 4

    self._plot_width = self.width
    self._plot_width -= 2 * side_padding
    # NOTE(review): n bars have n - 1 gaps, yet n - 2 is subtracted here;
    # left unchanged -- confirm whether this is intentional.
    self._plot_width -= (self.n_bins - 2) * bar_spacing
    self._plot_width -= left_w / 4 + right_w / 4

    self.__layout = QGraphicsLinearLayout(Qt.Horizontal, self)
    # setContentsMargins takes (left, top, right, bottom). The original
    # passed the right border width for the left margin and vice versa,
    # which only showed up with asymmetric borders.
    self.__layout.setContentsMargins(
        side_padding + left_w / 2,
        top_padding + top_w / 2,
        side_padding + right_w / 2,
        bottom_padding + bottom_w / 2,
    )
    self.__layout.setSpacing(bar_spacing)

    # If the data contains any non-NaN values, we can draw a histogram
    if self.x.size > 0:
        self.edges, self.distributions = self._histogram()
        self._draw_histogram()
def test_nanunique(self):
    """``nanunique`` on a sparse matrix yields the distinct non-NaN values."""
    sparse_row = csr_matrix(np.array([0, 1, 1, np.nan]))
    expected = np.array([0, 1])
    result = nanunique(sparse_row)
    np.testing.assert_array_equal(result, expected)
def __init__(self, data, variable, parent=None, height=200, width=300,
             side_padding=5, top_padding=20, bar_spacing=4,
             border=0, border_color=None, color_attribute=None, n_bins=10):
    """Build a histogram graphics item for one variable of ``data``.

    Parameters
    ----------
    data : table-like
        Must provide ``domain[...]`` and ``get_column_view(...)``.
    variable
        Key used to look the attribute up in ``data.domain``.
    parent : optional
        Passed to the base class constructor.
    height, width : number
        Outer size of the item.
    side_padding, top_padding : number
        Padding subtracted from the plot area and used as layout margins.
    bar_spacing : number
        Spacing between bars in the horizontal layout.
    border : int or 4-tuple
        A single width applied to all sides, or a tuple unpacked as
        (top, right, bottom, left). A side with width 0 gets no border.
    border_color : optional
        Colour passed to ``QColor``; defaults to ``'#000'``.
    color_attribute : optional
        When given, its column is stored in ``self.y`` (rows where the
        main column is NaN are dropped) and its variable in
        ``self.target_var``.
    n_bins : int
        Upper bound on the number of bins for continuous attributes.
    """
    super().__init__(parent)
    self.height, self.width = height, width
    self.padding = side_padding
    self.bar_spacing = bar_spacing

    self.data = data
    self.attribute = data.domain[variable]

    # Keep the NaN mask so the colour column can be filtered the same way.
    self.x = data.get_column_view(self.attribute)[0].astype(np.float64)
    self.x_nans = np.isnan(self.x)
    self.x = self.x[~self.x_nans]

    if self.attribute.is_discrete:
        self.n_bins = len(self.attribute.values)
    elif self.attribute.is_continuous:
        # If the attribute is continuous but contains fewer values than the
        # bins, it is better to assign each their own bin. We will require
        # at least 2 bins so that the histogram still visually makes sense
        # except if there is only a single value, then we use 3 bins for
        # symmetry
        num_unique = ut.nanunique(self.x).shape[0]
        if num_unique == 1:
            self.n_bins = 3
        else:
            self.n_bins = min(max(2, num_unique), n_bins)

    # Handle target variable index
    self.color_attribute = color_attribute
    if self.color_attribute is not None:
        self.target_var = data.domain[color_attribute]
        self.y = data.get_column_view(color_attribute)[0]
        self.y = self.y[~self.x_nans]
        if not np.issubdtype(self.y.dtype, np.number):
            self.y = self.y.astype(np.float64)
    else:
        self.target_var, self.y = None, None

    # Borders
    self.border_color = border_color if border_color is not None else '#000'
    if isinstance(border, tuple):
        assert len(border) == 4, 'Border tuple must be of size 4.'
        self.border = border
    else:
        self.border = (border, border, border, border)
    top_w, right_w, bottom_w, left_w = self.border

    def _draw_border(point_1, point_2, border_width, parent):
        # One cosmetic line from point_1 to point_2 owned by ``parent``.
        pen = QPen(QColor(self.border_color))
        pen.setCosmetic(True)
        pen.setWidth(border_width)
        line = QGraphicsLineItem(QLineF(point_1, point_2), parent)
        line.setPen(pen)
        return line

    top_left = QPointF(0, 0)
    bottom_left = QPointF(0, self.height)
    top_right = QPointF(self.width, 0)
    bottom_right = QPointF(self.width, self.height)

    self.border_top = (
        _draw_border(top_left, top_right, top_w, self) if top_w else None)
    self.border_bottom = (
        _draw_border(bottom_left, bottom_right, bottom_w, self)
        if bottom_w else None)
    self.border_left = (
        _draw_border(top_left, bottom_left, left_w, self)
        if left_w else None)
    self.border_right = (
        _draw_border(top_right, bottom_right, right_w, self)
        if right_w else None)

    # _plot_`dim` accounts for all the paddings and spacings
    self._plot_height = self.height
    self._plot_height -= top_padding
    self._plot_height -= top_w / 4 + bottom_w / 4

    self._plot_width = self.width
    self._plot_width -= 2 * side_padding
    # NOTE(review): n bars have n - 1 gaps, yet n - 2 is subtracted here;
    # left unchanged -- confirm whether this is intentional.
    self._plot_width -= (self.n_bins - 2) * bar_spacing
    self._plot_width -= left_w / 4 + right_w / 4

    self.__layout = QGraphicsLinearLayout(Qt.Horizontal, self)
    # setContentsMargins takes (left, top, right, bottom). The original
    # passed the right border width for the left margin and vice versa,
    # which only showed up with asymmetric borders.
    self.__layout.setContentsMargins(
        side_padding + left_w / 2,
        top_padding + top_w / 2,
        side_padding + right_w / 2,
        bottom_w / 2,
    )
    self.__layout.setSpacing(bar_spacing)

    # If the data contains any non-NaN values, we can draw a histogram
    if self.x.size > 0:
        self.edges, self.distributions = self._histogram()
        self._draw_histogram()
def test_nanunique(self):
    """``nanunique`` on a sparse matrix yields the distinct non-NaN values."""
    sparse_row = csr_matrix(np.array([0, 1, 1, np.nan]))
    expected = np.array([0, 1])
    result = nanunique(sparse_row)
    np.testing.assert_array_equal(result, expected)
def remove_unused_values(var, data):
    """Return ``var`` reduced to the values that actually occur in ``data``.

    The column of ``var`` is read through ``data.get_column_view``; its
    distinct non-NaN entries are taken as indices into ``var.values``.
    If all values are in use, ``var`` is returned unchanged; otherwise a
    new ``DiscreteVariable`` with the same name and sparsity, listing
    only the used values, is returned.
    """
    column = data.get_column_view(var)[0]
    present = nanunique(column.astype(float)).astype(int)
    if len(var.values) == len(present):
        return var
    kept = [var.values[index] for index in present]
    return DiscreteVariable(var.name, values=kept, sparse=var.sparse)