def test_get_distribution(self):
    """Check distributions of the iris class and of ``petal_length``."""
    iris = data.Table("iris")
    class_var = iris.domain.class_var

    # Discrete class variable: an ndarray of per-value counts that remembers
    # its variable and reports no unknown values.
    class_dist = distribution.get_distribution(iris, class_var)
    self.assertIsInstance(class_dist, np.ndarray)
    self.assertIs(class_dist.variable, class_var)
    self.assertEqual(class_dist.unknowns, 0)
    np.testing.assert_array_equal(class_dist, [50, 50, 50])

    # Continuous attribute: the expected array is built from
    # (value, frequency) pairs and transposed into two rows.
    petal_length = iris.columns.petal_length
    expected = np.array(
        [
            (1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12),
            (1.5, 14), (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1),
            (3.3, 2), (3.5, 2), (3.6, 1), (3.7, 1), (3.8, 1),
            (3.9, 3), (4.0, 5), (4.1, 3), (4.2, 4), (4.3, 2),
            (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5), (4.8, 4),
            (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
            (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3),
            (5.9, 2), (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1),
            (6.6, 1), (6.7, 2), (6.9, 1),
        ]
    ).T
    petal_dist = distribution.get_distribution(iris, petal_length)
    np.testing.assert_almost_equal(petal_dist, expected)
def column_imputer_random(variable, data):
    """Build a random-value imputer model for a single variable.

    The variable's distribution is computed from ``data`` and wrapped in a
    ``RandomTransform``; the returned ``RandomImputerModel`` maps the variable
    onto itself through that transformer.

    :param variable: variable exposing ``is_discrete`` / ``is_continuous``
    :param data: data from which the distribution is computed
    :raises TypeError: if the variable is neither discrete nor continuous
        (previously this surfaced as an ``UnboundLocalError``)
    """
    # Both supported kinds are handled identically, so check once up front
    # instead of duplicating the branch bodies.
    if not (variable.is_discrete or variable.is_continuous):
        raise TypeError(
            "Cannot impute random values for {!r}: "
            "expected a discrete or continuous variable".format(variable))
    dist = distribution.get_distribution(data, variable)
    transformer = RandomTransform(variable, dist)
    return RandomImputerModel((variable,), (variable,), (transformer,))
def column_imputer_random(variable, data):
    """Build a random-value imputer model for a single variable.

    The variable's distribution is computed from ``data`` and wrapped in a
    ``RandomTransform``; the returned ``RandomImputerModel`` maps the variable
    onto itself through that transformer.

    :param variable: an ``Orange.data.DiscreteVariable`` or
        ``Orange.data.ContinuousVariable``
    :param data: data from which the distribution is computed
    :raises TypeError: if the variable is of any other type (previously this
        surfaced as an ``UnboundLocalError``)
    """
    # Both supported types are handled identically, so check once up front
    # instead of duplicating the branch bodies.
    supported = (Orange.data.DiscreteVariable, Orange.data.ContinuousVariable)
    if not isinstance(variable, supported):
        raise TypeError(
            "Cannot impute random values for {!r}: "
            "expected a discrete or continuous variable".format(variable))
    dist = distribution.get_distribution(data, variable)
    transformer = RandomTransform(variable, dist)
    return RandomImputerModel((variable,), (variable,), (transformer,))
def compute_box_data(self):
    """Recompute the box-plot data for the current attribute and grouping.

    Stores the results on the instance: ``self.stats``, ``self.dist``,
    ``self.conts``, ``self.label_txts_all`` and ``self.label_txts``.
    Returns early, without touching anything, when no attribute is set.
    """
    attr = self.attribute
    if not attr:
        return
    dataset = self.dataset
    self.is_continuous = attr.is_continuous
    # Nothing to compute without data, for a discrete attribute without
    # values, or for a grouping variable without values.
    if dataset is None or not self.is_continuous and not attr.values or \
            self.group_var and not self.group_var.values:
        self.stats = self.dist = self.conts = []
        return
    if self.group_var:
        # Grouped: one contingency row (and, if continuous, one BoxData)
        # per group value.
        self.dist = []
        self.conts = contingency.get_contingency(
            dataset, attr, self.group_var)
        if self.is_continuous:
            self.stats = [BoxData(cont, attr, i, self.group_var)
                          for i, cont in enumerate(self.conts)]
        self.label_txts_all = self.group_var.values
    else:
        # Ungrouped: a single distribution with an empty label.
        self.dist = distribution.get_distribution(dataset, attr)
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist, attr, None)]
        self.label_txts_all = [""]
    # NOTE(review): for a discrete attribute self.stats is not assigned above,
    # so the lines below read whatever value it held before this call —
    # confirm that this is intended.
    # Keep only labels/stats of groups that contain at least one instance.
    self.label_txts = [txts for stat, txts in zip(self.stats,
                                                  self.label_txts_all)
                       if stat.n > 0]
    self.stats = [stat for stat in self.stats if stat.n > 0]
def _setup(self):
    """Rebuild the plot for the currently selected variable and group.

    Clears both plots and the legend, resolves ``self.var`` / ``self.cvar``
    from the selection indices, optionally discretizes the data, and then
    displays either a contingency (group variable selected) or a plain
    distribution.
    """
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    varidx = self.variable_idx
    self.var = self.cvar = None
    if varidx >= 0:
        self.var = self.varmodel[varidx]
    # Index 0 of the group model does not select a group variable
    # (presumably a "(None)" entry — confirm against the model setup).
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]
        # Repopulate the probability combo with the group values and clamp
        # the stored selection to the new number of items.
        self.cb_prob.clear()
        self.cb_prob.addItem("(None)")
        self.cb_prob.addItems(self.cvar.values)
        self.cb_prob.addItem("(All)")
        self.show_prob = min(max(self.show_prob, 0),
                             len(self.cvar.values) + 1)
    data = self.data
    self._setup_smoothing()
    if self.var is None:
        return
    if self.disc_cont:
        # Discretize into equal-width bins; the displayed variable becomes
        # the first variable of the discretized domain.
        data = self.data[:, (self.var, self.cvar) if self.cvar else self.var]
        disc = Orange.preprocess.discretize.EqualWidth(
            n=self.bins[self.smoothing_index])
        data = Orange.preprocess.Discretize(
            data, method=disc, remove_const=False)
        self.var = data.domain[0]
    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    if self.cvar:
        self.contingencies = contingency.get_contingency(
            data, self.var, self.cvar)
        self.display_contingency()
    else:
        self.distributions = distribution.get_distribution(data, self.var)
        self.display_distribution()
    self.plot.autoRange()
def _setup(self):
    """Rebuild the plot for the currently selected variable and group.

    Clears both plots and the legend, resolves ``self.var`` / ``self.cvar``
    from the selection indices, optionally discretizes the data, and then
    displays either a contingency (group variable selected) or a plain
    distribution.
    """
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    varidx = self.variable_idx
    self.var = self.cvar = None
    if varidx >= 0:
        self.var = self.varmodel[varidx]
    # Index 0 of the group model does not select a group variable
    # (presumably a "(None)" entry — confirm against the model setup).
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]
    data = self.data
    self._setup_smoothing()
    if self.var is None:
        return
    if self.disc_cont:
        # Discretize into equal-width bins; the displayed variable becomes
        # the first variable of the discretized domain.
        data = self.data[:, (self.var, self.cvar) if self.cvar else self.var ]
        disc = Orange.preprocess.discretize.EqualWidth(
            n=self.bins[self.smoothing_index])
        data = Orange.preprocess.Discretize(data, method=disc)
        self.var = data.domain.variables[0]
    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    if self.cvar:
        self.contingencies = \
            contingency.get_contingency(data, self.var, self.cvar)
        self.display_contingency()
    else:
        self.distributions = \
            distribution.get_distribution(data, self.var)
        self.display_distribution()
    self.plot.autoRange()
def fit_storage(self, data):
    """Fit a constant classifier from the class distribution of ``data``.

    The distribution is normalized in place to sum to one; when it sums to
    zero (or less) every entry is set to the uniform value instead.
    """
    class_dist = distribution.get_distribution(data, data.domain.class_var)
    total = class_dist.sum()
    if not total > 0:
        # No usable counts: fall back to a uniform distribution.
        class_dist.fill(1 / len(class_dist))
    else:
        class_dist /= total
    return ConstantClassifier(dist=class_dist)
def compute_box_data(self):
    """Return ``(data, labels)`` for the box plot.

    With a split variable the data is the contingency of the attribute by the
    split variable and the labels are the split variable's values; otherwise
    the data is a one-element list holding the attribute's distribution and
    the only label is an empty string.
    """
    if not self.split_var:
        overall = distribution.get_distribution(self.dataset, self.attribute)
        return [overall], [""]
    by_split = contingency.get_contingency(
        self.dataset, self.attribute, self.split_var)
    return by_split, self.split_var.values
def fit_storage(self, dat):
    """Fit a constant classifier from the class distribution of ``dat``.

    :raises ValueError: if the class variable is not discrete
    """
    class_var = dat.domain.class_var
    if not isinstance(class_var, data.DiscreteVariable):
        raise ValueError("classification.MajorityFitter expects a domain with a "
                         "(single) discrete variable")
    class_dist = distribution.get_distribution(dat, class_var)
    total = class_dist.sum()
    if not total > 0:
        # No usable counts: fall back to a uniform distribution.
        class_dist.fill(1 / len(class_dist))
    else:
        class_dist /= total
    return ConstantClassifier(dist=class_dist)
def fit_storage(self, dat):
    """Fit a constant model from the class distribution of ``dat``.

    :raises ValueError: if the domain has no discrete class
    """
    domain = dat.domain
    if not domain.has_discrete_class:
        raise ValueError("classification.MajorityLearner expects a domain with a "
                         "(single) discrete variable")
    class_dist = distribution.get_distribution(dat, domain.class_var)
    total = class_dist.sum()
    if not total > 0:
        # No usable counts: fall back to a uniform distribution.
        class_dist.fill(1 / len(class_dist))
    else:
        class_dist /= total
    return ConstantModel(dist=class_dist)
def __call__(self, data, attribute):
    """Discretize ``attribute`` into ``self.n`` equal-frequency intervals.

    For an ``SqlTable`` the cut points come from a quantile query; otherwise
    they are computed from the attribute's distribution.
    """
    if type(data) == SqlTable:
        column = attribute.to_sql()
        fractions = [(k + 1) / self.n for k in range(self.n - 1)]
        sql = data._sql_query(
            ['quantile(%s, ARRAY%s)' % (column, str(fractions))])
        with data._execute_sql_query(sql) as cursor:
            # Duplicated quantiles collapse into a single cut point.
            points = sorted(set(cursor.fetchone()[0]))
    else:
        dist = distribution.get_distribution(data, attribute)
        points = _discretize.split_eq_freq(dist, n=self.n)
    target = data.domain[attribute]
    return Discretizer.create_discretized_var(target, points)
def _ensure_dist(var, data_or_dist):
    """Return a distribution for ``var``.

    A ready-made distribution is returned unchanged after checking that its
    kind matches the variable; a data storage is turned into a distribution.

    :raises TypeError: if the distribution kind does not match ``var``
    :raises ValueError: if the argument is neither a distribution nor data
    """
    if isinstance(data_or_dist, distribution.Discrete):
        if is_discrete(var):
            return data_or_dist
        raise TypeError
    if isinstance(data_or_dist, distribution.Continuous):
        if is_continuous(var):
            return data_or_dist
        raise TypeError
    if isinstance(data_or_dist, Orange.data.Storage):
        return distribution.get_distribution(data_or_dist, var)
    raise ValueError("Need a distribution or data.")
def __call__(self, data, attribute):
    """Discretize ``attribute`` into ``self.n`` equal-frequency intervals.

    For an ``SqlTable`` the cut points come from a quantile query; otherwise
    they are computed from the attribute's distribution.
    """
    if type(data) == SqlTable:
        column = attribute.to_sql()
        fractions = [(k + 1) / self.n for k in range(self.n - 1)]
        sql = data._sql_query(
            ['quantile(%s, ARRAY%s)' % (column, str(fractions))])
        with data._execute_sql_query(sql) as cursor:
            # Duplicated quantiles collapse into a single cut point.
            points = sorted(set(cursor.fetchone()[0]))
    else:
        dist = distribution.get_distribution(data, attribute)
        points = _discretize.split_eq_freq(dist, self.n)
    target = data.domain[attribute]
    return Discretizer.create_discretized_var(target, points)
def __call__(self, data, variable):
    """Return a copy of ``variable`` that replaces unknowns by a default.

    The default is the mean for continuous variables and the modus of the
    distribution for discrete ones.

    :raises TypeError: if the variable is neither continuous nor discrete
    """
    if is_continuous(variable):
        default = basic_stats.BasicStats(data, variable).mean
    elif is_discrete(variable):
        default = distribution.get_distribution(data, variable).modus()
    else:
        raise TypeError

    imputed = copy.copy(variable)
    imputed.compute_value = ReplaceUnknowns(variable, default)
    return imputed
def _ensure_dist(var, data_or_dist):
    """Return a distribution for ``var``.

    A ready-made distribution is returned unchanged after checking that its
    kind matches the variable; a data storage is turned into a distribution.

    :raises TypeError: if the distribution kind does not match ``var``
    :raises ValueError: if the argument is neither a distribution nor data
    """
    if isinstance(data_or_dist, distribution.Discrete):
        if var.is_discrete:
            return data_or_dist
        raise TypeError
    if isinstance(data_or_dist, distribution.Continuous):
        if var.is_continuous:
            return data_or_dist
        raise TypeError
    if isinstance(data_or_dist, Orange.data.Storage):
        return distribution.get_distribution(data_or_dist, var)
    raise ValueError("Need a distribution or data.")
def fit_storage(self, dat):
    """Fit a constant model from the class distribution of ``dat``.

    :raises ValueError: if the class variable is not discrete
    """
    class_var = dat.domain.class_var
    if not isinstance(class_var, data.DiscreteVariable):
        raise ValueError(
            "classification.MajorityLearner expects a domain with a "
            "(single) discrete variable")
    class_dist = distribution.get_distribution(dat, class_var)
    total = class_dist.sum()
    if not total > 0:
        # No usable counts: fall back to a uniform distribution.
        class_dist.fill(1 / len(class_dist))
    else:
        class_dist /= total
    return ConstantModel(dist=class_dist)
def __call__(self, data, variable, value=None):
    """Return a copy of ``variable`` that replaces unknowns by ``value``.

    When ``value`` is ``None`` it defaults to the mean (continuous variable)
    or to the modus of the distribution (discrete variable).

    :raises TypeError: if a default is needed and the variable is neither
        continuous nor discrete
    """
    variable = data.domain[variable]
    if value is None:
        if variable.is_continuous:
            value = basic_stats.BasicStats(data, variable).mean
        elif variable.is_discrete:
            value = distribution.get_distribution(data, variable).modus()
        else:
            raise TypeError("Variable must be continuous or discrete")
    replacer = ReplaceUnknowns(variable, value)
    return variable.copy(compute_value=replacer)
def fit_storage(self, data):
    """
    Build a :obj:`MeanModel` from the distribution of the class variable.

    :param data: data table
    :type data: Orange.data.Table
    :return: regression model, which always returns mean value
    :rtype: :obj:`MeanModel`
    :raises ValueError: if the domain has no continuous class
    """
    domain = data.domain
    if not domain.has_continuous_class:
        raise ValueError("regression.MeanLearner expects a domain with a "
                         "(single) continuous variable")
    class_dist = distribution.get_distribution(data, domain.class_var)
    return MeanModel(class_dist)
def __call__(self, data, variable, value=None):
    """Return a copy of ``variable`` that replaces unknowns by ``value``.

    When ``value`` is ``None`` it defaults to the mean (continuous variable)
    or to the modus of the distribution (discrete variable).

    :raises TypeError: if a default is needed and the variable is neither
        continuous nor discrete
    """
    variable = data.domain[variable]
    if value is None:
        if is_continuous(variable):
            value = basic_stats.BasicStats(data, variable).mean
        elif is_discrete(variable):
            value = distribution.get_distribution(data, variable).modus()
        else:
            raise TypeError("Variable must be continuous or discrete")

    imputed = copy.copy(variable)
    imputed.compute_value = ReplaceUnknowns(variable, value)
    return imputed
def __call__(self, data, variable, value=None):
    """Return a copy of ``variable`` that replaces unknowns by ``value``.

    When ``value`` is ``None`` it defaults to the mean (continuous variable)
    or to the modus of the distribution (discrete variable). The copy also
    gets a ``to_sql`` attribute built from the same variable and value.

    :raises TypeError: if a default is needed and the variable is neither
        continuous nor discrete
    """
    variable = data.domain[variable]
    if value is None:
        if variable.is_continuous:
            value = basic_stats.BasicStats(data, variable).mean
        elif variable.is_discrete:
            value = distribution.get_distribution(data, variable).modus()
        else:
            raise TypeError("Variable must be numeric or categorical.")

    imputed = variable.copy(compute_value=ReplaceUnknowns(variable, value))
    imputed.to_sql = ImputeSql(variable, value)
    return imputed
def __call__(self, data, attribute):
    """Discretize ``attribute`` into ``self.n`` equal-frequency intervals.

    For an ``SqlTable`` the cut points come from a quantile query (run with
    ``use_time_sample=1000``); otherwise they are computed from the
    attribute's distribution and de-duplicated.
    """
    if type(data) == SqlTable:
        column = attribute.to_sql()
        fractions = [(k + 1) / self.n for k in range(self.n - 1)]
        sql = data._sql_query(
            ['quantile(%s, ARRAY%s)' % (column, str(fractions))],
            use_time_sample=1000)
        with data._execute_sql_query(sql) as cursor:
            # Duplicated quantiles collapse into a single cut point.
            points = sorted(set(cursor.fetchone()[0]))
    else:
        dist = distribution.get_distribution(data, attribute)
        raw_points = _discretize.split_eq_freq(dist, self.n)
        # np.unique handles cases in which differences are below precision
        points = list(np.unique(raw_points))
    target = data.domain[attribute]
    return Discretizer.create_discretized_var(target, points)
def transform(var):
    """Return a copy of ``var`` normalized by the configured center/scale.

    Uses ``data`` and ``self`` from the enclosing scope. Centering shifts the
    first row of the distribution in place before the scale is computed; a
    scale below ``1e-15`` is treated as 1 to avoid dividing by (almost) zero.
    """
    dist = distribution.get_distribution(data, var)

    offset = 0
    if self.center:
        offset = self.center(dist)
        dist[0, :] -= offset

    spread = 1
    if self.scale:
        spread = self.scale(dist)
        if spread < 1e-15:
            spread = 1

    normalizer = transformation.Normalizer(var, offset, 1 / spread)
    return var.copy(compute_value=normalizer)
def compute_box_data(self):
    """Recompute the box-plot data for the current attribute and grouping.

    Stores the results on the instance: ``self.stats``, ``self.dist``,
    ``self.conts``, ``self.label_txts_all`` and ``self.label_txts``.
    Returns early, without touching anything, when no attribute is set.
    """
    attr = self.attribute
    if not attr:
        return
    dataset = self.dataset
    # Nothing to compute without data, for a discrete attribute without
    # values, or for a grouping variable without values.
    if dataset is None \
            or not attr.is_continuous and not attr.values \
            or self.group_var and not self.group_var.values:
        self.stats = []
        self.dist = self.conts = None
        return
    if self.group_var:
        self.dist = None
        missing_val_str = f"missing '{self.group_var.name}'"
        # The extra empty label pairs with the trailing group of rows whose
        # group value is missing; it is rendered as missing_val_str.
        group_var_labels = self.group_var.values + ("", )
        if self.attribute.is_continuous:
            stats, label_texts = [], []
            attr_col = dataset.get_column_view(attr)[0].astype(float)
            for group, value in \
                    zip(self._group_cols(dataset, self.group_var, attr_col),
                        group_var_labels):
                # Skip groups with no values at all.
                if group.size:
                    stats.append(BoxData(group, value))
                    label_texts.append(value or missing_val_str)
            self.stats = stats
            self.label_txts_all = label_texts
        else:
            self.conts = contingency.get_contingency(
                dataset, attr, self.group_var)
            self.label_txts_all = [
                v or missing_val_str
                for v, c in zip(
                    group_var_labels, self.conts.array_with_unknowns)
                if np.sum(c) > 0
            ]
    else:
        self.conts = None
        if self.attribute.is_continuous:
            attr_col = dataset.get_column_view(attr)[0].astype(float)
            self.stats = [BoxData(attr_col)]
        else:
            self.dist = distribution.get_distribution(dataset, attr)
        self.label_txts_all = [""]
    # NOTE(review): for a discrete attribute self.stats is not assigned above,
    # so the lines below read whatever value it held before this call —
    # confirm that this is intended.
    self.label_txts = [
        txts for stat, txts in zip(self.stats, self.label_txts_all)
        if stat.n > 0
    ]
    self.stats = [stat for stat in self.stats if stat.n > 0]
def __call__(self, data, variable):
    """Return a copy of ``variable`` imputing unknowns at random.

    The replacement values are drawn according to the variable's
    distribution in ``data``; an all-zero distribution is made uniform.

    :raises ValueError: if the distribution is empty
    """
    variable = data.domain[variable]
    dist = distribution.get_distribution(data, variable)

    # A distribution is invalid if a continuous variable's column does not
    # contain any known values or if a discrete variable's .values == []
    if dist.size == 0:
        if variable.is_discrete:
            assert len(variable.values) == 0
            raise ValueError("'{}' has no values".format(variable))
        if variable.is_continuous:
            raise ValueError("'{}' has an unknown distribution".format(variable))

    # Without any observed frequencies, spread the weight evenly (in place).
    if variable.is_discrete and numpy.sum(dist) == 0:
        dist += 1 / len(dist)
    elif variable.is_continuous and numpy.sum(dist[1, :]) == 0:
        dist[1, :] += 1 / dist.shape[1]

    sampler = ReplaceUnknownsRandom(variable, dist)
    return variable.copy(compute_value=sampler)
def _disc_plot(self):
    """Draw one bar per value of the discrete variable ``self.var``.

    Bar heights are the frequencies from the variable's distribution in
    ``self.data``; each bar gets an HTML tooltip with the value, its count
    and its percentage relative to ``len(self.valid_data)``.
    """
    var = self.var
    # Label the bottom-axis ticks with the variable's values, in order.
    self.ploti.getAxis("bottom").setTicks([list(enumerate(var.values))])
    colors = [QColor(0, 128, 255)]
    dist = distribution.get_distribution(self.data, self.var)
    for i, freq in enumerate(dist):
        tooltip = \
            "<p style='white-space:pre;'>" \
            f"<b>{escape(var.values[i])}</b>: {int(freq)} " \
            f"({100 * freq / len(self.valid_data):.2f} %) "
        # Bars are positioned so that bar i is centered on tick i
        # (x starts at i - 0.5 with width 1).
        self._add_bar(
            i - 0.5, 1, 0.1, [freq], colors,
            stacked=False, expanded=False, tooltip=tooltip)
def fit_storage(self, dat):
    """Fit a constant model from the class distribution of ``dat``.

    When several classes tie for the highest probability, one of them is
    picked deterministically from a SHA-1 hash of ``dat.Y`` and passed on as
    ``unif_maj``; otherwise ``unif_maj`` is ``None``.

    :raises ValueError: if the domain has no discrete class
    """
    domain = dat.domain
    if not domain.has_discrete_class:
        raise ValueError("classification.MajorityLearner expects a domain "
                         "with a (single) discrete variable")

    class_dist = distribution.get_distribution(dat, domain.class_var)
    total = class_dist.sum()
    if not total > 0:
        # No usable counts: fall back to a uniform distribution.
        class_dist.fill(1 / len(class_dist))
    else:
        class_dist /= total

    probabilities = np.array(class_dist)
    tied = np.flatnonzero(probabilities == probabilities.max())
    unif_maj = None
    if len(tied) > 1:
        # Hash the class column so the same data always picks the same class.
        digest = int(sha1(bytes(dat.Y)).hexdigest(), 16)
        unif_maj = tied[digest % len(tied)]
    return ConstantModel(dist=class_dist, unif_maj=unif_maj)
def transform(var):
    """Return a copy of ``var`` normalized by the configured center/scale.

    Uses ``data`` and ``self`` from the enclosing scope. Centering shifts the
    first row of the distribution in place before the scale is computed; a
    scale below ``1e-15`` is treated as 1 to avoid dividing by (almost) zero.
    When an actual rescaling happens the copy shows three decimals.
    """
    dist = distribution.get_distribution(data, var)

    offset = 0
    if self.center != self.NoCentering:
        offset = self.center(dist)
        dist[0, :] -= offset

    spread = 1
    if self.scale != self.NoScaling:
        spread = self.scale(dist)
        if spread < 1e-15:
            spread = 1

    normalizer = transformation.Normalizer(var, offset, 1 / spread)
    normalized = var.copy(compute_value=normalizer)
    if spread != 1:
        normalized.number_of_decimals = 3
    return normalized
def fit_storage(self, dat):
    """Fit a constant model from the class distribution of ``dat``.

    When several classes tie for the highest probability, one of them is
    picked deterministically from a SHA-1 hash of ``dat.Y`` and passed on as
    ``unif_maj``; otherwise ``unif_maj`` is ``None``.

    :raises ValueError: if the domain has no discrete class
    """
    domain = dat.domain
    if not domain.has_discrete_class:
        raise ValueError("classification.MajorityLearner expects a domain "
                         "with a (single) discrete variable")

    class_dist = distribution.get_distribution(dat, domain.class_var)
    total = class_dist.sum()
    if not total > 0:
        # No usable counts: fall back to a uniform distribution.
        class_dist.fill(1 / len(class_dist))
    else:
        class_dist /= total

    probabilities = np.array(class_dist)
    tied = np.flatnonzero(probabilities == probabilities.max())
    unif_maj = None
    if len(tied) > 1:
        # Hash the class column so the same data always picks the same class.
        digest = int(sha1(bytes(dat.Y)).hexdigest(), 16)
        unif_maj = tied[digest % len(tied)]
    return ConstantModel(dist=class_dist, unif_maj=unif_maj)
def compute_box_data(self):
    """Recompute the box-plot data for the current attribute and grouping.

    Stores the results on the instance: ``self.stats``, ``self.dist``,
    ``self.conts``, ``self.label_txts_all`` and ``self.label_txts``.
    Returns early, without touching anything, when no attribute is set.
    """
    attr = self.attribute
    if not attr:
        return
    dataset = self.dataset
    # Nothing to compute without data, for a discrete attribute without
    # values, or for a grouping variable without values.
    if dataset is None \
            or not attr.is_continuous and not attr.values \
            or self.group_var and not self.group_var.values:
        self.stats = []
        self.dist = self.conts = None
        return
    if self.group_var:
        self.dist = None
        self.conts = contingency.get_contingency(dataset, attr, self.group_var)
        missing_val_str = f"missing '{self.group_var.name}'"
        # The extra empty label pairs with the last row of
        # array_with_unknowns; it is rendered as missing_val_str.
        group_var_labels = self.group_var.values + ("", )
        if self.attribute.is_continuous:
            stats, label_texts = [], []
            for cont, value in zip(self.conts.array_with_unknowns,
                                   group_var_labels):
                # Skip groups whose frequencies (row 1) sum to zero.
                if np.sum(cont[1]):
                    stats.append(BoxData(cont, value))
                    label_texts.append(value or missing_val_str)
            self.stats = stats
            self.label_txts_all = label_texts
        else:
            self.label_txts_all = [
                v or missing_val_str
                for v, c in zip(
                    group_var_labels, self.conts.array_with_unknowns)
                if np.sum(c) > 0
            ]
    else:
        self.dist = distribution.get_distribution(dataset, attr)
        self.conts = None
        if self.attribute.is_continuous:
            self.stats = [BoxData(self.dist, None)]
        self.label_txts_all = [""]
    # NOTE(review): for a discrete attribute self.stats is not assigned above,
    # so the lines below read whatever value it held before this call —
    # confirm that this is intended.
    self.label_txts = [
        txts for stat, txts in zip(self.stats, self.label_txts_all)
        if stat.n > 0
    ]
    self.stats = [stat for stat in self.stats if stat.n > 0]
def __call__(self, data, variable):
    """Return a copy of ``variable`` imputing unknowns at random.

    The replacement values are drawn according to the variable's
    distribution in ``data``; an all-zero distribution is made uniform.

    :raises ValueError: if the distribution is empty
    """
    variable = data.domain[variable]
    dist = distribution.get_distribution(data, variable)

    # A distribution is invalid if a continuous variable's column does not
    # contain any known values or if a discrete variable's .values == []
    if dist.size == 0:
        if variable.is_discrete:
            assert len(variable.values) == 0
            raise ValueError("'{}' has no values".format(variable))
        if variable.is_continuous:
            raise ValueError(
                "'{}' has an unknown distribution".format(variable))

    # Without any observed frequencies, spread the weight evenly (in place).
    if variable.is_discrete and np.sum(dist) == 0:
        dist += 1 / len(dist)
    elif variable.is_continuous and np.sum(dist[1, :]) == 0:
        dist[1, :] += 1 / dist.shape[1]

    sampler = ReplaceUnknownsRandom(variable, dist)
    return variable.copy(compute_value=sampler)
def compute_box_data(self):
    """Recompute the box-plot data for the current attribute and grouping.

    Stores the results on the instance: ``self.is_continuous``,
    ``self.stats``, ``self.dist``, ``self.conts``, ``self.label_txts_all``
    and ``self.label_txts``. Returns early, without touching anything, when
    no attribute is set.
    """
    attr = self.attribute
    if not attr:
        return
    dataset = self.dataset
    self.is_continuous = attr.is_continuous
    # Nothing to compute without data, for a discrete attribute without
    # values, or for a grouping variable without values.
    if dataset is None or not self.is_continuous and not attr.values or \
            self.group_var and not self.group_var.values:
        self.stats = self.dist = self.conts = []
        return
    if self.group_var:
        self.dist = []
        self.conts = contingency.get_contingency(dataset, attr, self.group_var)
        # The extra label pairs with the last row of array_with_unknowns.
        group_var_labels = self.group_var.values + [
            f"missing '{self.group_var.name}'"
        ]
        if self.is_continuous:
            stats, label_texts = [], []
            for i, cont in enumerate(self.conts.array_with_unknowns):
                # Skip groups whose frequencies (row 1) sum to zero.
                if np.sum(cont[1]):
                    stats.append(BoxData(cont, attr, i, self.group_var))
                    label_texts.append(group_var_labels[i])
            self.stats = stats
            self.label_txts_all = label_texts
        else:
            self.label_txts_all = [
                v for v, c in zip(group_var_labels,
                                  self.conts.array_with_unknowns)
                if np.sum(c) > 0
            ]
    else:
        self.dist = distribution.get_distribution(dataset, attr)
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist, attr, None)]
        self.label_txts_all = [""]
    # NOTE(review): for a discrete attribute self.stats is not assigned above,
    # so the lines below read whatever value it held before this call —
    # confirm that this is intended.
    self.label_txts = [
        txts for stat, txts in zip(self.stats, self.label_txts_all)
        if stat.n > 0
    ]
    self.stats = [stat for stat in self.stats if stat.n > 0]
def _setup(self):
    """Rebuild the plot for the currently selected variable and group.

    Clears both plots and the legend, resolves ``self.var`` / ``self.cvar``
    from the selection indices, optionally discretizes the data, and then
    displays either a contingency (group variable selected) or a plain
    distribution.
    """
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    varidx = self.variable_idx
    self.var = self.cvar = None
    if varidx >= 0:
        self.var = self.varmodel[varidx]
    # Index 0 of the group model does not select a group variable
    # (presumably a "(None)" entry — confirm against the model setup).
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]
        # Repopulate the probability combo with the group values and clamp
        # the stored selection to the new number of items.
        prob = self.controls.show_prob
        prob.clear()
        prob.addItem("(None)")
        prob.addItems(self.cvar.values)
        prob.addItem("(All)")
        self.show_prob = min(max(self.show_prob, 0),
                             len(self.cvar.values) + 1)
    data = self.data
    self._setup_smoothing()
    if self.var is None:
        return
    if self.disc_cont:
        # Restrict the data to the displayed variables, discretize into
        # equal-width bins, and show the first discretized variable.
        domain = Orange.data.Domain(
            [self.var, self.cvar] if self.cvar else [self.var])
        data = Orange.data.Table(domain, data)
        disc = EqualWidth(n=self.bins[self.smoothing_index])
        data = Discretize(method=disc, remove_const=False)(data)
        self.var = data.domain[0]
    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    # The cumulative-distribution control is only enabled for a continuous
    # displayed variable.
    self.controls.cumulative_distr.setDisabled(not self.var.is_continuous)
    if self.cvar:
        self.contingencies = \
            contingency.get_contingency(data, self.var, self.cvar)
        self.display_contingency()
    else:
        self.distributions = \
            distribution.get_distribution(data, self.var)
        self.display_distribution()
    self.plot.autoRange()
def _setup(self):
    """Rebuild the plot for the currently selected variable and group.

    Clears both plots and the legend, resolves ``self.var`` / ``self.cvar``
    from the selection indices, optionally discretizes the data, and then
    displays either a contingency (group variable selected) or a plain
    distribution.
    """
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    varidx = self.variable_idx
    self.var = self.cvar = None
    if varidx >= 0:
        self.var = self.varmodel[varidx]
    # Index 0 of the group model does not select a group variable
    # (presumably a "(None)" entry — confirm against the model setup).
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]
        # Repopulate the probability combo with the group values and clamp
        # the stored selection to the new number of items.
        prob = self.controls.show_prob
        prob.clear()
        prob.addItem("(None)")
        prob.addItems(self.cvar.values)
        prob.addItem("(All)")
        self.show_prob = min(max(self.show_prob, 0),
                             len(self.cvar.values) + 1)
    data = self.data
    self._setup_smoothing()
    if self.var is None:
        return
    if self.disc_cont:
        # Restrict the data to the displayed variables, discretize into
        # equal-width bins, and show the first discretized variable.
        domain = Orange.data.Domain(
            [self.var, self.cvar] if self.cvar else [self.var])
        data = Orange.data.Table(domain, data)
        disc = EqualWidth(n=self.bins[self.smoothing_index])
        data = Discretize(method=disc, remove_const=False)(data)
        self.var = data.domain[0]
    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    # The cumulative-distribution control is only enabled for a continuous
    # displayed variable.
    self.controls.cumulative_distr.setDisabled(not self.var.is_continuous)
    if self.cvar:
        self.contingencies = \
            contingency.get_contingency(data, self.var, self.cvar)
        self.display_contingency()
    else:
        self.distributions = \
            distribution.get_distribution(data, self.var)
        self.display_distribution()
    self.plot.autoRange()
def _setup(self):
    """Rebuild the plot for the currently selected variable and group.

    Clears both plots and the legend, resolves ``self.var`` / ``self.cvar``
    from the selection indices, optionally discretizes the data, and then
    displays either a contingency (group variable selected) or a plain
    distribution.
    """
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    varidx = self.variable_idx
    self.var = self.cvar = None
    if varidx >= 0:
        self.var = self.varmodel[varidx]
    # Index 0 of the group model does not select a group variable
    # (presumably a "(None)" entry — confirm against the model setup).
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]
        # Repopulate the probability combo with the group values and clamp
        # the stored selection to the new number of items.
        self.cb_prob.clear()
        self.cb_prob.addItem("(None)")
        self.cb_prob.addItems(self.cvar.values)
        self.cb_prob.addItem("(All)")
        self.show_prob = min(max(self.show_prob, 0),
                             len(self.cvar.values) + 1)
    data = self.data
    self._setup_smoothing()
    if self.var is None:
        return
    if self.disc_cont:
        # Discretize into equal-width bins; the displayed variable becomes
        # the first variable of the discretized domain.
        data = self.data[:, (self.var, self.cvar) if self.cvar else self.var]
        disc = Orange.preprocess.discretize.EqualWidth(
            n=self.bins[self.smoothing_index])
        data = Orange.preprocess.Discretize(data, method=disc,
                                            remove_const=False)
        self.var = data.domain[0]
    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    if self.cvar:
        self.contingencies = \
            contingency.get_contingency(data, self.var, self.cvar)
        self.display_contingency()
    else:
        self.distributions = \
            distribution.get_distribution(data, self.var)
        self.display_distribution()
    self.plot.autoRange()
def __call__(self, data, attribute, fixed=None):
    """Discretize ``attribute`` into ``self.n`` equal-width intervals.

    The range comes from ``fixed[attribute.name]`` when ``fixed`` is given,
    from a min/max query for an ``SqlTable``, and from the attribute's
    distribution otherwise.
    """
    if fixed:
        low, high = fixed[attribute.name]
        points = self._split_eq_width_fixed(low, high, n=self.n)
    elif type(data) == SqlTable:
        column = attribute.to_sql()
        sql = data._sql_query(['min(%s)::double precision' % column,
                               'max(%s)::double precision' % column])
        with data._execute_sql_query(sql) as cursor:
            low, high = cursor.fetchone()
        step = (high - low) / self.n
        points = [low + (k + 1) * step for k in range(self.n - 1)]
    else:
        # TODO: why is the whole distribution computed instead of
        # just min/max
        dist = distribution.get_distribution(data, attribute)
        points = self._split_eq_width(dist, n=self.n)
    target = data.domain[attribute]
    return Discretizer.create_discretized_var(target, points)
def _setup(self):
    """Rebuild the plot for the currently selected variable and group.

    Resolves ``self.var`` / ``self.cvar`` from the selection indices and then
    displays a contingency when a group variable is selected, or a plain
    distribution otherwise.
    """
    self.plot.clear()

    selected = self.variable_idx
    self.var = self.cvar = None
    if selected >= 0:
        self.var = self.varmodel[selected]
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]

    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    if self.var is None:
        return

    if not self.cvar:
        self.distributions = distribution.get_distribution(
            self.data, self.var)
        self.display_distribution()
        return
    self.contingencies = contingency.get_contingency(
        self.data, self.var, self.cvar)
    self.display_contingency()
def _setup(self):
    """Rebuild the plot for the currently selected variable and group.

    Resolves ``self.var`` / ``self.cvar`` from the selection indices and then
    displays a contingency when a group variable is selected, or a plain
    distribution otherwise.
    """
    self.plot.clear()

    selected = self.variable_idx
    self.var = self.cvar = None
    if selected >= 0:
        self.var = self.varmodel[selected]
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]

    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    if self.var is None:
        return

    if not self.cvar:
        self.distributions = distribution.get_distribution(
            self.data, self.var)
        self.display_distribution()
        return
    self.contingencies = contingency.get_contingency(
        self.data, self.var, self.cvar)
    self.display_contingency()
def transform(var):
    """Return a copy of ``var`` normalized by the configured center/scale.

    Uses ``data`` and ``self`` from the enclosing scope. Centering shifts the
    first row of the distribution in place before the scale is computed; a
    scale below ``1e-15`` is treated as 1 to avoid dividing by (almost) zero.
    When an actual rescaling happens the copy shows three decimals.
    """
    dist = distribution.get_distribution(data, var)

    offset = 0
    if self.center != self.NoCentering:
        offset = self.center(dist)
        dist[0, :] -= offset

    spread = 1
    if self.scale != self.NoScaling:
        spread = self.scale(dist)
        if spread < 1e-15:
            spread = 1

    normalizer = transformation.Normalizer(var, offset, 1 / spread)
    normalized = var.copy(compute_value=normalizer)
    if spread != 1:
        normalized.number_of_decimals = 3
    return normalized
def _disc_plot(self):
    """Draw one bar per value of the discrete variable ``self.var``.

    Bars follow the variable's value order, or descending frequency when
    ``self.sort_by_freq`` is set. Each bar gets an HTML tooltip with the
    value, its count and its percentage relative to ``len(self.valid_data)``.
    """
    var = self.var
    dist = distribution.get_distribution(self.data, self.var)
    dist = np.array(dist)  # Distribution misbehaves in further operations
    if self.sort_by_freq:
        # argsort is ascending; reverse it for most-frequent-first.
        order = np.argsort(dist)[::-1]
    else:
        order = np.arange(len(dist))
    ordered_values = np.array(var.values)[order]
    # Label the bottom-axis ticks with the values in display order.
    self.ploti.getAxis("bottom").setTicks([list(enumerate(ordered_values))])
    colors = [QColor(0, 128, 255)]
    for i, freq, desc in zip(count(), dist[order], ordered_values):
        tooltip = \
            "<p style='white-space:pre;'>" \
            f"<b>{escape(desc)}</b>: {int(freq)} " \
            f"({100 * freq / len(self.valid_data):.2f} %) "
        # Bars are positioned so that bar i is centered on tick i
        # (x starts at i - 0.5 with width 1).
        self._add_bar(
            i - 0.5, 1, 0.1, [freq], colors, stacked=False, expanded=False,
            tooltip=tooltip, desc=desc)
def _setup(self):
    """Clear the plot and show the selected variable's data.

    A contingency is shown when the selected group variable is discrete,
    a plain distribution otherwise; nothing is shown without a variable.
    """
    self.plot.clear()

    selected = self.variable_idx
    var = self.varmodel[selected] if selected >= 0 else None
    cvar = (self.groupvarmodel[self.groupvar_idx]
            if self.groupvar_idx >= 0 else None)
    if var is None:
        return

    if not is_discrete(cvar):
        dist = distribution.get_distribution(self.data, var)
        self.set_distribution(dist, var)
        return
    cont = contingency.get_contingency(self.data, var, cvar)
    self.set_contingency(cont, var, cvar)
def fit_storage(self, dat):
    """
    Build an `Orange.classification.majority.ConstantClassifier` from the
    normalized class distribution of the given data.

    :param dat: table of data
    :type dat: Orange.data.Table
    :return: classification model, which always returns majority value
    :rtype: Orange.classification.majority.ConstantClassifier
    :raises ValueError: if the class variable is not discrete
    """
    class_var = dat.domain.class_var
    if not isinstance(class_var, data.DiscreteVariable):
        raise ValueError(
            "classification.MajorityFitter expects a domain with a "
            "(single) discrete variable")
    class_dist = distribution.get_distribution(dat, class_var)
    total = class_dist.sum()
    if not total > 0:
        # No usable counts: fall back to a uniform distribution.
        class_dist.fill(1 / len(class_dist))
    else:
        class_dist /= total
    return ConstantClassifier(dist=class_dist)
def _setup(self):
    """Clear the plot and show the selected variable's data.

    A contingency is shown when the selected group variable is discrete,
    a plain distribution otherwise; nothing is shown without a variable.
    """
    self.plot.clear()

    selected = self.variable_idx
    var = self.varmodel[selected] if selected >= 0 else None
    cvar = (self.groupvarmodel[self.groupvar_idx]
            if self.groupvar_idx >= 0 else None)
    if var is None:
        return

    if not is_discrete(cvar):
        dist = distribution.get_distribution(self.data, var)
        self.set_distribution(dist, var)
        return
    cont = contingency.get_contingency(self.data, var, cvar)
    self.set_contingency(cont, var, cvar)
def iterate_states(self, state): """ Iterate through all combinations of attributes as ordered by Relief, starting with a single attribute if Mosaic is colored by class distributions, and two if by Pearson. """ # If we put initialization of `self.attrs` to `initialize`, # `score_heuristic` would be run on every call to master's `set_data`. master = self.master data = master.discrete_data min_attrs, max_attrs = self.attr_range() if min_attrs > max_attrs: return if state is None: # on the first call, compute order if self._compute_class_dists(): self.marginal = get_distribution(data, data.domain.class_var) self.marginal.normalize() state = list(range(min_attrs)) else: self.marginal = get_distributions(data) for dist in self.marginal: dist.normalize() state = list(range(min_attrs)) n_attrs = len(data.domain.attributes) while True: yield state # Reset while running; just abort if self.attr_ordering is None: break for up, _ in enumerate(state): state[up] += 1 if up + 1 == len(state) or state[up] < state[up + 1]: break state[up] = up if state[-1] == len(self.attr_ordering): if len(state) < min(max_attrs, n_attrs): state = list(range(len(state) + 1)) else: break
def compute_box_data(self):
    """Recompute the box-plot data for the current attribute and grouping.

    Stores the results on the instance: ``self.is_continuous``,
    ``self.stats``, ``self.dist``, ``self.conts``, ``self.label_txts_all``
    and ``self.label_txts``. Returns early, without touching anything, when
    no attribute is set.
    """
    attr = self.attribute
    if not attr:
        return
    dataset = self.dataset
    self.is_continuous = attr.is_continuous
    # Nothing to compute without data, for a discrete attribute without
    # values, or for a grouping variable without values.
    if dataset is None or not self.is_continuous and not attr.values or \
            self.group_var and not self.group_var.values:
        self.stats = self.dist = self.conts = []
        return
    if self.group_var:
        self.dist = []
        self.conts = contingency.get_contingency(
            dataset, attr, self.group_var)
        if self.is_continuous:
            stats, label_texts = [], []
            for i, cont in enumerate(self.conts):
                # Skip groups whose frequencies (row 1) sum to zero.
                if np.sum(cont[1]):
                    stats.append(BoxData(cont, attr, i, self.group_var))
                    label_texts.append(self.group_var.values[i])
            self.stats = stats
            self.label_txts_all = label_texts
        else:
            self.label_txts_all = \
                [v for v, c in zip(self.group_var.values, self.conts)
                 if np.sum(c) > 0]
    else:
        self.dist = distribution.get_distribution(dataset, attr)
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist, attr, None)]
        self.label_txts_all = [""]
    # NOTE(review): for a discrete attribute self.stats is not assigned above,
    # so the lines below read whatever value it held before this call —
    # confirm that this is intended.
    self.label_txts = [txts for stat, txts in zip(self.stats,
                                                  self.label_txts_all)
                       if stat.n > 0]
    self.stats = [stat for stat in self.stats if stat.n > 0]
def iterate_states(self, state): """ Iterate through all combinations of attributes as ordered by Relief, starting with a single attribute if Mosaic is colored by class distributions, and two if by Pearson. """ # If we put initialization of `self.attrs` to `initialize`, # `score_heuristic` would be run on every call to master's `set_data`. master = self.master data = master.discrete_data if state is None: # on the first call, compute order if self._compute_class_dists(): self.marginal = get_distribution(data, data.domain.class_var) self.marginal.normalize() state = [0] else: self.marginal = get_distributions(data) for dist in self.marginal: dist.normalize() state = [0, 1] n_attrs = len(data.domain.attributes) while True: yield state # Reset while running; just abort if self.attr_ordering is None: break for up, _ in enumerate(state): state[up] += 1 if up + 1 == len(state) or state[up] < state[up + 1]: break state[up] = up if state[-1] == len(self.attr_ordering): if len(state) < min(self.max_attrs, n_attrs): state = list(range(len(state) + 1)) else: break
def column_imputer_modus(variable, table):
    """Build an imputer for ``variable`` that uses the modus as the default.

    The modus is taken from the variable's distribution in ``table`` and
    handed to ``column_imputer_defaults``, whose result is returned.
    Previously the result was computed and then dropped, so callers always
    received ``None``.

    :param variable: variable to impute
    :param table: data from which the distribution is computed
    :return: whatever ``column_imputer_defaults`` builds for the modus
        (presumably an imputer model — confirm against that helper)
    """
    stat = distribution.get_distribution(table, variable)
    return column_imputer_defaults(variable, table, stat.modus())
def fit_storage(self, dat):
    """Build a ``MeanModel`` from the distribution of the class variable.

    :raises ValueError: if the class variable is not continuous
    """
    class_var = dat.domain.class_var
    if not isinstance(class_var, data.ContinuousVariable):
        raise ValueError("regression.MeanFitter expects a domain with a "
                         "(single) continuous variable")
    class_dist = distribution.get_distribution(dat, class_var)
    return MeanModel(class_dist)
def add_rect(x0, x1, y0, y1, condition="",
             used_attrs=[], used_vals=[], attr_vals=""):
    """Draw one mosaic cell spanning (x0, y0)-(x1, y1) and register it.

    An outer rectangle is always created and recorded in ``self.areas``
    together with ``used_attrs`` and ``used_vals``. If the cell holds any
    instances, its interior is colored either by the Pearson residual or by
    stacked class-value bars, and a tooltip is attached to the outer
    rectangle.

    Relies on names from the enclosing scope: ``self``, ``class_var``,
    ``conditionaldict``, ``conditionalsubsetdict``, ``apriori_dists``,
    ``data`` and ``bar_width``.

    :param condition: HTML text placed at the start of the tooltip
    :param used_attrs: attributes defining this cell (stored in self.areas)
    :param used_vals: values of those attributes; also used to index
        ``apriori_dists`` for the expected count
    :param attr_vals: key of this cell in ``conditionaldict``

    NOTE(review): ``used_attrs`` and ``used_vals`` have mutable list
    defaults; they are not mutated here, but the same default list objects
    are stored in ``self.areas`` on every call that omits them.
    """
    area_index = len(self.areas)
    # Degenerate (zero-size) cells are widened so they remain visible.
    if x0 == x1:
        x1 += 1
    if y0 == y1:
        y1 += 1

    # rectangles of width and height 1 are not shown - increase
    if x1 - x0 + y1 - y0 == 2:
        y1 += 1

    if class_var and class_var.is_discrete:
        colors = [QColor(*col) for col in class_var.colors]
    else:
        # NOTE(review): the class-coloring branch below indexes `colors`;
        # presumably it is only reached with a discrete class — confirm.
        colors = None

    def select_area(_, ev):
        # Click handler bound to this cell's index in self.areas.
        self.select_area(area_index, ev)

    def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
        # Create a clickable canvas rectangle; without a pen color the
        # rectangle is created without explicit colors, and the brush color
        # defaults to the pen color.
        if pen_color is None:
            return CanvasRectangle(
                self.canvas, x, y, w, h, z=z, onclick=select_area,
                **args)
        if brush_color is None:
            brush_color = pen_color
        return CanvasRectangle(
            self.canvas, x, y, w, h, pen_color, brush_color, z=z,
            onclick=select_area, **args)

    def line(x1, y1, x2, y2):
        # Add a white separator line (width 2) to the canvas.
        r = QGraphicsLineItem(x1, y1, x2, y2, None)
        self.canvas.addItem(r)
        r.setPen(QPen(Qt.white, 2))
        r.setZValue(30)

    outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
    self.areas.append((used_attrs, used_vals, outer_rect))
    # Empty cells get only the outer rectangle.
    if not conditionaldict[attr_vals]:
        return

    if self.interior_coloring == self.PEARSON:
        # Expected count: total times the product of the a priori relative
        # frequencies of the used values.
        s = sum(apriori_dists[0])
        expected = s * reduce(
            mul,
            (apriori_dists[i][used_vals[i]] / float(s)
             for i in range(len(used_vals))))
        actual = conditionaldict[attr_vals]
        # NOTE(review): divides by sqrt(expected); confirm that expected
        # cannot be zero for a non-empty cell.
        pearson = (actual - expected) / sqrt(expected)
        if pearson == 0:
            ind = 0
        else:
            # Color intensity index: log2 of |residual|, clamped to 0..3.
            ind = max(0, min(int(log(abs(pearson), 2)), 3))
        # Positive residuals use the blue palette, others the red one.
        color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
        rect(x0, y0, x1 - x0, y1 - y0, -20, color)
        outer_rect.setToolTip(
            condition + "<hr/>" +
            "Expected instances: %.1f<br>"
            "Actual instances: %d<br>"
            "Standardized (Pearson) residual: %.1f" %
            (expected, conditionaldict[attr_vals], pearson))
    else:
        cls_values = get_variable_values_sorted(class_var)
        prior = get_distribution(data, class_var.name)
        # Stack one rectangle per class value, with height proportional to
        # the class's share of the cell; the last class takes the remaining
        # height so the stack fills the cell exactly.
        total = 0
        for i, value in enumerate(cls_values):
            val = conditionaldict[attr_vals + "-" + value]
            if val == 0:
                continue
            if i == len(cls_values) - 1:
                v = y1 - y0 - total
            else:
                v = ((y1 - y0) * val) / conditionaldict[attr_vals]
            rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
            total += v

        # Optional bar on the left edge showing the prior class
        # distribution, drawn only if the cell is large enough.
        if self.use_boxes and \
                abs(x1 - x0) > bar_width and \
                abs(y1 - y0) > bar_width:
            total = 0
            line(x0 + bar_width, y0, x0 + bar_width, y1)
            n = sum(prior)
            for i, (val, color) in enumerate(zip(prior, colors)):
                if i == len(prior) - 1:
                    h = y1 - y0 - total
                else:
                    h = (y1 - y0) * val / n
                rect(x0, y0 + total, bar_width, h, 20, color)
                total += h

        if conditionalsubsetdict:
            if conditionalsubsetdict[attr_vals]:
                counts = [
                    conditionalsubsetdict[attr_vals + "-" + val]
                    for val in cls_values
                ]
                # A single subset instance in this cell is marked with a
                # dashed frame in its class color.
                if sum(counts) == 1:
                    rect(x0 - 2, y0 - 2, x1 - x0 + 5, y1 - y0 + 5, -550,
                         colors[counts.index(1)], Qt.white,
                         penWidth=2, penStyle=Qt.DashLine)
            if self.subset_data is not None:
                # Bar on the right edge showing the class distribution of
                # the subset instances in this cell.
                line(x1 - bar_width, y0, x1 - bar_width, y1)
                total = 0
                n = conditionalsubsetdict[attr_vals]
                if n:
                    for i, (cls, color) in \
                            enumerate(zip(cls_values, colors)):
                        val = conditionalsubsetdict[attr_vals + "-" + cls]
                        if val == 0:
                            continue
                        if i == len(prior) - 1:
                            v = y1 - y0 - total
                        else:
                            v = ((y1 - y0) * val) / n
                        rect(x1 - bar_width, y0 + total, bar_width, v,
                             15, color)
                        total += v

        actual = [
            conditionaldict[attr_vals + "-" + cls_values[i]]
            for i in range(len(prior))
        ]
        n_actual = sum(actual)
        if n_actual > 0:
            apriori = [prior[key] for key in cls_values]
            n_apriori = sum(apriori)
            text = "<br/>".join(
                "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                (cls, act, 100.0 * act / n_actual,
                 apr / n_apriori * n_actual, 100.0 * apr / n_apriori)
                for cls, act, apr in zip(cls_values, actual, apriori))
        else:
            text = ""
        # NOTE(review): text is built with join and has no trailing
        # separator, so text[:-4] drops the last four characters of the
        # final entry — looks like a leftover; confirm before changing.
        outer_rect.setToolTip("{}<hr>Instances: {}<br><br>{}".format(
            condition, n_actual, text[:-4]))
def update_graph(self):
    """Clear the canvas and redraw rectangles, labels and legend.

    Reads ``self.discrete_data`` (and ``self.subset_data``), computes the
    conditional counts for the attributes returned by
    ``self.get_attr_list()`` and recursively splits a square region among
    their values. Returns early without drawing when there is no data,
    no rows, or the canvas is too small.
    """
    spacing = self.SPACING
    bar_width = self.BAR_WIDTH

    def draw_data(attr_list, x0_x1, y0_y1, side, condition,
                  total_attrs, used_attrs=None, used_vals=None,
                  attr_vals=""):
        """Split the given region among the values of ``attr_list[0]``.

        Recurses over the remaining attributes with ``side + 1``; the
        split is along x for even `side` and along y for odd `side`.
        Leaves are drawn with `add_rect`.
        """
        # Avoid shared mutable defaults; the lists are passed on and
        # stored by add_rect.
        if used_attrs is None:
            used_attrs = []
        if used_vals is None:
            used_vals = []
        x0, x1 = x0_x1
        y0, y1 = y0_y1
        if conditionaldict[attr_vals] == 0:
            add_rect(x0, x1, y0, y1, "",
                     used_attrs, used_vals, attr_vals=attr_vals)
            # store coordinates for later drawing of labels
            draw_text(side, attr_list[0], (x0, x1), (y0, y1), total_attrs,
                      used_attrs, used_vals, attr_vals)
            return

        attr = attr_list[0]
        # how much smaller rectangles do we draw
        edge = len(attr_list) * spacing
        values = get_variable_values_sorted(data.domain[attr])
        if side % 2:
            values = values[::-1]  # reverse names if necessary

        if side % 2 == 0:  # we are drawing on the x axis
            # remove the space needed for separating different attr. values
            whole = max(0, (x1 - x0) - edge * (len(values) - 1))
            if whole == 0:
                edge = (x1 - x0) / float(len(values) - 1)
        else:  # we are drawing on the y axis
            whole = max(0, (y1 - y0) - edge * (len(values) - 1))
            if whole == 0:
                edge = (y1 - y0) / float(len(values) - 1)

        if attr_vals == "":
            counts = [conditionaldict[val] for val in values]
        else:
            counts = [
                conditionaldict[attr_vals + "-" + val] for val in values
            ]
        total = sum(counts)

        # if we are visualizing the third attribute and the first attribute
        # has the last value, we have to reverse the order in which the
        # boxes will be drawn otherwise, if the last cell, nearest to the
        # labels of the fourth attribute, is empty, we wouldn't be able to
        # position the labels
        valrange = list(range(len(values)))
        if len(attr_list + used_attrs) == 4 and len(used_attrs) == 2:
            attr1values = get_variable_values_sorted(
                data.domain[used_attrs[0]])
            if used_vals[0] == attr1values[-1]:
                valrange = valrange[::-1]

        for i in valrange:
            # Offsets are cumulative proportions of the free space plus
            # one gap per preceding value.
            start = i * edge + whole * float(sum(counts[:i]) / total)
            end = i * edge + whole * float(sum(counts[:i + 1]) / total)
            val = values[i]
            htmlval = getHtmlCompatibleString(val)
            if attr_vals != "":
                newattrvals = attr_vals + "-" + val
            else:
                newattrvals = val

            tooltip = condition + 4 * " " + attr + \
                ": <b>" + htmlval + "</b><br>"
            attrs = used_attrs + [attr]
            vals = used_vals + [val]
            common_args = attrs, vals, newattrvals
            if side % 2 == 0:  # if we are moving horizontally
                if len(attr_list) == 1:
                    add_rect(x0 + start, x0 + end, y0, y1,
                             tooltip, *common_args)
                else:
                    draw_data(attr_list[1:], (x0 + start, x0 + end),
                              (y0, y1), side + 1,
                              tooltip, total_attrs, *common_args)
            else:
                if len(attr_list) == 1:
                    add_rect(x0, x1, y0 + start, y0 + end,
                             tooltip, *common_args)
                else:
                    draw_data(attr_list[1:], (x0, x1),
                              (y0 + start, y0 + end), side + 1,
                              tooltip, total_attrs, *common_args)

        draw_text(side, attr_list[0], (x0, x1), (y0, y1),
                  total_attrs, used_attrs, used_vals, attr_vals)

    def draw_text(side, attr, x0_x1, y0_y1,
                  total_attrs, used_attrs, used_vals, attr_vals):
        """Draw the value labels and attribute name for `side`, once.

        Sides already in ``drawn_sides`` are skipped. For an empty cell
        the coordinates are only remembered in ``draw_positions`` and
        reused when a non-empty cell for the same side is reached.
        """
        x0, x1 = x0_x1
        y0, y1 = y0_y1
        if side in drawn_sides:
            return

        # the text on the right will be drawn when we are processing
        # visualization of the last value of the first attribute
        if side == 3:
            attr1values = \
                get_variable_values_sorted(data.domain[used_attrs[0]])
            if used_vals[0] != attr1values[-1]:
                return

        if not conditionaldict[attr_vals]:
            if side not in draw_positions:
                draw_positions[side] = (x0, x1, y0, y1)
            return
        else:
            if side in draw_positions:
                # restore the positions of attribute values and name
                (x0, x1, y0, y1) = draw_positions[side]

        drawn_sides.add(side)

        values = get_variable_values_sorted(data.domain[attr])
        if side % 2:
            values = values[::-1]

        spaces = spacing * (total_attrs - side) * (len(values) - 1)
        width = x1 - x0 - spaces * (side % 2 == 0)
        height = y1 - y0 - spaces * (side % 2 == 1)

        # calculate position of first attribute
        currpos = 0

        if attr_vals == "":
            counts = [conditionaldict.get(val, 1) for val in values]
        else:
            counts = [
                conditionaldict.get(attr_vals + "-" + val, 1)
                for val in values
            ]
        total = sum(counts)
        if total == 0:
            counts = [1] * len(values)
            total = sum(counts)

        aligns = [
            Qt.AlignTop | Qt.AlignHCenter,
            Qt.AlignRight | Qt.AlignVCenter,
            Qt.AlignBottom | Qt.AlignHCenter,
            Qt.AlignLeft | Qt.AlignVCenter
        ]
        align = aligns[side]
        for i in range(len(values)):
            val = values[i]
            perc = counts[i] / float(total)
            if distributiondict[val] != 0:
                if side == 0:
                    CanvasText(self.canvas, str(val),
                               x0 + currpos + width * 0.5 * perc,
                               y1 + self.ATTR_VAL_OFFSET, align)
                elif side == 1:
                    CanvasText(self.canvas, str(val),
                               x0 - self.ATTR_VAL_OFFSET,
                               y0 + currpos + height * 0.5 * perc, align)
                elif side == 2:
                    CanvasText(self.canvas, str(val),
                               x0 + currpos + width * perc * 0.5,
                               y0 - self.ATTR_VAL_OFFSET, align)
                else:
                    CanvasText(self.canvas, str(val),
                               x1 + self.ATTR_VAL_OFFSET,
                               y0 + currpos + height * 0.5 * perc, align)

            if side % 2 == 0:
                currpos += perc * width + spacing * (total_attrs - side)
            else:
                currpos += perc * height + spacing * (total_attrs - side)

        if side == 0:
            CanvasText(
                self.canvas, attr,
                x0 + (x1 - x0) / 2,
                y1 + self.ATTR_VAL_OFFSET + self.ATTR_NAME_OFFSET,
                align, bold=1)
        elif side == 1:
            CanvasText(
                self.canvas, attr,
                x0 - max_ylabel_w1 - self.ATTR_VAL_OFFSET,
                y0 + (y1 - y0) / 2,
                align, bold=1, vertical=True)
        elif side == 2:
            CanvasText(
                self.canvas, attr,
                x0 + (x1 - x0) / 2,
                y0 - self.ATTR_VAL_OFFSET - self.ATTR_NAME_OFFSET,
                align, bold=1)
        else:
            CanvasText(
                self.canvas, attr,
                x1 + max_ylabel_w2 + self.ATTR_VAL_OFFSET,
                y0 + (y1 - y0) / 2,
                align, bold=1, vertical=True)

    def add_rect(x0, x1, y0, y1, condition="",
                 used_attrs=None, used_vals=None, attr_vals=""):
        """Draw one cell and register it in ``self.areas``.

        The clickable outer rectangle is always created; the interior
        (Pearson colouring or per-class stacking, optional prior and
        subset bars) and the tooltip are added only when
        ``conditionaldict[attr_vals]`` is non-zero.
        """
        # Fresh lists per call: they are stored in self.areas.
        if used_attrs is None:
            used_attrs = []
        if used_vals is None:
            used_vals = []

        area_index = len(self.areas)
        if x0 == x1:
            x1 += 1
        if y0 == y1:
            y1 += 1

        # rectangles of width and height 1 are not shown - increase
        if x1 - x0 + y1 - y0 == 2:
            y1 += 1

        if class_var and class_var.is_discrete:
            colors = [QColor(*col) for col in class_var.colors]
        else:
            colors = None

        def select_area(_, ev):
            """Forward a click on any part of this cell to the widget."""
            self.select_area(area_index, ev)

        def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
            """Create a clickable CanvasRectangle; brush defaults to pen."""
            if pen_color is None:
                return CanvasRectangle(
                    self.canvas, x, y, w, h, z=z, onclick=select_area,
                    **args)
            if brush_color is None:
                brush_color = pen_color
            return CanvasRectangle(
                self.canvas, x, y, w, h, pen_color, brush_color, z=z,
                onclick=select_area, **args)

        def line(x1, y1, x2, y2):
            """Add a white separator line (pen width 2, z-value 30)."""
            r = QGraphicsLineItem(x1, y1, x2, y2, None)
            self.canvas.addItem(r)
            r.setPen(QPen(Qt.white, 2))
            r.setZValue(30)

        outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
        self.areas.append((used_attrs, used_vals, outer_rect))
        # Empty cells get only the outer rectangle.
        if not conditionaldict[attr_vals]:
            return

        if self.interior_coloring == self.PEARSON:
            # Expected count: total times the product of the marginal
            # proportions of the used values.
            s = sum(apriori_dists[0])
            expected = s * reduce(
                mul,
                (apriori_dists[i][used_vals[i]] / float(s)
                 for i in range(len(used_vals))))
            actual = conditionaldict[attr_vals]
            pearson = (actual - expected) / sqrt(expected)
            if pearson == 0:
                ind = 0
            else:
                # Colour index grows with log2 of |residual|, clamped
                # to 0..3.
                ind = max(0, min(int(log(abs(pearson), 2)), 3))
            color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
            rect(x0, y0, x1 - x0, y1 - y0, -20, color)
            outer_rect.setToolTip(
                condition + "<hr/>" +
                "Expected instances: %.1f<br>"
                "Actual instances: %d<br>"
                "Standardized (Pearson) residual: %.1f" %
                (expected, conditionaldict[attr_vals], pearson))
        else:
            cls_values = get_variable_values_sorted(class_var)
            prior = get_distribution(data, class_var.name)
            total = 0
            for i, value in enumerate(cls_values):
                val = conditionaldict[attr_vals + "-" + value]
                if val == 0:
                    continue
                if i == len(cls_values) - 1:
                    # The last class takes whatever height remains.
                    v = y1 - y0 - total
                else:
                    v = ((y1 - y0) * val) / conditionaldict[attr_vals]
                rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
                total += v

            if self.use_boxes and \
                    abs(x1 - x0) > bar_width and \
                    abs(y1 - y0) > bar_width:
                # Bar on the left edge showing the prior distribution.
                total = 0
                line(x0 + bar_width, y0, x0 + bar_width, y1)
                n = sum(prior)
                for i, (val, color) in enumerate(zip(prior, colors)):
                    if i == len(prior) - 1:
                        h = y1 - y0 - total
                    else:
                        h = (y1 - y0) * val / n
                    rect(x0, y0 + total, bar_width, h, 20, color)
                    total += h

            if conditionalsubsetdict:
                if conditionalsubsetdict[attr_vals]:
                    counts = [
                        conditionalsubsetdict[attr_vals + "-" + val]
                        for val in cls_values
                    ]
                    if sum(counts) == 1:
                        # Exactly one subset instance: dashed frame in
                        # the colour of its class.
                        rect(x0 - 2, y0 - 2, x1 - x0 + 5, y1 - y0 + 5,
                             -550, colors[counts.index(1)], Qt.white,
                             penWidth=2, penStyle=Qt.DashLine)
                    if self.subset_data is not None:
                        # Bar on the right edge for the subset's classes.
                        line(x1 - bar_width, y0, x1 - bar_width, y1)
                        total = 0
                        n = conditionalsubsetdict[attr_vals]
                        if n:
                            for i, (cls, color) in \
                                    enumerate(zip(cls_values, colors)):
                                val = conditionalsubsetdict[
                                    attr_vals + "-" + cls]
                                if val == 0:
                                    continue
                                if i == len(prior) - 1:
                                    v = y1 - y0 - total
                                else:
                                    v = ((y1 - y0) * val) / n
                                rect(x1 - bar_width, y0 + total,
                                     bar_width, v, 15, color)
                                total += v

            actual = [
                conditionaldict[attr_vals + "-" + cls_values[i]]
                for i in range(len(prior))
            ]
            n_actual = sum(actual)
            if n_actual > 0:
                apriori = [prior[key] for key in cls_values]
                n_apriori = sum(apriori)
                text = "<br/>".join(
                    "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                    (cls, act, 100.0 * act / n_actual,
                     apr / n_apriori * n_actual, 100.0 * apr / n_apriori)
                    for cls, act, apr in zip(cls_values, actual, apriori))
            else:
                text = ""
            # The lines are joined with "<br/>", so there is no trailing
            # separator to strip; slicing off four characters used to
            # cut the end of the last class line.
            outer_rect.setToolTip("{}<hr>Instances: {}<br><br>{}".format(
                condition, n_actual, text))

    def draw_legend(x0_x1, y0_y1):
        """Draw a one-row legend centred below the given region."""
        x0, x1 = x0_x1
        y0, y1 = y0_y1
        if self.interior_coloring == self.PEARSON:
            names = [
                "<-8", "-8:-4", "-4:-2", "-2:2", "2:4", "4:8", ">8",
                "Residuals:"
            ]
            colors = self.RED_COLORS[::-1] + self.BLUE_COLORS[1:]
        else:
            names = get_variable_values_sorted(class_var) + \
                [class_var.name + ":"]
            colors = [QColor(*col) for col in class_var.colors]

        names = [
            CanvasText(self.canvas, name, alignment=Qt.AlignVCenter)
            for name in names
        ]
        totalwidth = sum(text.boundingRect().width() for text in names)

        # compute the x position of the center of the legend
        y = y1 + self.ATTR_NAME_OFFSET + self.ATTR_VAL_OFFSET + 35
        distance = 30
        startx = (x0 + x1) / 2 - (totalwidth + (len(names)) * distance) / 2

        # The last entry is the legend title; it is placed first.
        names[-1].setPos(startx + 15, y)
        names[-1].show()
        xoffset = names[-1].boundingRect().width() + distance

        size = 8

        for i in range(len(names) - 1):
            if self.interior_coloring == self.PEARSON:
                edgecolor = Qt.black
            else:
                edgecolor = colors[i]

            CanvasRectangle(self.canvas, startx + xoffset, y - size / 2,
                            size, size, edgecolor, colors[i])
            names[i].setPos(startx + xoffset + 10, y)
            xoffset += distance + names[i].boundingRect().width()

    self.canvas.clear()
    self.areas = []

    data = self.discrete_data
    if data is None:
        return
    subset = self.subset_data
    attr_list = self.get_attr_list()
    class_var = data.domain.class_var
    if class_var:
        sql = type(data) == SqlTable
        name = not sql and data.name
        # save class_var because it is removed in the next line
        data = data[:, attr_list + [class_var]]
        data.domain.class_var = class_var
        if not sql:
            data.name = name
    else:
        data = data[:, attr_list]
    # TODO: check this
    # data = Preprocessor_dropMissing(data)
    if len(data) == 0:
        self.warning(5, "No valid data for current attributes.")
        return
    else:
        self.warning(5)

    if self.interior_coloring == self.PEARSON:
        apriori_dists = [
            get_distribution(data, attr) for attr in attr_list
        ]
    else:
        apriori_dists = []

    def get_max_label_width(attr):
        """Return the widest value label of `attr`, as an int width."""
        values = get_variable_values_sorted(data.domain[attr])
        maxw = 0
        for val in values:
            t = CanvasText(self.canvas, val, 0, 0, bold=0, show=False)
            maxw = max(int(t.boundingRect().width()), maxw)
        return maxw

    # get the maximum width of rectangle
    xoff = 20
    width = 20
    if len(attr_list) > 1:
        text = CanvasText(self.canvas, attr_list[1], bold=1, show=0)
        max_ylabel_w1 = min(get_max_label_width(attr_list[1]), 150)
        width = 5 + text.boundingRect().height() + \
            self.ATTR_VAL_OFFSET + max_ylabel_w1
        xoff = width
        if len(attr_list) == 4:
            text = CanvasText(self.canvas, attr_list[3], bold=1, show=0)
            max_ylabel_w2 = min(get_max_label_width(attr_list[3]), 150)
            width += text.boundingRect().height() + \
                self.ATTR_VAL_OFFSET + max_ylabel_w2 - 10

    # get the maximum height of rectangle
    height = 100
    yoff = 45
    square_size = min(self.canvas_view.width() - width - 20,
                      self.canvas_view.height() - height - 20)

    if square_size < 0:
        return  # canvas is too small to draw rectangles
    self.canvas_view.setSceneRect(0, 0, self.canvas_view.width(),
                                  self.canvas_view.height())

    drawn_sides = set()
    draw_positions = {}

    conditionaldict, distributiondict = \
        get_conditional_distribution(data, attr_list)
    conditionalsubsetdict = None
    if subset:
        conditionalsubsetdict, _ = \
            get_conditional_distribution(subset, attr_list)

    # draw rectangles
    draw_data(attr_list, (xoff, xoff + square_size),
              (yoff, yoff + square_size), 0, "", len(attr_list))
    draw_legend((xoff, xoff + square_size), (yoff, yoff + square_size))
    self.update_selection_rects()
def update_graph(self):
    """Clear the canvas and redraw rectangles, labels and legend.

    Reads ``self.discrete_data`` and ``self.subset_indices``, computes
    the conditional counts for the attributes returned by
    ``self.get_disc_attr_list()`` and recursively splits a square region
    among their values. Returns early without drawing the rectangles
    when there is no data, no rows, an attribute without values, or the
    canvas is too small.
    """
    spacing = self.SPACING
    bar_width = self.BAR_WIDTH

    def get_counts(attr_vals, values):
        """Calculate rectangles' widths; if all are 0, they are set to 1."""
        if not attr_vals:
            counts = [conditionaldict[val] for val in values]
        else:
            counts = [
                conditionaldict[attr_vals + "-" + val] for val in values
            ]
        total = sum(counts)
        if total == 0:
            counts = [1] * len(values)
            total = sum(counts)
        return total, counts

    def draw_data(attr_list, x0_x1, y0_y1, side, condition,
                  total_attrs, used_attrs, used_vals, attr_vals=""):
        """Split the given region among the values of ``attr_list[0]``.

        Recurses over the remaining attributes with ``side + 1``; the
        split is along x for even `side` and along y for odd `side`.
        Leaves are drawn with `add_rect`.
        """
        x0, x1 = x0_x1
        y0, y1 = y0_y1
        if conditionaldict[attr_vals] == 0:
            add_rect(x0, x1, y0, y1, "",
                     used_attrs, used_vals, attr_vals=attr_vals)
            # store coordinates for later drawing of labels
            draw_text(side, attr_list[0], (x0, x1), (y0, y1), total_attrs,
                      used_attrs, used_vals, attr_vals)
            return

        attr = attr_list[0]
        # how much smaller rectangles do we draw
        edge = len(attr_list) * spacing
        values = get_variable_values_sorted(attr)
        if side % 2:
            values = values[::-1]  # reverse names if necessary

        if side % 2 == 0:  # we are drawing on the x axis
            # remove the space needed for separating different attr. values
            whole = max(0, (x1 - x0) - edge * (len(values) - 1))
            if whole == 0:
                edge = (x1 - x0) / float(len(values) - 1)
        else:  # we are drawing on the y axis
            whole = max(0, (y1 - y0) - edge * (len(values) - 1))
            if whole == 0:
                edge = (y1 - y0) / float(len(values) - 1)

        total, counts = get_counts(attr_vals, values)

        # when visualizing the third attribute and the first attribute has
        # the last value, reverse the order in which the boxes are drawn;
        # otherwise, if the last cell, nearest to the labels of the fourth
        # attribute, is empty, we wouldn't be able to position the labels
        valrange = list(range(len(values)))
        if len(attr_list + used_attrs) == 4 and len(used_attrs) == 2:
            attr1values = get_variable_values_sorted(used_attrs[0])
            if used_vals[0] == attr1values[-1]:
                valrange = valrange[::-1]

        for i in valrange:
            # Offsets are cumulative proportions of the free space plus
            # one gap per preceding value.
            start = i * edge + whole * float(sum(counts[:i]) / total)
            end = i * edge + whole * float(sum(counts[:i + 1]) / total)
            val = values[i]
            htmlval = to_html(val)
            newattrvals = attr_vals + "-" + val if attr_vals else val

            tooltip = "{} {}: <b>{}</b><br/>".format(
                condition, attr.name, htmlval)
            attrs = used_attrs + [attr]
            vals = used_vals + [val]
            args = attrs, vals, newattrvals
            if side % 2 == 0:  # if we are moving horizontally
                if len(attr_list) == 1:
                    add_rect(x0 + start, x0 + end, y0, y1, tooltip, *args)
                else:
                    draw_data(attr_list[1:], (x0 + start, x0 + end),
                              (y0, y1), side + 1,
                              tooltip, total_attrs, *args)
            else:
                if len(attr_list) == 1:
                    add_rect(x0, x1, y0 + start, y0 + end, tooltip, *args)
                else:
                    draw_data(attr_list[1:], (x0, x1),
                              (y0 + start, y0 + end), side + 1,
                              tooltip, total_attrs, *args)

        draw_text(side, attr_list[0], (x0, x1), (y0, y1),
                  total_attrs, used_attrs, used_vals, attr_vals)

    def draw_text(side, attr, x0_x1, y0_y1,
                  total_attrs, used_attrs, used_vals, attr_vals):
        """Draw the value labels and attribute name for `side`, once.

        Sides already in ``drawn_sides`` are skipped. For an empty cell
        the coordinates are only remembered in ``draw_positions`` and
        reused when a non-empty cell for the same side is reached.
        """
        x0, x1 = x0_x1
        y0, y1 = y0_y1
        if side in drawn_sides:
            return

        # the text on the right will be drawn when we are processing
        # visualization of the last value of the first attribute
        if side == 3:
            attr1values = get_variable_values_sorted(used_attrs[0])
            if used_vals[0] != attr1values[-1]:
                return

        if not conditionaldict[attr_vals]:
            if side not in draw_positions:
                draw_positions[side] = (x0, x1, y0, y1)
            return
        else:
            if side in draw_positions:
                # restore the positions of attribute values and name
                (x0, x1, y0, y1) = draw_positions[side]

        drawn_sides.add(side)

        values = get_variable_values_sorted(attr)
        if side % 2:
            values = values[::-1]

        spaces = spacing * (total_attrs - side) * (len(values) - 1)
        width = x1 - x0 - spaces * (side % 2 == 0)
        height = y1 - y0 - spaces * (side % 2 == 1)

        # calculate position of first attribute
        currpos = 0

        total, counts = get_counts(attr_vals, values)

        aligns = [
            Qt.AlignTop | Qt.AlignHCenter,
            Qt.AlignRight | Qt.AlignVCenter,
            Qt.AlignBottom | Qt.AlignHCenter,
            Qt.AlignLeft | Qt.AlignVCenter
        ]
        align = aligns[side]
        for i, val in enumerate(values):
            if distributiondict[val] != 0:
                perc = counts[i] / float(total)
                rwidth = width * perc
                # Candidate label positions indexed by side.
                xs = [
                    x0 + currpos + rwidth / 2,
                    x0 - self.ATTR_VAL_OFFSET,
                    x0 + currpos + rwidth / 2,
                    x1 + self.ATTR_VAL_OFFSET
                ]
                ys = [
                    y1 + self.ATTR_VAL_OFFSET,
                    y0 + currpos + height * 0.5 * perc,
                    y0 - self.ATTR_VAL_OFFSET,
                    y0 + currpos + height * 0.5 * perc
                ]

                CanvasText(self.canvas, val, xs[side], ys[side], align,
                           max_width=rwidth if side == 0 else None)
                space = height if side % 2 else width
                currpos += perc * space + spacing * (total_attrs - side)

        # Candidate attribute-name positions indexed by side.
        xs = [
            x0 + (x1 - x0) / 2,
            x0 - max_ylabel_w1 - self.ATTR_VAL_OFFSET,
            x0 + (x1 - x0) / 2,
            x1 + max_ylabel_w2 + self.ATTR_VAL_OFFSET
        ]
        ys = [
            y1 + self.ATTR_VAL_OFFSET + self.ATTR_NAME_OFFSET,
            y0 + (y1 - y0) / 2,
            y0 - self.ATTR_VAL_OFFSET - self.ATTR_NAME_OFFSET,
            y0 + (y1 - y0) / 2
        ]
        CanvasText(self.canvas, attr.name, xs[side], ys[side], align,
                   bold=True, vertical=side % 2)

    def add_rect(x0, x1, y0, y1, condition,
                 used_attrs, used_vals, attr_vals=""):
        """Draw one cell and register it in ``self.areas``.

        The clickable outer rectangle is always created; the interior
        (Pearson colouring or per-class stacking, optional prior and
        subset bars) and the tooltip are added only when
        ``conditionaldict[attr_vals]`` is non-zero.
        """
        area_index = len(self.areas)
        # Zero-size cells are widened by one unit.
        x1 += (x0 == x1)
        y1 += (y0 == y1)
        # rectangles of width and height 1 are not shown - increase
        y1 += (x1 - x0 + y1 - y0 == 2)
        colors = class_var and [QColor(*col) for col in class_var.colors]

        def select_area(_, ev):
            """Forward a click on any part of this cell to the widget."""
            self.select_area(area_index, ev)

        def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
            """Create a clickable CanvasRectangle; brush defaults to pen."""
            if pen_color is None:
                return CanvasRectangle(
                    self.canvas, x, y, w, h, z=z, onclick=select_area,
                    **args)
            if brush_color is None:
                brush_color = pen_color
            return CanvasRectangle(
                self.canvas, x, y, w, h, pen_color, brush_color, z=z,
                onclick=select_area, **args)

        def line(x1, y1, x2, y2):
            """Add a white separator line (pen width 2, z-value 30)."""
            r = QGraphicsLineItem(x1, y1, x2, y2, None)
            self.canvas.addItem(r)
            r.setPen(QPen(Qt.white, 2))
            r.setZValue(30)

        outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
        self.areas.append((used_attrs, used_vals, outer_rect))
        # Empty cells get only the outer rectangle.
        if not conditionaldict[attr_vals]:
            return

        if self.variable_color is None:
            # Expected count: total times the product of the marginal
            # proportions of the used values.
            s = sum(apriori_dists[0])
            expected = s * reduce(
                mul,
                (apriori_dists[i][used_vals[i]] / float(s)
                 for i in range(len(used_vals))))
            actual = conditionaldict[attr_vals]
            pearson = float((actual - expected) / sqrt(expected))
            if pearson == 0:
                ind = 0
            else:
                # Colour index grows with log2 of |residual|, clamped
                # to 0..3.
                ind = max(0, min(int(log(abs(pearson), 2)), 3))
            color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
            rect(x0, y0, x1 - x0, y1 - y0, -20, color)
            outer_rect.setToolTip(
                condition + "<hr/>" +
                "Expected instances: %.1f<br>"
                "Actual instances: %d<br>"
                "Standardized (Pearson) residual: %.1f" %
                (expected, conditionaldict[attr_vals], pearson))
        else:
            cls_values = get_variable_values_sorted(class_var)
            prior = get_distribution(data, class_var.name)
            total = 0
            for i, value in enumerate(cls_values):
                val = conditionaldict[attr_vals + "-" + value]
                if val == 0:
                    continue
                if i == len(cls_values) - 1:
                    # The last class takes whatever height remains.
                    v = y1 - y0 - total
                else:
                    v = ((y1 - y0) * val) / conditionaldict[attr_vals]
                rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
                total += v

            if self.use_boxes and \
                    abs(x1 - x0) > bar_width and abs(y1 - y0) > bar_width:
                # Bar on the left edge showing the prior distribution.
                total = 0
                line(x0 + bar_width, y0, x0 + bar_width, y1)
                n = sum(prior)
                for i, (val, color) in enumerate(zip(prior, colors)):
                    if i == len(prior) - 1:
                        h = y1 - y0 - total
                    else:
                        h = (y1 - y0) * val / n
                    rect(x0, y0 + total, bar_width, h, 20, color)
                    total += h

            if conditionalsubsetdict:
                if conditionalsubsetdict[attr_vals]:
                    if self.subset_indices is not None:
                        # Bar on the right edge for the subset's classes.
                        line(x1 - bar_width, y0, x1 - bar_width, y1)
                        total = 0
                        n = conditionalsubsetdict[attr_vals]
                        if n:
                            for i, (cls, color) in \
                                    enumerate(zip(cls_values, colors)):
                                val = conditionalsubsetdict[
                                    attr_vals + "-" + cls]
                                if val == 0:
                                    continue
                                if i == len(prior) - 1:
                                    v = y1 - y0 - total
                                else:
                                    v = ((y1 - y0) * val) / n
                                rect(x1 - bar_width, y0 + total,
                                     bar_width, v, 15, color)
                                total += v

            actual = [
                conditionaldict[attr_vals + "-" + cls_values[i]]
                for i in range(len(prior))
            ]
            n_actual = sum(actual)
            if n_actual > 0:
                apriori = [prior[key] for key in cls_values]
                n_apriori = sum(apriori)
                text = "<br/>".join(
                    "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                    (cls, act, 100.0 * act / n_actual,
                     apr / n_apriori * n_actual, 100.0 * apr / n_apriori)
                    for cls, act, apr in zip(cls_values, actual, apriori))
            else:
                text = ""
            # The lines are joined with "<br/>", so there is no trailing
            # separator to strip; slicing off four characters used to
            # cut the end of the last class line.
            outer_rect.setToolTip("{}<hr>Instances: {}<br><br>{}".format(
                condition, n_actual, text))

    def create_legend():
        """Build the legend items and return them wrapped to fit."""
        if self.variable_color is None:
            names = [
                "<-8", "-8:-4", "-4:-2", "-2:2", "2:4", "4:8", ">8",
                "Residuals:"
            ]
            colors = self.RED_COLORS[::-1] + self.BLUE_COLORS[1:]
            edges = repeat(Qt.black)
        else:
            names = get_variable_values_sorted(class_var)
            edges = colors = [QColor(*col) for col in class_var.colors]

        items = []
        size = 8
        # zip stops at the shortest of names/colors/edges.
        for name, color, edgecolor in zip(names, colors, edges):
            item = QGraphicsItemGroup()
            item.addToGroup(
                CanvasRectangle(None, -size / 2, -size / 2, size, size,
                                edgecolor, color))
            item.addToGroup(
                CanvasText(None, name, size, 0, Qt.AlignVCenter))
            items.append(item)
        return wrap_legend_items(
            items, hspacing=20, vspacing=16 + size,
            max_width=self.canvas_view.width() - xoff)

    self.canvas.clear()
    self.areas = []

    data = self.discrete_data
    if data is None:
        return
    attr_list = self.get_disc_attr_list()
    class_var = data.domain.class_var
    # TODO: check this
    # data = Preprocessor_dropMissing(data)

    unique = [v.name for v in set(attr_list + [class_var]) if v]
    if len(data[:, unique]) == 0:
        self.Warning.no_valid_data()
        return
    else:
        self.Warning.no_valid_data.clear()

    attrs = [attr for attr in attr_list if not attr.values]
    if attrs:
        CanvasText(self.canvas,
                   "Feature {} has no values".format(attrs[0]),
                   (self.canvas_view.width() - 120) / 2,
                   self.canvas_view.height() / 2)
        return
    if self.variable_color is None:
        apriori_dists = [
            get_distribution(data, attr) for attr in attr_list
        ]
    else:
        apriori_dists = []

    def get_max_label_width(attr):
        """Return the widest value label of `attr`, as an int width."""
        values = get_variable_values_sorted(attr)
        maxw = 0
        for val in values:
            t = CanvasText(self.canvas, val, 0, 0, bold=0, show=False)
            maxw = max(int(t.boundingRect().width()), maxw)
        return maxw

    xoff = 20

    # get the maximum width of rectangle
    width = 20
    max_ylabel_w1 = max_ylabel_w2 = 0
    if len(attr_list) > 1:
        text = CanvasText(self.canvas, attr_list[1].name, bold=1, show=0)
        max_ylabel_w1 = min(get_max_label_width(attr_list[1]), 150)
        width = 5 + text.boundingRect().height() + \
            self.ATTR_VAL_OFFSET + max_ylabel_w1
        xoff = width
        if len(attr_list) == 4:
            text = CanvasText(self.canvas, attr_list[3].name,
                              bold=1, show=0)
            max_ylabel_w2 = min(get_max_label_width(attr_list[3]), 150)
            width += text.boundingRect().height() + \
                self.ATTR_VAL_OFFSET + max_ylabel_w2 - 10

    legend = create_legend()

    # get the maximum height of rectangle
    yoff = 45
    legendoff = yoff + self.ATTR_NAME_OFFSET + self.ATTR_VAL_OFFSET + 35
    square_size = min(
        self.canvas_view.width() - width - 20,
        self.canvas_view.height() - legendoff
        - legend.boundingRect().height())

    if square_size < 0:
        return  # canvas is too small to draw rectangles
    self.canvas_view.setSceneRect(0, 0, self.canvas_view.width(),
                                  self.canvas_view.height())

    drawn_sides = set()
    draw_positions = {}

    conditionaldict, distributiondict = \
        get_conditional_distribution(data, attr_list)
    conditionalsubsetdict = None
    if self.subset_indices:
        conditionalsubsetdict, _ = get_conditional_distribution(
            self.discrete_data[self.subset_indices], attr_list)

    # draw rectangles
    draw_data(attr_list, (xoff, xoff + square_size),
              (yoff, yoff + square_size), 0, "", len(attr_list), [], [])

    self.canvas.addItem(legend)
    legend.setPos(
        xoff - legend.boundingRect().x()
        + max(0, (square_size - legend.boundingRect().width()) / 2),
        legendoff + square_size)
    self.update_selection_rects()
def add_rect(x0, x1, y0, y1, condition="",
             used_attrs=None, used_vals=None, attr_vals=""):
    """Draw one cell spanning (x0, y0)-(x1, y1) and register it as an area.

    A clickable outer rectangle is always created and appended to
    ``self.areas`` together with `used_attrs` and `used_vals`. If
    ``conditionaldict[attr_vals]`` is non-zero the interior is drawn too:
    a single residual-coloured rectangle in Pearson mode, otherwise one
    stacked rectangle per class value (plus optional prior and subset
    bars), and a tooltip is set on the outer rectangle.

    `condition` is the HTML prefix of the tooltip; `attr_vals` is the
    "-"-joined key used to look up counts in ``conditionaldict``.
    """
    # Fresh lists per call: the values are stored in self.areas, so a
    # shared default list would be aliased across every stored area.
    if used_attrs is None:
        used_attrs = []
    if used_vals is None:
        used_vals = []

    area_index = len(self.areas)
    if x0 == x1:
        x1 += 1
    if y0 == y1:
        y1 += 1

    # rectangles of width and height 1 are not shown - increase
    if x1 - x0 + y1 - y0 == 2:
        y1 += 1

    if class_var and class_var.is_discrete:
        colors = [QColor(*col) for col in class_var.colors]
    else:
        colors = None

    def select_area(_, ev):
        """Forward a click on any part of this cell to the widget."""
        self.select_area(area_index, ev)

    def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
        """Create a clickable CanvasRectangle; brush defaults to the pen."""
        if pen_color is None:
            return CanvasRectangle(
                self.canvas, x, y, w, h, z=z, onclick=select_area, **args)
        if brush_color is None:
            brush_color = pen_color
        return CanvasRectangle(
            self.canvas, x, y, w, h, pen_color, brush_color, z=z,
            onclick=select_area, **args)

    def line(x1, y1, x2, y2):
        """Add a white separator line (pen width 2, z-value 30)."""
        r = QGraphicsLineItem(x1, y1, x2, y2, None)
        self.canvas.addItem(r)
        r.setPen(QPen(Qt.white, 2))
        r.setZValue(30)

    outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
    self.areas.append((used_attrs, used_vals, outer_rect))
    # Empty cells get only the outer rectangle.
    if not conditionaldict[attr_vals]:
        return

    if self.interior_coloring == self.PEARSON:
        # Expected count: total times the product of the marginal
        # proportions of the used values.
        s = sum(apriori_dists[0])
        expected = s * reduce(
            mul,
            (apriori_dists[i][used_vals[i]] / float(s)
             for i in range(len(used_vals))))
        actual = conditionaldict[attr_vals]
        pearson = (actual - expected) / sqrt(expected)
        if pearson == 0:
            ind = 0
        else:
            # Colour index grows with log2 of |residual|, clamped to 0..3.
            ind = max(0, min(int(log(abs(pearson), 2)), 3))
        color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
        rect(x0, y0, x1 - x0, y1 - y0, -20, color)
        outer_rect.setToolTip(
            condition + "<hr/>" +
            "Expected instances: %.1f<br>"
            "Actual instances: %d<br>"
            "Standardized (Pearson) residual: %.1f" %
            (expected, conditionaldict[attr_vals], pearson))
    else:
        cls_values = get_variable_values_sorted(class_var)
        prior = get_distribution(data, class_var.name)
        total = 0
        for i, value in enumerate(cls_values):
            val = conditionaldict[attr_vals + "-" + value]
            if val == 0:
                continue
            if i == len(cls_values) - 1:
                # The last class takes whatever height remains.
                v = y1 - y0 - total
            else:
                v = ((y1 - y0) * val) / conditionaldict[attr_vals]
            rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
            total += v

        if self.use_boxes and \
                abs(x1 - x0) > bar_width and \
                abs(y1 - y0) > bar_width:
            # Bar on the left edge showing the prior class distribution.
            total = 0
            line(x0 + bar_width, y0, x0 + bar_width, y1)
            n = sum(prior)
            for i, (val, color) in enumerate(zip(prior, colors)):
                if i == len(prior) - 1:
                    h = y1 - y0 - total
                else:
                    h = (y1 - y0) * val / n
                rect(x0, y0 + total, bar_width, h, 20, color)
                total += h

        if conditionalsubsetdict:
            if conditionalsubsetdict[attr_vals]:
                counts = [conditionalsubsetdict[attr_vals + "-" + val]
                          for val in cls_values]
                if sum(counts) == 1:
                    # Exactly one subset instance: dashed frame in the
                    # colour of its class.
                    rect(x0 - 2, y0 - 2, x1 - x0 + 5, y1 - y0 + 5, -550,
                         colors[counts.index(1)], Qt.white,
                         penWidth=2, penStyle=Qt.DashLine)
                if self.subset_data is not None:
                    # Bar on the right edge showing the subset's classes.
                    line(x1 - bar_width, y0, x1 - bar_width, y1)
                    total = 0
                    n = conditionalsubsetdict[attr_vals]
                    if n:
                        for i, (cls, color) in \
                                enumerate(zip(cls_values, colors)):
                            val = conditionalsubsetdict[
                                attr_vals + "-" + cls]
                            if val == 0:
                                continue
                            if i == len(prior) - 1:
                                v = y1 - y0 - total
                            else:
                                v = ((y1 - y0) * val) / n
                            rect(x1 - bar_width, y0 + total,
                                 bar_width, v, 15, color)
                            total += v

        actual = [conditionaldict[attr_vals + "-" + cls_values[i]]
                  for i in range(len(prior))]
        n_actual = sum(actual)
        if n_actual > 0:
            apriori = [prior[key] for key in cls_values]
            n_apriori = sum(apriori)
            text = "<br/>".join(
                "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                (cls, act, 100.0 * act / n_actual,
                 apr / n_apriori * n_actual, 100.0 * apr / n_apriori
                 )
                for cls, act, apr in zip(cls_values, actual, apriori
                                         ))
        else:
            text = ""
        # The lines are joined with "<br/>", so there is no trailing
        # separator to strip; slicing off four characters used to cut
        # the end of the last class line.
        outer_rect.setToolTip(
            "{}<hr>Instances: {}<br><br>{}".format(
                condition, n_actual, text))
def fit_storage(self, data):
    """Fit on `data` and return a MeanModel.

    The model is built from the distribution of the data's class
    variable, as computed by ``distribution.get_distribution``.
    """
    class_distribution = distribution.get_distribution(
        data, data.domain.class_var)
    return MeanModel(class_distribution)
def fit_storage(self, data):
    """Fit on `data` and return a MeanPredictor.

    The predictor receives a domain that has no attributes and the
    data's class variable as its only class, together with the
    distribution of that class variable.
    """
    class_distribution = distribution.get_distribution(
        data, data.domain.class_var)
    # Domain with an empty attribute tuple and a single class variable.
    class_only_domain = Orange.data.Domain((), (data.domain.class_var,))
    return MeanPredictor(class_only_domain, class_distribution)
def column_imputer_modus(variable, table):
    """Build an imputer that fills `variable` with its modus in `table`.

    Computes the distribution of `variable` over `table` and passes the
    distribution's ``modus()`` as the default value to
    ``column_imputer_defaults``.

    Returns the result of ``column_imputer_defaults`` (presumably an
    imputer model - confirm against that helper).
    """
    stat = distribution.get_distribution(table, variable)
    # The result used to be computed and discarded, so callers always
    # received None.
    return column_imputer_defaults(variable, table, stat.modus())
def fit_storage(self, data):
    """Fit on `data` and return a MeanPredictor.

    The predictor receives a domain that has no attributes and the
    data's class variable as its only class, together with the
    distribution of that class variable.
    """
    class_distribution = distribution.get_distribution(
        data, data.domain.class_var)
    # Domain with an empty attribute tuple and a single class variable.
    class_only_domain = Orange.data.Domain((), (data.domain.class_var,))
    return MeanPredictor(class_only_domain, class_distribution)
def update_graph(self):
    """Clear the canvas and redraw rectangles, labels and legend.

    The drawing area is split recursively, one attribute from
    ``self.get_attr_list()`` per level, alternating between the x and the y
    axis.  The extent of each part is proportional to the count stored in
    ``conditionaldict`` for the corresponding combination of attribute
    values (keys are values joined with ``"-"``).  Leaf rectangles are
    coloured either by the standardized (Pearson) residual or by the class
    distribution, depending on ``self.interior_coloring``.

    Nothing is drawn when there is no data, when the data is empty (a
    warning is raised instead) or when the canvas view is too small.
    """
    spacing = self.SPACING
    bar_width = self.BAR_WIDTH

    def draw_data(attr_list, x0_x1, y0_y1, side, condition,
                  total_attrs, used_attrs=None, used_vals=None,
                  attr_vals=""):
        """Recursively split the area by the values of ``attr_list[0]``.

        ``side`` is the recursion depth (0-3); even sides split along the
        x axis, odd sides along the y axis.  ``condition`` is the tooltip
        text accumulated so far; ``used_attrs``/``used_vals`` are the
        attributes and values fixed by the enclosing levels and
        ``attr_vals`` is the matching ``conditionaldict`` key.
        """
        # None sentinels instead of mutable default arguments: the lists
        # end up stored in self.areas and must not be shared.
        if used_attrs is None:
            used_attrs = []
        if used_vals is None:
            used_vals = []
        x0, x1 = x0_x1
        y0, y1 = y0_y1
        if conditionaldict[attr_vals] == 0:
            add_rect(x0, x1, y0, y1, "",
                     used_attrs, used_vals, attr_vals=attr_vals)
            # store coordinates for later drawing of labels
            draw_text(side, attr_list[0], (x0, x1), (y0, y1), total_attrs,
                      used_attrs, used_vals, attr_vals)
            return

        attr = attr_list[0]
        # how much smaller rectangles do we draw
        edge = len(attr_list) * spacing
        values = get_variable_values_sorted(data.domain[attr])
        if side % 2:
            values = values[::-1]  # reverse names if necessary

        if side % 2 == 0:  # we are drawing on the x axis
            # remove the space needed for separating different attr. values
            whole = max(0, (x1 - x0) - edge * (len(values) - 1))
            # No room left for the parts: shrink the gaps instead.  With a
            # single value there are no gaps (and the division would be by
            # zero), so the edge is left alone.
            if whole == 0 and len(values) > 1:
                edge = (x1 - x0) / float(len(values) - 1)
        else:  # we are drawing on the y axis
            whole = max(0, (y1 - y0) - edge * (len(values) - 1))
            if whole == 0 and len(values) > 1:
                edge = (y1 - y0) / float(len(values) - 1)

        if attr_vals == "":
            counts = [conditionaldict[val] for val in values]
        else:
            counts = [conditionaldict[attr_vals + "-" + val]
                      for val in values]
        total = sum(counts)

        # if we are visualizing the third attribute and the first attribute
        # has the last value, we have to reverse the order in which the
        # boxes will be drawn otherwise, if the last cell, nearest to the
        # labels of the fourth attribute, is empty, we wouldn't be able to
        # position the labels
        valrange = list(range(len(values)))
        if len(attr_list + used_attrs) == 4 and len(used_attrs) == 2:
            attr1values = get_variable_values_sorted(
                data.domain[used_attrs[0]])
            if used_vals[0] == attr1values[-1]:
                valrange = valrange[::-1]

        for i in valrange:
            # The i-th part starts after i gaps and after the share of
            # ``whole`` taken by the preceding values.
            start = i * edge + whole * float(sum(counts[:i]) / total)
            end = i * edge + whole * float(sum(counts[:i + 1]) / total)
            val = values[i]
            htmlval = to_html(val)
            if attr_vals != "":
                newattrvals = attr_vals + "-" + val
            else:
                newattrvals = val

            tooltip = condition + 4 * " " + attr + \
                ": <b>" + htmlval + "</b><br>"
            attrs = used_attrs + [attr]
            vals = used_vals + [val]
            common_args = attrs, vals, newattrvals
            if side % 2 == 0:  # if we are moving horizontally
                if len(attr_list) == 1:
                    add_rect(x0 + start, x0 + end, y0, y1,
                             tooltip, *common_args)
                else:
                    draw_data(attr_list[1:], (x0 + start, x0 + end),
                              (y0, y1), side + 1,
                              tooltip, total_attrs, *common_args)
            else:
                if len(attr_list) == 1:
                    add_rect(x0, x1, y0 + start, y0 + end,
                             tooltip, *common_args)
                else:
                    draw_data(attr_list[1:], (x0, x1),
                              (y0 + start, y0 + end), side + 1,
                              tooltip, total_attrs, *common_args)

        draw_text(side, attr_list[0], (x0, x1), (y0, y1),
                  total_attrs, used_attrs, used_vals, attr_vals)

    def draw_text(side, attr, x0_x1, y0_y1,
                  total_attrs, used_attrs, used_vals, attr_vals):
        """Draw the value labels and the name of ``attr`` on ``side``.

        Each side is labelled at most once (tracked in ``drawn_sides``).
        When the current cell is empty, its coordinates are remembered in
        ``draw_positions`` and reused by a later, non-empty cell.
        """
        x0, x1 = x0_x1
        y0, y1 = y0_y1
        if side in drawn_sides:
            return

        # the text on the right will be drawn when we are processing
        # visualization of the last value of the first attribute
        if side == 3:
            attr1values = \
                get_variable_values_sorted(data.domain[used_attrs[0]])
            if used_vals[0] != attr1values[-1]:
                return

        if not conditionaldict[attr_vals]:
            if side not in draw_positions:
                draw_positions[side] = (x0, x1, y0, y1)
            return
        else:
            if side in draw_positions:
                # restore the positions of attribute values and name
                (x0, x1, y0, y1) = draw_positions[side]

        drawn_sides.add(side)

        values = get_variable_values_sorted(data.domain[attr])
        if side % 2:
            values = values[::-1]

        spaces = spacing * (total_attrs - side) * (len(values) - 1)
        width = x1 - x0 - spaces * (side % 2 == 0)
        height = y1 - y0 - spaces * (side % 2 == 1)

        # calculate position of first attribute
        currpos = 0

        if attr_vals == "":
            counts = [conditionaldict.get(val, 1) for val in values]
        else:
            counts = [conditionaldict.get(attr_vals + "-" + val, 1)
                      for val in values]
        total = sum(counts)
        if total == 0:
            # All cells are empty: space the labels evenly.
            counts = [1] * len(values)
            total = sum(counts)

        # Alignment of the labels, indexed by side.
        aligns = [Qt.AlignTop | Qt.AlignHCenter,
                  Qt.AlignRight | Qt.AlignVCenter,
                  Qt.AlignBottom | Qt.AlignHCenter,
                  Qt.AlignLeft | Qt.AlignVCenter]
        align = aligns[side]
        for val, count in zip(values, counts):
            perc = count / float(total)
            # Values that do not occur at all get no label, but still
            # advance the position below.
            if distributiondict[val] != 0:
                if side == 0:
                    CanvasText(self.canvas, str(val),
                               x0 + currpos + width * 0.5 * perc,
                               y1 + self.ATTR_VAL_OFFSET, align)
                elif side == 1:
                    CanvasText(self.canvas, str(val),
                               x0 - self.ATTR_VAL_OFFSET,
                               y0 + currpos + height * 0.5 * perc, align)
                elif side == 2:
                    CanvasText(self.canvas, str(val),
                               x0 + currpos + width * perc * 0.5,
                               y0 - self.ATTR_VAL_OFFSET, align)
                else:
                    CanvasText(self.canvas, str(val),
                               x1 + self.ATTR_VAL_OFFSET,
                               y0 + currpos + height * 0.5 * perc, align)

            if side % 2 == 0:
                currpos += perc * width + spacing * (total_attrs - side)
            else:
                currpos += perc * height + spacing * (total_attrs - side)

        # Finally, the attribute name itself.
        if side == 0:
            CanvasText(
                self.canvas, attr,
                x0 + (x1 - x0) / 2,
                y1 + self.ATTR_VAL_OFFSET + self.ATTR_NAME_OFFSET,
                align, bold=1)
        elif side == 1:
            CanvasText(
                self.canvas, attr,
                x0 - max_ylabel_w1 - self.ATTR_VAL_OFFSET,
                y0 + (y1 - y0) / 2,
                align, bold=1, vertical=True)
        elif side == 2:
            CanvasText(
                self.canvas, attr,
                x0 + (x1 - x0) / 2,
                y0 - self.ATTR_VAL_OFFSET - self.ATTR_NAME_OFFSET,
                align, bold=1)
        else:
            CanvasText(
                self.canvas, attr,
                x1 + max_ylabel_w2 + self.ATTR_VAL_OFFSET,
                y0 + (y1 - y0) / 2,
                align, bold=1, vertical=True)

    def add_rect(x0, x1, y0, y1, condition="",
                 used_attrs=None, used_vals=None, attr_vals=""):
        """Add one selectable leaf rectangle, its colouring and tooltip.

        The rectangle is registered in ``self.areas`` together with the
        attributes and values that lead to it.
        """
        # None sentinels instead of mutable default arguments (see
        # draw_data).
        if used_attrs is None:
            used_attrs = []
        if used_vals is None:
            used_vals = []
        area_index = len(self.areas)
        if x0 == x1:
            x1 += 1
        if y0 == y1:
            y1 += 1

        # rectangles of width and height 1 are not shown - increase
        if x1 - x0 + y1 - y0 == 2:
            y1 += 1

        if class_var and class_var.is_discrete:
            colors = [QColor(*col) for col in class_var.colors]
        else:
            colors = None

        def select_area(_, ev):
            self.select_area(area_index, ev)

        def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
            """Create a clickable CanvasRectangle; brush defaults to pen."""
            if pen_color is None:
                return CanvasRectangle(
                    self.canvas, x, y, w, h, z=z, onclick=select_area,
                    **args)
            if brush_color is None:
                brush_color = pen_color
            return CanvasRectangle(
                self.canvas, x, y, w, h, pen_color, brush_color, z=z,
                onclick=select_area, **args)

        def line(x1, y1, x2, y2):
            """Add a white, 2-wide separator line to the canvas."""
            r = QGraphicsLineItem(x1, y1, x2, y2, None)
            self.canvas.addItem(r)
            r.setPen(QPen(Qt.white, 2))
            r.setZValue(30)

        outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
        self.areas.append((used_attrs, used_vals, outer_rect))
        if not conditionaldict[attr_vals]:
            return

        if self.interior_coloring == self.PEARSON:
            # Expected count: the total times the product of the
            # individual attribute-value frequencies.
            s = sum(apriori_dists[0])
            expected = s * reduce(
                mul,
                (apriori_dists[i][used_vals[i]] / float(s)
                 for i in range(len(used_vals))))
            actual = conditionaldict[attr_vals]
            pearson = (actual - expected) / sqrt(expected)
            if pearson == 0:
                ind = 0
            else:
                # Colour index grows with log2 of the residual, clipped
                # to 0..3.
                ind = max(0, min(int(log(abs(pearson), 2)), 3))
            color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
            rect(x0, y0, x1 - x0, y1 - y0, -20, color)
            outer_rect.setToolTip(
                condition + "<hr/>" +
                "Expected instances: %.1f<br>"
                "Actual instances: %d<br>"
                "Standardized (Pearson) residual: %.1f" %
                (expected, conditionaldict[attr_vals], pearson))
        else:
            cls_values = get_variable_values_sorted(class_var)
            prior = get_distribution(data, class_var.name)
            # Stack one band per class value; the last class takes the
            # remaining height so that the bands fill the rectangle.
            total = 0
            for i, value in enumerate(cls_values):
                val = conditionaldict[attr_vals + "-" + value]
                if val == 0:
                    continue
                if i == len(cls_values) - 1:
                    v = y1 - y0 - total
                else:
                    v = ((y1 - y0) * val) / conditionaldict[attr_vals]
                rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
                total += v

            if self.use_boxes and \
                    abs(x1 - x0) > bar_width and \
                    abs(y1 - y0) > bar_width:
                # Bar on the left edge showing the prior distribution.
                total = 0
                line(x0 + bar_width, y0, x0 + bar_width, y1)
                n = sum(prior)
                for i, (val, color) in enumerate(zip(prior, colors)):
                    if i == len(prior) - 1:
                        h = y1 - y0 - total
                    else:
                        h = (y1 - y0) * val / n
                    rect(x0, y0 + total, bar_width, h, 20, color)
                    total += h

            if conditionalsubsetdict:
                if conditionalsubsetdict[attr_vals]:
                    counts = [conditionalsubsetdict[attr_vals + "-" + val]
                              for val in cls_values]
                    if sum(counts) == 1:
                        # A single subset instance falls into this cell:
                        # outline the cell with a dashed frame.
                        rect(x0 - 2, y0 - 2, x1 - x0 + 5, y1 - y0 + 5, -550,
                             colors[counts.index(1)], Qt.white,
                             penWidth=2, penStyle=Qt.DashLine)
                    if self.subset_data is not None:
                        # Bar on the right edge showing the subset's
                        # class distribution.
                        line(x1 - bar_width, y0, x1 - bar_width, y1)
                        total = 0
                        n = conditionalsubsetdict[attr_vals]
                        if n:
                            for i, (cls, color) in \
                                    enumerate(zip(cls_values, colors)):
                                val = conditionalsubsetdict[
                                    attr_vals + "-" + cls]
                                if val == 0:
                                    continue
                                if i == len(prior) - 1:
                                    v = y1 - y0 - total
                                else:
                                    v = ((y1 - y0) * val) / n
                                rect(x1 - bar_width, y0 + total, bar_width,
                                     v, 15, color)
                                total += v

            actual = [conditionaldict[attr_vals + "-" + cls_values[i]]
                      for i in range(len(prior))]
            n_actual = sum(actual)
            if n_actual > 0:
                apriori = [prior[key] for key in cls_values]
                n_apriori = sum(apriori)
                text = "<br/>".join(
                    "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                    (cls, act, 100.0 * act / n_actual,
                     apr / n_apriori * n_actual, 100.0 * apr / n_apriori)
                    for cls, act, apr in zip(cls_values, actual, apriori))
            else:
                text = ""
            # ``join`` leaves no trailing separator, so the text is used
            # whole; slicing off four characters truncated the last line.
            outer_rect.setToolTip(
                "{}<hr>Instances: {}<br><br>{}".format(
                    condition, n_actual, text))

    def draw_legend(x0_x1, y0_y1):
        """Draw the colour legend centred below the drawing area."""
        x0, x1 = x0_x1
        y0, y1 = y0_y1
        if self.interior_coloring == self.PEARSON:
            names = ["<-8", "-8:-4", "-4:-2", "-2:2", "2:4", "4:8", ">8",
                     "Residuals:"]
            colors = self.RED_COLORS[::-1] + self.BLUE_COLORS[1:]
        else:
            names = get_variable_values_sorted(class_var) + \
                    [class_var.name + ":"]
            colors = [QColor(*col) for col in class_var.colors]

        # The last entry is the legend title; the rest label the colours.
        names = [CanvasText(self.canvas, name, alignment=Qt.AlignVCenter)
                 for name in names]
        totalwidth = sum(text.boundingRect().width() for text in names)

        # compute the x position of the center of the legend
        y = y1 + self.ATTR_NAME_OFFSET + self.ATTR_VAL_OFFSET + 35
        distance = 30
        startx = (x0 + x1) / 2 - (totalwidth + (len(names)) * distance) / 2

        names[-1].setPos(startx + 15, y)
        names[-1].show()
        xoffset = names[-1].boundingRect().width() + distance

        size = 8

        for i in range(len(names) - 1):
            if self.interior_coloring == self.PEARSON:
                edgecolor = Qt.black
            else:
                edgecolor = colors[i]

            CanvasRectangle(self.canvas, startx + xoffset, y - size / 2,
                            size, size, edgecolor, colors[i])
            names[i].setPos(startx + xoffset + 10, y)
            xoffset += distance + names[i].boundingRect().width()

    self.canvas.clear()
    self.areas = []

    data = self.discrete_data
    if data is None:
        return
    subset = self.subset_data
    attr_list = self.get_attr_list()
    class_var = data.domain.class_var
    if class_var:
        sql = isinstance(data, SqlTable)
        name = not sql and data.name
        # save class_var because it is removed in the next line
        data = data[:, attr_list + [class_var]]
        data.domain.class_var = class_var
        if not sql:
            data.name = name
    else:
        data = data[:, attr_list]
    # TODO: check this
    # data = Preprocessor_dropMissing(data)
    if len(data) == 0:
        self.Warning.no_valid_data()
        return
    else:
        self.Warning.no_valid_data.clear()

    if self.interior_coloring == self.PEARSON:
        apriori_dists = [get_distribution(data, attr) for attr in attr_list]
    else:
        apriori_dists = []

    def get_max_label_width(attr):
        """Return the widest value label of ``attr`` as an int."""
        values = get_variable_values_sorted(data.domain[attr])
        maxw = 0
        for val in values:
            t = CanvasText(self.canvas, val, 0, 0, bold=0, show=False)
            maxw = max(int(t.boundingRect().width()), maxw)
        return maxw

    # get the maximum width of rectangle
    xoff = 20
    width = 20
    if len(attr_list) > 1:
        text = CanvasText(self.canvas, attr_list[1], bold=1, show=0)
        max_ylabel_w1 = min(get_max_label_width(attr_list[1]), 150)
        width = 5 + text.boundingRect().height() + \
            self.ATTR_VAL_OFFSET + max_ylabel_w1
        xoff = width
        if len(attr_list) == 4:
            text = CanvasText(self.canvas, attr_list[3], bold=1, show=0)
            max_ylabel_w2 = min(get_max_label_width(attr_list[3]), 150)
            width += text.boundingRect().height() + \
                self.ATTR_VAL_OFFSET + max_ylabel_w2 - 10

    # get the maximum height of rectangle
    height = 100
    yoff = 45
    square_size = min(self.canvas_view.width() - width - 20,
                      self.canvas_view.height() - height - 20)

    if square_size < 0:
        return  # canvas is too small to draw rectangles
    self.canvas_view.setSceneRect(
        0, 0, self.canvas_view.width(), self.canvas_view.height())

    # Shared state for draw_text: which sides are already labelled and
    # where to label sides whose first visited cell was empty.
    drawn_sides = set()
    draw_positions = {}

    conditionaldict, distributiondict = \
        get_conditional_distribution(data, attr_list)
    conditionalsubsetdict = None
    if subset:
        conditionalsubsetdict, _ = \
            get_conditional_distribution(subset, attr_list)

    # draw rectangles
    draw_data(
        attr_list, (xoff, xoff + square_size), (yoff, yoff + square_size),
        0, "", len(attr_list))
    draw_legend((xoff, xoff + square_size), (yoff, yoff + square_size))
    self.update_selection_rects()