def grid_bin(data, xvar, yvar, xbins, ybins, zvar=None):
    """
    Bin `data` on a 2D grid and compute the per-cell contingencies.

    :param data: Data table to bin.
    :param xvar: Variable binned along the x axis.
    :param yvar: Variable binned along the y axis.
    :param xbins: Bin edges for `xvar`. The first and last entries are the
        outer bounds; the entries in between are the cut points.
    :param ybins: Bin edges for `yvar`, laid out like `xbins`.
    :param zvar: Optional third variable. When it is discrete, one
        contingency is computed per value of `zvar` and the results are
        stacked depth-wise.
    :return: ``Tree(xbins, ybins, contingencies, None)``.
    """
    x_disc = Discretizer.create_discretized_var(xvar, xbins[1:-1])
    y_disc = Discretizer.create_discretized_var(yvar, ybins[1:-1])

    x_min, x_max = xbins[0], xbins[-1]
    y_min, y_max = ybins[0], ybins[-1]

    query_vars = [x_disc, y_disc]
    if zvar is not None:
        query_vars.append(zvar)
    querydomain = Orange.data.Domain(query_vars)

    def interval_filter(var, low, high):
        # Keep rows whose `var` value lies between `low` and `high`.
        return Orange.data.filter.Values(
            [Orange.data.filter.FilterContinuous(
                var, max=high, min=low,
                oper=Orange.data.filter.FilterContinuous.Between)]
        )

    def value_filter(var, val):
        # Keep rows whose discrete `var` equals `val`.
        return Orange.data.filter.Values(
            [Orange.data.filter.FilterDiscrete(var, [val])]
        )

    def filters_join(filters):
        # Merge the conditions of several filters into a single filter.
        return Orange.data.filter.Values(
            [cond for f in filters for cond in f.conditions]
        )

    inf_bounds = np.isinf([x_min, x_max, y_min, y_max])
    if not all(inf_bounds):
        range_filter = filters_join(
            [interval_filter(xvar, x_min, x_max),
             interval_filter(yvar, y_min, y_max)])
        subset = range_filter(data)
    else:
        # All bounds are infinite: no need to filter the data.
        subset = data

    # The projection onto the query domain does not depend on the z value,
    # so it is computed once.
    projected = subset.from_table(querydomain, subset)

    # `zvar` is optional; only split by its values when it is given and
    # discrete.
    if zvar is not None and zvar.is_discrete:
        contingencies = np.dstack([
            contingency.get_contingency(
                value_filter(zvar, val)(projected),
                col_variable=y_disc, row_variable=x_disc
            )
            for val in zvar.values
        ])
    else:
        contingencies = contingency.get_contingency(
            projected, col_variable=y_disc, row_variable=x_disc
        )

    contingencies = np.asarray(contingencies)
    return Tree(xbins, ybins, contingencies, None)
def burt_table(data, variables):
    """
    Build a 'Burt table' (cross-tabulation of all values) for `variables`.

    Return an ordered list of (variable, value) pairs and a numpy.ndarray
    holding the contingency counts for every pair of values.

    :param Orange.data.Table data: Data table.
    :param variables: List of variables (discrete).
    :type variables: list of Orange.data.DiscreteVariable
    """
    values = [(var, value) for var in variables for value in var.values]
    size = len(values)
    table = np.zeros((size, size))

    counts = [len(attr.values) for attr in variables]
    offsets = np.r_[0, np.cumsum(counts)]

    # Only the lower triangle of variable pairs is computed; the upper
    # triangle is filled with the transposed block.
    for i, row_var in enumerate(variables):
        rows = slice(offsets[i], offsets[i] + counts[i])
        for j, col_var in enumerate(variables[:i + 1]):
            cols = slice(offsets[j], offsets[j] + counts[j])
            cm = contingency.get_contingency(data, col_var, row_var)
            table[rows, cols] += cm
            if i != j:
                table[cols, rows] += cm.T

    return values, table
def compute_score(attr):
    """
    Score `attr` against the grouping variable; lower is better.

    Returns the p-value of a one-way ANOVA (continuous `attr`) or of a
    chi-square test (otherwise). Returns 3 for the grouping variable
    itself and 2 whenever the test cannot be computed or yields NaN.
    """
    if attr is group_var:
        return 3

    if attr.is_continuous:
        # One-way ANOVA
        col = data.get_column_view(attr)[0].astype(float)
        groups = []
        for i in range(n_groups):
            member = col[group_col == i]
            member = member[~np.isnan(member)]
            if len(member):
                groups.append(member)
        if len(groups) > 1:
            p = f_oneway(*groups)[1]
        else:
            p = 2
    else:
        # Chi-square with the given distribution into groups
        # (see degrees of freedom in computation of the p-value)
        if not attr.values or not group_var.values:
            return 2
        observed = np.array(
            contingency.get_contingency(data, group_var, attr))
        # Drop empty rows first, then empty columns.
        nonempty_rows = observed.sum(axis=1) != 0
        observed = observed[nonempty_rows, :]
        nonempty_cols = observed.sum(axis=0) != 0
        observed = observed[:, nonempty_cols]
        if min(observed.shape) < 2:
            return 2
        row_totals = observed.sum(axis=1)
        col_totals = observed.sum(axis=0)
        expected = np.outer(row_totals, col_totals) / np.sum(observed)
        p = chisquare(observed.ravel(), f_exp=expected.ravel(),
                      ddof=n_groups - 1)[1]

    return 2 if math.isnan(p) else p
def compute_box_data(self):
    """
    Compute the statistics and labels for the current attribute.

    Sets `is_continuous`, `dist`, `conts`, `label_txts_all`, and, for a
    continuous attribute, `stats`. Finally, `stats` and `label_txts` are
    restricted to entries whose `n` is positive. Does nothing when no
    attribute is selected.
    """
    attr = self.attribute
    if not attr:
        return

    dataset = self.dataset
    self.is_continuous = attr.is_continuous

    # Nothing to show: no data, or a discrete attribute / group variable
    # that has no values.
    if (dataset is None
            or (not self.is_continuous and not attr.values)
            or (self.group_var and not self.group_var.values)):
        self.stats = self.dist = self.conts = []
        return

    if not self.group_var:
        self.dist = distribution.get_distribution(dataset, attr)
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist, attr, None)]
        self.label_txts_all = [""]
    else:
        self.dist = []
        self.conts = contingency.get_contingency(
            dataset, attr, self.group_var)
        if self.is_continuous:
            self.stats = [BoxData(cont, attr, i, self.group_var)
                          for i, cont in enumerate(self.conts)]
        self.label_txts_all = self.group_var.values

    self.label_txts = [
        txts for stat, txts in zip(self.stats, self.label_txts_all)
        if stat.n > 0]
    self.stats = [stat for stat in self.stats if stat.n > 0]
def test_mixedtype_metas(self):
    """Contingency with a meta row variable, before and after adding NaNs."""
    import Orange

    zoo = Orange.data.Table("zoo")
    zoo_domain = zoo.domain
    dom = Orange.data.Domain(
        zoo_domain.attributes,
        zoo_domain.class_var,
        zoo_domain.metas + zoo_domain.attributes[:2])
    t = Orange.data.Table(dom, zoo)
    meta_var = t.domain.metas[1]

    cont = contingency.get_contingency(zoo, 2, meta_var)
    assert_dist_equal(cont["1"], [38, 5])
    assert_dist_equal(cont, [[4, 54], [38, 5]])

    # Introduce one unknown in the row variable and one in the column.
    nan = float("nan")
    zoo[25][meta_var] = nan
    zoo[0][2] = nan

    cont = contingency.get_contingency(zoo, 2, meta_var)
    assert_dist_equal(cont["1"], [37, 5])
    assert_dist_equal(cont, [[4, 53], [37, 5]])
    np.testing.assert_almost_equal(cont.unknowns, [0, 1])
    self.assertEqual(cont.unknown_rows, 1)
def _setup(self):
    """
    Rebuild the plots for the currently selected variable.

    Clears both plots and the legend, resolves the selected variable and
    the group variable from their models, optionally discretizes the
    data, and then displays either a contingency or a distribution.
    """
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    selected = self.variable_idx
    self.var = self.cvar = None
    if selected >= 0:
        self.var = self.varmodel[selected]
    # Only a group index above 0 selects a group variable.
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]

    table = self.data
    self._setup_smoothing()
    if self.var is None:
        return

    if self.disc_cont:
        columns = (self.var, self.cvar) if self.cvar else self.var
        table = self.data[:, columns]
        binning = Orange.preprocess.discretize.EqualWidth(
            n=self.bins[self.smoothing_index])
        table = Orange.preprocess.Discretize(table, method=binning)
        self.var = table.domain.variables[0]

    self.set_left_axis_name()
    self.enable_disable_rel_freq()

    if not self.cvar:
        self.distributions = distribution.get_distribution(table, self.var)
        self.display_distribution()
    else:
        self.contingencies = contingency.get_contingency(
            table, self.var, self.cvar)
        self.display_contingency()
    self.plot.autoRange()
def contingency_table(data, columns, rows):
    """
    Return the contingency of `columns` by `rows` as a data table.

    Each value of `columns` becomes a continuous attribute (shown with no
    decimals); the values of `rows` are stored in a string meta attribute
    named after `rows`.
    """
    counts = contingency.get_contingency(data, columns, rows)
    row_name_var = StringVariable(rows.name)
    row_labels = [[str(val)] for val in rows.values]
    count_vars = [ContinuousVariable(val, number_of_decimals=0)
                  for val in columns.values]
    domain = Domain(count_vars, metas=[row_name_var])
    return Table(domain, counts, metas=row_labels)
def test_compute_contingency_invalid(self):
    """An out-of-range class index must raise IndexError."""
    n_values = 1024
    n_rows = 20
    rstate = np.random.RandomState(0xFFFF)

    X = data.ContinuousVariable("X")
    C = data.DiscreteVariable(
        "C", values=["C{}".format(i) for i in range(1, n_values + 1)])
    domain = data.Domain([X], [C])

    # Draw order matters for reproducibility: features first, then classes.
    x_col = rstate.uniform(size=(n_rows, 1)).round(1)
    y_col = rstate.randint(0, n_values, size=(n_rows, 1))
    d = data.Table.from_numpy(domain, x_col, y_col)

    c = contingency.get_contingency(d, X, C)
    self.assertEqual(c.counts.shape[0], n_values)

    # One past the last valid class index.
    d.Y[5] = n_values
    with self.assertRaises(IndexError):
        contingency.get_contingency(d, X, C)
def fit_storage(self, table):
    """
    Fit a naive Bayes model with Laplacian smoothing.

    :param table: Data storage with discrete variables only.
    :return: ``NaiveBayesModel`` built from the log conditional
        probabilities, the class probabilities and the table's domain.
    :raises TypeError: if `table` is not an Orange.data.Storage.
    :raises NotImplementedError: if any variable is not discrete.
    :raises ValueError: if no class value appears in the data.
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    for var in table.domain.variables:
        if not var.is_discrete:
            raise NotImplementedError(
                "Only discrete variables are supported.")

    cont = contingency.get_contingencies(table)
    class_cont = contingency.get_contingency(table, table.domain.class_var)
    class_freq = np.array(np.diag(class_cont))
    nclss = (class_freq != 0).sum()
    if not nclss:
        raise ValueError("Data has no defined target values")

    # Laplacian smoothing considers only classes that appear in the data,
    # in part to avoid cases where the probabilities are affected by empty
    # (or completely spurious) classes that appear because of Orange's reuse
    # of variables. See GH-2943.
    # The corresponding elements of class_probs are set to zero only after
    # mock non-zero values are used in computation of log_cont_prob to
    # prevent division by zero.
    class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)

    log_cont_prob = []
    for c in cont:
        counts = np.array(c)
        smoothed = (counts + 1) / (np.sum(counts, axis=0)[None, :] + nclss)
        log_cont_prob.append(np.log(smoothed / class_prob[:, None]))

    class_prob[class_freq == 0] = 0
    return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
def _setup(self):
    """
    Rebuild the plots for the currently selected variable.

    Clears both plots and the legend, resolves the selected variable and
    group variable, refills the probability combo box for the group
    variable, optionally discretizes the data, and then displays either
    a contingency or a distribution.
    """
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    selected = self.variable_idx
    self.var = self.cvar = None
    if selected >= 0:
        self.var = self.varmodel[selected]
    # Only a group index above 0 selects a group variable.
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]
        combo = self.cb_prob
        combo.clear()
        combo.addItem("(None)")
        combo.addItems(self.cvar.values)
        combo.addItem("(All)")
        # Clamp the stored choice to the items just added.
        upper = len(self.cvar.values) + 1
        self.show_prob = min(max(self.show_prob, 0), upper)

    table = self.data
    self._setup_smoothing()
    if self.var is None:
        return

    if self.disc_cont:
        columns = (self.var, self.cvar) if self.cvar else self.var
        table = self.data[:, columns]
        binning = Orange.preprocess.discretize.EqualWidth(
            n=self.bins[self.smoothing_index])
        table = Orange.preprocess.Discretize(
            table, method=binning, remove_const=False)
        self.var = table.domain[0]

    self.set_left_axis_name()
    self.enable_disable_rel_freq()

    if not self.cvar:
        self.distributions = distribution.get_distribution(table, self.var)
        self.display_distribution()
    else:
        self.contingencies = contingency.get_contingency(
            table, self.var, self.cvar)
        self.display_contingency()
    self.plot.autoRange()
def fit_storage(self, table):
    """
    Fit a naive Bayes model from raw contingencies.

    :param table: Data storage with discrete variables only.
    :return: ``NaiveBayesModel`` built from the per-variable contingencies,
        the class frequencies and the table's domain.
    :raises TypeError: if `table` is not an Orange.data.Storage.
    :raises NotImplementedError: if any variable is not discrete.
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    for var in table.domain.variables:
        if not isinstance(var, DiscreteVariable):
            raise NotImplementedError(
                "Only discrete variables are supported.")

    cont = contingency.get_contingencies(table)
    class_cont = contingency.get_contingency(table, table.domain.class_var)
    class_freq = np.diag(class_cont)
    return NaiveBayesModel(cont, class_freq, table.domain)
def __call__(self, data, attribute):
    """
    Discretize `attribute` using entropy-based cut points.

    :return: The discretized variable, or None when no cut is found.
    """
    cont = contingency.get_contingency(data, attribute)
    values = cont.values
    I = cont.counts.T
    cut_ind = np.array(_entropy_discretize_sorted(I, self.force))
    if not len(cut_ind) > 0:
        return None
    # "the midpoint between each successive pair of examples" (FI p.1)
    upper = values[cut_ind]
    lower = values[cut_ind - 1]
    points = (upper + lower) / 2.
    return _discretized_var(data, attribute, points)
def __init__(self, data, attr1, attr2):
    """
    Compute chi-square statistics for the contingency of two attributes.

    Sets `observed`, `n`, `probs_x`, `probs_y`, `expected`, `residuals`,
    `chisqs`, `chisq` and `p`.

    :param data: Data table.
    :param attr1: First attribute passed to `get_contingency`.
    :param attr2: Second attribute passed to `get_contingency`.
    """
    self.observed = get_contingency(data, attr1, attr2)
    self.n = np.sum(self.observed)
    self.probs_x = self.observed.sum(axis=0) / self.n
    self.probs_y = self.observed.sum(axis=1) / self.n
    self.expected = np.outer(self.probs_y, self.probs_x) * self.n
    # A zero expected count makes the division 0/0. Such cells contribute
    # nothing to the statistic, so their residual is set to 0 instead of
    # letting NaN propagate into `chisq` and `p`.
    with np.errstate(divide="ignore", invalid="ignore"):
        self.residuals = \
            (self.observed - self.expected) / np.sqrt(self.expected)
    self.residuals = np.nan_to_num(self.residuals)
    self.chisqs = self.residuals ** 2
    self.chisq = float(np.sum(self.chisqs))
    self.p = chi2.sf(
        self.chisq, (len(self.probs_x) - 1) * (len(self.probs_y) - 1))
def compute_box_data(self):
    """
    Return the data to plot together with its labels.

    With a split variable, returns the contingency of the attribute by
    the split variable and the split variable's values. Otherwise returns
    a one-element list holding the attribute's distribution and the
    label list ``[""]``.
    """
    if not self.split_var:
        dist = distribution.get_distribution(self.dataset, self.attribute)
        return [dist], [""]
    conts = contingency.get_contingency(
        self.dataset, self.attribute, self.split_var)
    return conts, self.split_var.values
def __call__(self, data, attribute):
    """
    Discretize `attribute` using entropy-based cut points.

    :return: The discretized variable, or None when no cut is found.
    """
    from Orange.statistics import contingency as c

    values, I = join_contingency(c.get_contingency(data, attribute))
    cut_ind = numpy.array(entropy_discretize_sorted(I))
    if not len(cut_ind) > 0:
        return None
    # Each cut point is the value just before the cut index.
    points = values[cut_ind - 1]
    return disc._discretized_var(data, attribute, points)
def draw_distributions(self):
    """
    Draw distributions with discrete attributes.

    For every displayed discrete attribute, one bar per
    (attribute value, class value) pair is attached to the plot. Bar
    width is proportional to the count relative to the largest count
    over all cached contingencies. Does nothing unless distributions are
    enabled, data is present and the class is discrete.
    """
    if not (self.show_distributions and self.have_data
            and self.data_has_discrete_class):
        return

    class_ = self.data_domain.class_var
    class_count = len(class_.values)

    # Contingencies are computed once and cached; only discrete
    # attributes (plus the class itself) get an entry.
    if self.domain_contingencies is None:
        discrete_attrs = [attr for attr in self.data_domain
                          if isinstance(attr, DiscreteVariable)]
        self.domain_contingencies = dict(
            zip(discrete_attrs,
                get_contingencies(self.raw_data, skipContinuous=True)))
        self.domain_contingencies[class_] = get_contingency(
            self.raw_data, class_, class_)

    max_count = max(
        [cont.max() for cont in self.domain_contingencies.values()] or [1])
    if not max_count:
        # All counts are zero; avoid dividing by zero below.
        max_count = 1
    sorted_class_values = get_variable_values_sorted(class_)

    for axis_idx, attr_idx in enumerate(self.attribute_indices):
        attr = self.data_domain[attr_idx]
        # Only discrete attributes have values and a cached contingency.
        if not isinstance(attr, DiscreteVariable):
            continue

        attr_cont = self.domain_contingencies[attr]
        attr_len = len(attr.values)
        sorted_variable_values = get_variable_values_sorted(attr)

        # create bar curve
        for j in range(attr_len):
            attribute_value = sorted_variable_values[j]
            value_count = attr_cont[:, attribute_value]
            y_off = float(1.0 + 2.0 * j) / float(2 * attr_len)

            for i in range(class_count):
                class_value = sorted_class_values[i]

                color = QColor(self.discrete_palette[i])
                color.setAlpha(self.alpha_value)

                width = float(value_count[class_value] * 0.5) \
                    / float(max_count)
                height = 0.7 / float(class_count * attr_len)
                y_low_bottom = \
                    y_off + float(class_count * height) / 2.0 - i * height

                curve = PolygonCurve(
                    QPen(color), QBrush(color),
                    xData=[axis_idx, axis_idx + width,
                           axis_idx + width, axis_idx],
                    yData=[y_low_bottom, y_low_bottom,
                           y_low_bottom - height, y_low_bottom - height],
                    tooltip=attr.name,
                )
                curve.attach(self)
def test_get_contingency(self):
    """Contingencies on sparse data, looked up by index, name and variable."""
    d = self._construct_sparse()

    # Attribute given by index.
    cont = contingency.get_contingency(d, 5)
    self.assertIsInstance(cont, contingency.Discrete)
    np.testing.assert_almost_equal(cont[0], [1, 0, 0])
    np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
    np.testing.assert_almost_equal(cont[2], [1, 0, 0])

    # Attribute given by name.
    cont = contingency.get_contingency(d, "c4")
    self.assertIsInstance(cont, contingency.Continuous)
    np.testing.assert_almost_equal(cont[0], [[], []])
    np.testing.assert_almost_equal(cont["b"], [[1], [1]])
    np.testing.assert_almost_equal(cont[2], [[2], [1]])

    # Attribute given as a variable object.
    cont = contingency.get_contingency(d, d.domain[13])
    self.assertIsInstance(cont, contingency.Continuous)
    np.testing.assert_almost_equal(cont[0], [[1.1], [1]])
    np.testing.assert_almost_equal(cont["b"], [[1], [1]])
    np.testing.assert_almost_equal(cont[2], [[], []])
def fit_storage(self, table):
    """
    Fit a naive Bayes model with add-one smoothing.

    :param table: Data storage with discrete variables only.
    :return: ``NaiveBayesModel`` built from the log conditional
        probabilities, the class probabilities and the table's domain.
    :raises TypeError: if `table` is not an Orange.data.Storage.
    :raises NotImplementedError: if any variable is not discrete.
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    for var in table.domain.variables:
        if not var.is_discrete:
            raise NotImplementedError(
                "Only discrete variables are supported.")

    cont = contingency.get_contingencies(table)
    class_cont = contingency.get_contingency(table, table.domain.class_var)
    class_freq = np.array(np.diag(class_cont))
    class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))

    log_cont_prob = []
    for c in cont:
        counts = np.array(c)
        smoothed = (counts + 1) / (
            np.sum(counts, axis=0)[None, :] + c.shape[0])
        log_cont_prob.append(np.log(smoothed / class_prob[:, None]))

    return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
def rank(self):
    """
    Score every pair of attributes and fill the projection table model.

    Continuous attributes are first discretized into 10 equal-width bins.
    For each attribute pair the contingency table is scored with
    `p_index`, and the score and both attribute names are written to one
    row of `projectionTableModel`. Does nothing while a previous ranking
    is still in progress.
    """
    if self.progress:
        return

    disc = Orange.preprocess.EqualWidth(n=10)
    # isinstance also covers subclasses of ContinuousVariable.
    ndomain = Orange.data.Domain(
        [disc(self.data, attr)
         if isinstance(attr, Orange.data.variable.ContinuousVariable)
         else attr
         for attr in self.data.domain.attributes],
        self.data.domain.class_vars)
    t = self.data.from_table(ndomain, self.data)
    attrs = t.domain.attributes
    n_attrs = len(attrs)

    # n * (n - 1) is always even, so integer division is exact.
    self.progress = gui.ProgressBar(self, n_attrs * (n_attrs - 1) // 2)
    try:
        row = 0
        for i in range(n_attrs):
            for j in range(i):
                ct = np.array(
                    contingency.get_contingency(t, attrs[j], attrs[i]))
                pindex, _, _ = p_index(ct)
                cells = (float(pindex), attrs[i].name, attrs[j].name)
                for column, value in enumerate(cells):
                    item = QStandardItem()
                    item.setData(value, Qt.DisplayRole)
                    self.projectionTableModel.setItem(row, column, item)
                self.progress.advance()
                row += 1
    finally:
        # Always release the progress bar; otherwise a failure would leave
        # `self.progress` set and block every later call.
        self.progress.finish()
        self.progress = None
def __init__(self, data, attr1, attr2):
    """
    Compute chi-square statistics for the contingency of two attributes.

    When either attribute is discrete and has no values, only `p` is set
    (to NaN). Otherwise sets `observed`, `n`, `probs_x`, `probs_y`,
    `expected`, `residuals`, `chisqs`, `chisq` and `p`.

    :param data: Data table.
    :param attr1: First attribute, looked up in `data.domain`.
    :param attr2: Second attribute, looked up in `data.domain`.
    """
    attr1 = data.domain[attr1]
    attr2 = data.domain[attr2]

    def has_no_values(var):
        return var.is_discrete and not var.values

    if has_no_values(attr1) or has_no_values(attr2):
        self.p = np.nan
        return

    self.observed = get_contingency(data, attr1, attr2)
    self.n = np.sum(self.observed)
    self.probs_x = self.observed.sum(axis=0) / self.n
    self.probs_y = self.observed.sum(axis=1) / self.n
    self.expected = np.outer(self.probs_y, self.probs_x) * self.n

    deviation = self.observed - self.expected
    self.residuals = deviation / np.sqrt(self.expected)
    # Undefined residuals (zero expected count) are replaced by numbers.
    self.residuals = np.nan_to_num(self.residuals)
    self.chisqs = self.residuals ** 2
    self.chisq = float(np.sum(self.chisqs))

    dof = (len(self.probs_x) - 1) * (len(self.probs_y) - 1)
    self.p = chi2.sf(self.chisq, dof)
def _setup(self):
    """
    Rebuild the plots for the currently selected variable.

    Clears both plots and the legend, resolves the selected variable and
    group variable, refills the `show_prob` control for the group
    variable, optionally discretizes the data, and then displays either
    a contingency or a distribution.
    """
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    selected = self.variable_idx
    self.var = self.cvar = None
    if selected >= 0:
        self.var = self.varmodel[selected]
    # Only a group index above 0 selects a group variable.
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]
        combo = self.controls.show_prob
        combo.clear()
        combo.addItem("(None)")
        combo.addItems(self.cvar.values)
        combo.addItem("(All)")
        # Clamp the stored choice to the items just added.
        upper = len(self.cvar.values) + 1
        self.show_prob = min(max(self.show_prob, 0), upper)

    table = self.data
    self._setup_smoothing()
    if self.var is None:
        return

    if self.disc_cont:
        kept_vars = [self.var, self.cvar] if self.cvar else [self.var]
        table = Orange.data.Table(Orange.data.Domain(kept_vars), table)
        binning = EqualWidth(n=self.bins[self.smoothing_index])
        discretizer = Discretize(method=binning, remove_const=False)
        table = discretizer(table)
        self.var = table.domain[0]

    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    self.controls.cumulative_distr.setDisabled(not self.var.is_continuous)

    if not self.cvar:
        self.distributions = distribution.get_distribution(table, self.var)
        self.display_distribution()
    else:
        self.contingencies = contingency.get_contingency(
            table, self.var, self.cvar)
        self.display_contingency()
    self.plot.autoRange()
def _setup(self):
    """
    Rebuild the plot for the currently selected variable.

    Clears the plot, resolves the selected variable and group variable,
    updates the axis name and relative-frequency control, and then
    displays either a contingency or a distribution.
    """
    self.plot.clear()

    selected = self.variable_idx
    self.var = self.cvar = None
    if selected >= 0:
        self.var = self.varmodel[selected]
    # Only a group index above 0 selects a group variable.
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]

    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    if self.var is None:
        return

    if not self.cvar:
        self.distributions = distribution.get_distribution(
            self.data, self.var)
        self.display_distribution()
    else:
        self.contingencies = contingency.get_contingency(
            self.data, self.var, self.cvar)
        self.display_contingency()
def update_XY(self):
    """
    Recompute the correspondence analysis and refill both axis combos.

    :return: The number of row-factor columns, or None when no variables
        are selected (the combos are then left empty).
    """
    for combo in (self.axis_x_cb, self.axis_y_cb):
        combo.clear()

    ca_vars = self.selected_vars()
    if not len(ca_vars):
        return

    # Exactly two variables use a plain contingency; any other count
    # uses the Burt table.
    if len(ca_vars) == 2:
        ctable = contingency.get_contingency(self.data, *ca_vars[::-1])
    else:
        _, ctable = burt_table(self.data, ca_vars)

    self.ca = correspondence(ctable)
    rfs = self.ca.row_factors.shape[1]
    axes = ["{}".format(n) for n in range(1, rfs + 1)]
    for combo in (self.axis_x_cb, self.axis_y_cb):
        combo.addItems(axes)
    return rfs
def __init__(self, data, attr1, attr2):
    """
    Compute chi-square statistics for the contingency of two attributes.

    When either attribute is discrete and has no values, only `p` is set
    (to NaN). Otherwise sets `observed`, `n`, `probs_x`, `probs_y`,
    `expected`, `residuals`, `chisqs`, `chisq` and `p`.

    :param data: Data table.
    :param attr1: First attribute, looked up in `data.domain`.
    :param attr2: Second attribute, looked up in `data.domain`.
    """
    attr1 = data.domain[attr1]
    attr2 = data.domain[attr2]
    for attr in (attr1, attr2):
        if attr.is_discrete and not attr.values:
            self.p = np.nan
            return

    self.observed = get_contingency(data, attr1, attr2)
    self.n = np.sum(self.observed)
    # pylint: disable=unexpected-keyword-arg
    col_totals = self.observed.sum(axis=0)
    row_totals = self.observed.sum(axis=1)
    self.probs_x = col_totals / self.n
    self.probs_y = row_totals / self.n
    self.expected = np.outer(self.probs_y, self.probs_x) * self.n

    # Zero expected counts make the division undefined; the warnings are
    # silenced and the resulting values replaced by numbers.
    with np.errstate(divide="ignore", invalid="ignore"):
        raw_residuals = \
            (self.observed - self.expected) / np.sqrt(self.expected)
        self.residuals = raw_residuals
    self.residuals = np.nan_to_num(self.residuals)
    self.chisqs = self.residuals ** 2
    self.chisq = float(np.sum(self.chisqs))

    dof = (len(self.probs_x) - 1) * (len(self.probs_y) - 1)
    self.p = chi2.sf(self.chisq, dof)
def _setup(self):
    """
    Set up the plot for the current selection.

    Shows a contingency when the group variable is discrete and a plain
    distribution otherwise. Does nothing beyond clearing the plot when no
    variable is selected.
    """
    self.plot.clear()

    selected = self.variable_idx
    var = self.varmodel[selected] if selected >= 0 else None
    cvar = None
    if self.groupvar_idx >= 0:
        cvar = self.groupvarmodel[self.groupvar_idx]

    if var is None:
        return

    if not is_discrete(cvar):
        dist = distribution.get_distribution(self.data, var)
        self.set_distribution(dist, var)
        return

    cont = contingency.get_contingency(self.data, var, cvar)
    self.set_contingency(cont, var, cvar)
def _update_CA(self):
    """
    Recompute the correspondence analysis and refresh the view.

    Refills both axis combo boxes with the component numbers, then sets
    up the plot and updates the info. Does nothing when no variables are
    selected.
    """
    ca_vars = self.selected_vars()
    if not len(ca_vars):
        return

    # Exactly two variables use a plain contingency; any other count
    # uses the Burt table.
    if len(ca_vars) == 2:
        ctable = contingency.get_contingency(self.data, *ca_vars[::-1])
    else:
        _, ctable = burt_table(self.data, ca_vars)

    self.ca = correspondence(ctable)
    n_components = self.ca.row_factors.shape[1]
    axes = ["{}".format(n) for n in range(1, n_components + 1)]

    self.axis_x_cb.clear()
    self.axis_x_cb.addItems(axes)
    self.axis_y_cb.clear()
    self.axis_y_cb.addItems(axes)

    # NOTE(review): this self-assignment looks like a no-op; presumably it
    # re-triggers the attribute setters so the refilled combos show the
    # stored components again - confirm before removing.
    self.component_x, self.component_y = self.component_x, self.component_y

    self._setup_plot()
    self._update_info()
def compute_box_data(self):
    """
    Compute the statistics and labels for the current attribute.

    Sets `is_continuous`, `dist`, `conts`, `label_txts_all`, and, for a
    continuous attribute, `stats`. Groups without any instances are left
    out. Finally, `stats` and `label_txts` are restricted to entries whose
    `n` is positive. Does nothing when no attribute is selected.
    """
    attr = self.attribute
    if not attr:
        return

    dataset = self.dataset
    self.is_continuous = attr.is_continuous

    # Nothing to show: no data, or a discrete attribute / group variable
    # that has no values.
    if (dataset is None
            or (not self.is_continuous and not attr.values)
            or (self.group_var and not self.group_var.values)):
        self.stats = self.dist = self.conts = []
        return

    if not self.group_var:
        self.dist = distribution.get_distribution(dataset, attr)
        self.conts = []
        if self.is_continuous:
            self.stats = [BoxData(self.dist, attr, None)]
        self.label_txts_all = [""]
    else:
        self.dist = []
        self.conts = contingency.get_contingency(
            dataset, attr, self.group_var)
        if not self.is_continuous:
            self.label_txts_all = [
                v for v, c in zip(self.group_var.values, self.conts)
                if np.sum(c) > 0]
        else:
            # Keep only the groups whose counts sum to a non-zero value.
            nonempty = [(i, cont) for i, cont in enumerate(self.conts)
                        if np.sum(cont[1])]
            self.stats = [BoxData(cont, attr, i, self.group_var)
                          for i, cont in nonempty]
            self.label_txts_all = [self.group_var.values[i]
                                   for i, _ in nonempty]

    self.label_txts = [
        txts for stat, txts in zip(self.stats, self.label_txts_all)
        if stat.n > 0]
    self.stats = [stat for stat in self.stats if stat.n > 0]
def updateGraph(self, *args):
    """
    Redraw the sieve diagram for the selected attribute pair.

    All items are removed from the canvas first. Nothing else is drawn
    when there is no data or either of `attrX` / `attrY` is unset.
    Otherwise the marginal counts of both attributes and their joint
    counts are computed, the chi-square statistic is accumulated, and one
    rectangle per (x value, y value) pair with non-zero marginals is
    drawn with a tooltip and axis labels. The rectangles are stored in
    `self.areas`.

    :param args: Ignored.
    """
    for item in self.canvas.items():
        self.canvas.removeItem(item)
    if self.data is None or len(self.data) == 0 or \
            self.attrX is None or self.attrY is None:
        return

    def row_totals(cont):
        # Sum every entry of the contingency. Summation stays best-effort:
        # an entry that cannot be iterated or added keeps whatever was
        # accumulated so far. Only ordinary exceptions are swallowed, so
        # KeyboardInterrupt / SystemExit still propagate.
        totals = []
        for entry in cont:
            sum_ = 0
            try:
                for val in entry:
                    sum_ += val
            except Exception:
                pass
            totals.append(sum_)
        return totals

    data = self.data[:, [self.attrX, self.attrY]]
    contX = get_contingency(data, self.attrX, self.attrX)
    contY = get_contingency(data, self.attrY, self.attrY)
    # compute contingency of x and y attributes
    valsX = row_totals(contX)
    valsY = row_totals(contY)

    x_var = data.domain[self.attrX]
    y_var = data.domain[self.attrY]
    contXY, _ = get_conditional_distribution(data, [x_var, y_var])
    x_values = x_var.values
    y_values = y_var.values

    def cell_key(i, j):
        # Key of the (x value, y value) pair in `contXY` and `probs`.
        return '%s-%s' % (x_values[i], y_values[j])

    # compute probabilities
    probs = {}
    for i, valx in enumerate(valsX):
        for j, valy in enumerate(valsY):
            key = cell_key(i, j)
            try:
                actualProb = contXY[key]
            except Exception:
                # Pairs without a joint count are treated as zero.
                actualProb = 0
            probs[key] = ((x_values[i], valx), (y_values[j], valy),
                          actualProb, len(data))

    # get text width of Y labels
    max_ylabel_w = 0
    for j in range(len(valsY)):
        xl = CanvasText(self.canvas, "", 0, 0,
                        html_text=getHtmlCompatibleString(y_values[j]),
                        show=False)
        max_ylabel_w = max(int(xl.boundingRect().width()), max_ylabel_w)
    max_ylabel_w = min(max_ylabel_w, 200)  # upper limit for label widths

    # get text width of Y attribute name
    text = CanvasText(self.canvas, y_var.name, x=0, y=0, bold=1, show=0,
                      vertical=True)
    xOff = int(text.boundingRect().height() + max_ylabel_w)
    yOff = 55
    sqareSize = min(self.canvasView.width() - xOff - 35,
                    self.canvasView.height() - yOff - 50)
    sqareSize = max(sqareSize, 10)
    self.canvasView.setSceneRect(0, 0, self.canvasView.width(),
                                 self.canvasView.height())

    # print graph name
    name = "<b>P(%s, %s) ≠ P(%s)×P(%s)</b>" % (
        self.attrX, self.attrY, self.attrX, self.attrY)
    CanvasText(self.canvas, "", xOff + sqareSize / 2, 20, Qt.AlignCenter,
               html_text=name)
    CanvasText(self.canvas, "N = " + str(len(data)), xOff + sqareSize / 2,
               38, Qt.AlignCenter, bold=0)

    ######################
    # compute chi-square
    chisquare = 0.0
    for i in range(len(valsX)):
        for j in range(len(valsY)):
            ((xAttr, xVal), (yAttr, yVal), actual, sum_) = \
                probs[cell_key(i, j)]
            expected = float(xVal * yVal) / float(sum_)
            if expected == 0:
                continue
            pearson2 = (actual - expected) * (actual - expected) / expected
            chisquare += pearson2

    ######################
    # draw rectangles
    currX = xOff
    max_xlabel_h = 0
    normX, normY = sum(valsX), sum(valsY)
    self.areas = []
    for i in range(len(valsX)):
        if valsX[i] == 0:
            continue
        currY = yOff
        width = int(float(sqareSize * valsX[i]) / float(normX))

        # this way we sort y values correctly
        for j in range(len(valsY) - 1, -1, -1):
            ((xAttr, xVal), (yAttr, yVal), actual, sum_) = \
                probs[cell_key(i, j)]
            if valsY[j] == 0:
                continue
            height = int(float(sqareSize * valsY[j]) / float(normY))

            # create rectangle
            selected = len(self.areas) in self.selection
            rect = CanvasRectangle(
                self.canvas, currX + 2, currY + 2, width - 4, height - 4,
                z=-10, onclick=self.select_area)
            rect.value_pair = i, j
            self.areas.append(rect)
            self.addRectIndependencePearson(
                rect, currX + 2, currY + 2, width - 4, height - 4,
                (xAttr, xVal), (yAttr, yVal), actual, sum_,
                width=1 + 3 * selected,
                # Ugly! This is needed since resize redraws the graph!
                # When this is handled by resizing just the viewer,
                # update_selection will take care of this
            )

            # Both marginals are non-zero here, so expected > 0.
            expected = float(xVal * yVal) / float(sum_)
            pearson = (actual - expected) / sqrt(expected)
            tooltipText = """<b>X Attribute: %s</b><br>Value: <b>%s</b><br>Number of instances (p(x)): <b>%d (%.2f%%)</b><hr> <b>Y Attribute: %s</b><br>Value: <b>%s</b><br>Number of instances (p(y)): <b>%d (%.2f%%)</b><hr> <b>Number Of Instances (Probabilities):</b><br>Expected (p(x)p(y)): <b>%.1f (%.2f%%)</b><br>Actual (p(x,y)): <b>%d (%.2f%%)</b> <hr><b>Statistics:</b><br>Chi-square: <b>%.2f</b><br>Standardized Pearson residual: <b>%.2f</b>""" % (
                self.attrX, getHtmlCompatibleString(xAttr), xVal,
                100.0 * float(xVal) / float(sum_),
                self.attrY, getHtmlCompatibleString(yAttr), yVal,
                100.0 * float(yVal) / float(sum_),
                expected, 100.0 * float(xVal * yVal) / float(sum_ * sum_),
                actual, 100.0 * float(actual) / float(sum_),
                chisquare, pearson)
            rect.setToolTip(tooltipText)

            currY += height
            # Y value labels are drawn only next to the first column.
            if currX == xOff:
                CanvasText(self.canvas, "", xOff, currY - height / 2,
                           Qt.AlignRight | Qt.AlignVCenter,
                           html_text=getHtmlCompatibleString(y_values[j]))

        xl = CanvasText(self.canvas, "", currX + width / 2,
                        yOff + sqareSize, Qt.AlignHCenter | Qt.AlignTop,
                        html_text=getHtmlCompatibleString(x_values[i]))
        max_xlabel_h = max(int(xl.boundingRect().height()), max_xlabel_h)

        currX += width

    # show attribute names
    CanvasText(self.canvas, self.attrY, 0, yOff + sqareSize / 2,
               Qt.AlignLeft | Qt.AlignVCenter, bold=1, vertical=True)
    CanvasText(self.canvas, self.attrX, xOff + sqareSize / 2,
               yOff + sqareSize + max_xlabel_h,
               Qt.AlignHCenter | Qt.AlignTop, bold=1)
def updateGraph(self, *args):
    """
    Redraw the sieve diagram for the selected attribute pair.

    All items are removed from the canvas first. Nothing else is drawn
    when there is no data, either of `attrX` / `attrY` is unset, the
    conditional data is empty, or the canvas is too small. Otherwise the
    marginal counts of both attributes and their joint counts are
    computed, the chi-square statistic is accumulated, and one rectangle
    per (x value, y value) pair with non-zero marginals is drawn with a
    tooltip and axis labels.

    :param args: Ignored.
    """
    for item in self.canvas.items():
        self.canvas.removeItem(item)  # remove all canvas items
    if not self.data:
        return
    if not self.attrX or not self.attrY:
        return
    data = self.getConditionalData()
    if not data or len(data) == 0:
        return

    def row_totals(cont):
        # Sum every entry of the contingency. Summation stays best-effort:
        # an entry that cannot be iterated or added keeps whatever was
        # accumulated so far. Only ordinary exceptions are swallowed, so
        # KeyboardInterrupt / SystemExit still propagate.
        totals = []
        for entry in cont:
            sum_ = 0
            try:
                for val in entry:
                    sum_ += val
            except Exception:
                pass
            totals.append(sum_)
        return totals

    contX = get_contingency(data, self.attrX, self.attrX)
    contY = get_contingency(data, self.attrY, self.attrY)
    # compute contingency of x and y attributes
    valsX = row_totals(contX)
    valsY = row_totals(contY)

    x_var = data.domain[self.attrX]
    y_var = data.domain[self.attrY]
    contXY = self.getConditionalDistributions(data, [x_var, y_var])
    x_values = x_var.values
    y_values = y_var.values

    def cell_key(i, j):
        # Key of the (x value, y value) pair in `contXY` and `probs`.
        return '%s-%s' % (x_values[i], y_values[j])

    # compute probabilities
    probs = {}
    for i, valx in enumerate(valsX):
        for j, valy in enumerate(valsY):
            key = cell_key(i, j)
            try:
                actualProb = contXY[key]
            except Exception:
                # Pairs without a joint count are treated as zero.
                actualProb = 0
            probs[key] = ((x_values[i], valx), (y_values[j], valy),
                          actualProb, len(data))

    # get text width of Y attribute name
    text = OWCanvasText(self.canvas, y_var.name, x=0, y=0, bold=1, show=0,
                        vertical=True)
    xOff = int(text.boundingRect().height() + 40)
    yOff = 50
    sqareSize = min(self.canvasView.width() - xOff - 35,
                    self.canvasView.height() - yOff - 30)
    if sqareSize < 0:
        return  # canvas is too small to draw rectangles
    self.canvasView.setSceneRect(0, 0, self.canvasView.width(),
                                 self.canvasView.height())

    # print graph name
    if self.attrCondition == "(None)":
        name = "<b>P(%s, %s) ≠ P(%s)×P(%s)</b>" % (
            self.attrX, self.attrY, self.attrX, self.attrY)
    else:
        name = "<b>P(%s, %s | %s = %s) ≠ P(%s | %s = %s)×P(%s | %s = %s)</b>" % (
            self.attrX, self.attrY, self.attrCondition,
            getHtmlCompatibleString(self.attrConditionValue),
            self.attrX, self.attrCondition,
            getHtmlCompatibleString(self.attrConditionValue),
            self.attrY, self.attrCondition,
            getHtmlCompatibleString(self.attrConditionValue))
    OWCanvasText(self.canvas, "", xOff + sqareSize/2, 20, Qt.AlignCenter,
                 htmlText=name)
    OWCanvasText(self.canvas, "N = " + str(len(data)), xOff + sqareSize/2,
                 38, Qt.AlignCenter, bold=0)

    ######################
    # compute chi-square
    chisquare = 0.0
    for i in range(len(valsX)):
        for j in range(len(valsY)):
            ((xAttr, xVal), (yAttr, yVal), actual, sum_) = \
                probs[cell_key(i, j)]
            expected = float(xVal*yVal)/float(sum_)
            if expected == 0:
                continue
            pearson2 = (actual - expected)*(actual - expected) / expected
            chisquare += pearson2

    ######################
    # draw rectangles
    currX = xOff
    max_ylabel_w = 0
    normX, normY = sum(valsX), sum(valsY)
    for i in range(len(valsX)):
        if valsX[i] == 0:
            continue
        currY = yOff
        width = int(float(sqareSize * valsX[i])/float(normX))

        # this way we sort y values correctly
        for j in range(len(valsY)-1, -1, -1):
            ((xAttr, xVal), (yAttr, yVal), actual, sum_) = \
                probs[cell_key(i, j)]
            if valsY[j] == 0:
                continue
            height = int(float(sqareSize * valsY[j])/float(normY))

            # create rectangle
            rect = OWCanvasRectangle(self.canvas, currX+2, currY+2,
                                     width-4, height-4, z=-10)
            self.addRectIndependencePearson(
                rect, currX+2, currY+2, width-4, height-4,
                (xAttr, xVal), (yAttr, yVal), actual, sum_)

            # Both marginals are non-zero here, so expected > 0.
            expected = float(xVal*yVal)/float(sum_)
            pearson = (actual - expected) / sqrt(expected)
            tooltipText = """<b>X Attribute: %s</b><br>Value: <b>%s</b><br>Number of examples (p(x)): <b>%d (%.2f%%)</b><hr> <b>Y Attribute: %s</b><br>Value: <b>%s</b><br>Number of examples (p(y)): <b>%d (%.2f%%)</b><hr> <b>Number Of Examples (Probabilities):</b><br>Expected (p(x)p(y)): <b>%.1f (%.2f%%)</b><br>Actual (p(x,y)): <b>%d (%.2f%%)</b> <hr><b>Statistics:</b><br>Chi-square: <b>%.2f</b><br>Standardized Pearson residual: <b>%.2f</b>""" % (
                self.attrX, getHtmlCompatibleString(xAttr), xVal,
                100.0*float(xVal)/float(sum_),
                self.attrY, getHtmlCompatibleString(yAttr), yVal,
                100.0*float(yVal)/float(sum_),
                expected, 100.0*float(xVal*yVal)/float(sum_*sum_),
                actual, 100.0*float(actual)/float(sum_),
                chisquare, pearson)
            rect.setToolTip(tooltipText)

            currY += height
            # Y value labels are drawn only next to the first column.
            if currX == xOff:
                xl = OWCanvasText(
                    self.canvas, "", xOff - 10, currY - height/2,
                    Qt.AlignRight | Qt.AlignVCenter,
                    htmlText=getHtmlCompatibleString(y_values[j]))
                max_ylabel_w = max(int(xl.boundingRect().width()),
                                   max_ylabel_w)

        OWCanvasText(self.canvas, "", currX + width/2,
                     yOff + sqareSize + 5, Qt.AlignCenter,
                     htmlText=getHtmlCompatibleString(x_values[i]))
        currX += width

    # show attribute names
    OWCanvasText(self.canvas, self.attrY, max(xOff-20-max_ylabel_w, 20),
                 yOff + sqareSize/2, Qt.AlignRight | Qt.AlignVCenter,
                 bold=1, vertical=True)
    OWCanvasText(self.canvas, self.attrX, xOff + sqareSize/2,
                 yOff + sqareSize + 15, Qt.AlignCenter, bold=1)
def updateGraph(self, *args):
    """Clear the canvas and redraw the X-by-Y independence diagram.

    Nothing is drawn when there is no data or when either attribute is
    unset.  Otherwise one rectangle is created for every pair of X and Y
    values whose marginal counts are both non-zero.  Rectangle width and
    height are proportional to the marginal count of the X and Y value,
    and the tooltip reports expected vs. actual counts, the overall
    chi-square and the cell's standardized Pearson residual.  The
    rectangles are stored in ``self.areas`` in drawing order.

    :param args: unused.
    """
    for item in self.canvas.items():
        self.canvas.removeItem(item)
    if self.data is None or len(self.data) == 0 or \
            self.attrX is None or self.attrY is None:
        return

    def marginal_counts(cont):
        # Total of every row of a contingency.  A row that cannot be
        # iterated (TypeError) counts as 0 instead of aborting the redraw.
        totals = []
        for entry in cont:
            try:
                totals.append(sum(entry))
            except TypeError:
                totals.append(0)
        return totals

    data = self.data[:, [self.attrX, self.attrY]]
    x_var = data.domain[self.attrX]
    y_var = data.domain[self.attrY]
    x_values, y_values = x_var.values, y_var.values
    n_instances = len(data)

    # marginal counts of the x and y attribute values
    valsX = marginal_counts(get_contingency(data, self.attrX, self.attrX))
    valsY = marginal_counts(get_contingency(data, self.attrY, self.attrY))

    contXY, _ = get_conditional_distribution(data, [x_var, y_var])

    # Per-cell statistics, keyed by (x index, y index) so that value names
    # containing '-' cannot collide.
    probs = {}
    for i, x_count in enumerate(valsX):
        for j, y_count in enumerate(valsY):
            try:
                actual = contXY['%s-%s' % (x_values[i], y_values[j])]
            except KeyError:
                # value combination missing from the distribution
                actual = 0
            probs[i, j] = ((x_values[i], x_count),
                           (y_values[j], y_count),
                           actual, n_instances)

    # get text width of Y labels
    max_ylabel_w = 0
    for j in range(len(valsY)):
        label = CanvasText(
            self.canvas, "", 0, 0,
            html_text=getHtmlCompatibleString(y_values[j]), show=False)
        max_ylabel_w = max(int(label.boundingRect().width()), max_ylabel_w)
    max_ylabel_w = min(max_ylabel_w, 200)  # upper limit for label widths

    # get text width of Y attribute name
    text = CanvasText(self.canvas, y_var.name, x=0, y=0, bold=1, show=0,
                      vertical=True)
    xOff = int(text.boundingRect().height() + max_ylabel_w)
    yOff = 55
    sqareSize = min(self.canvasView.width() - xOff - 35,
                    self.canvasView.height() - yOff - 50)
    sqareSize = max(sqareSize, 10)
    self.canvasView.setSceneRect(0, 0, self.canvasView.width(),
                                 self.canvasView.height())

    # print graph name
    name = "<b>P(%s, %s) ≠ P(%s)×P(%s)</b>" % (
        self.attrX, self.attrY, self.attrX, self.attrY)
    CanvasText(self.canvas, "", xOff + sqareSize / 2, 20, Qt.AlignCenter,
               html_text=name)
    CanvasText(self.canvas, "N = " + str(n_instances),
               xOff + sqareSize / 2, 38, Qt.AlignCenter, bold=0)

    ######################
    # compute chi-square
    chisquare = 0.0
    for i in range(len(valsX)):
        for j in range(len(valsY)):
            (_, xVal), (_, yVal), actual, sum_ = probs[i, j]
            expected = float(xVal * yVal) / float(sum_)
            if expected == 0:
                continue
            chisquare += (actual - expected) * (actual - expected) / expected

    tooltip_template = (
        "<b>X Attribute: %s</b><br>Value: <b>%s</b><br>"
        "Number of instances (p(x)): <b>%d (%.2f%%)</b><hr> "
        "<b>Y Attribute: %s</b><br>Value: <b>%s</b><br>"
        "Number of instances (p(y)): <b>%d (%.2f%%)</b><hr> "
        "<b>Number Of Instances (Probabilities):</b><br>"
        "Expected (p(x)p(y)): <b>%.1f (%.2f%%)</b><br>"
        "Actual (p(x,y)): <b>%d (%.2f%%)</b> "
        "<hr><b>Statistics:</b><br>Chi-square: <b>%.2f</b><br>"
        "Standardized Pearson residual: <b>%.2f</b>")

    ######################
    # draw rectangles
    currX = xOff
    max_xlabel_h = 0
    normX, normY = sum(valsX), sum(valsY)
    self.areas = []
    for i, x_count in enumerate(valsX):
        if x_count == 0:
            continue
        currY = yOff
        width = int(float(sqareSize * x_count) / float(normX))

        # iterate backwards: this way we sort y values correctly
        for j in range(len(valsY) - 1, -1, -1):
            if valsY[j] == 0:
                continue
            (xAttr, xVal), (yAttr, yVal), actual, sum_ = probs[i, j]
            height = int(float(sqareSize * valsY[j]) / float(normY))

            # create rectangle
            selected = len(self.areas) in self.selection
            rect = CanvasRectangle(
                self.canvas, currX + 2, currY + 2, width - 4, height - 4,
                z=-10, onclick=self.select_area)
            rect.value_pair = i, j
            self.areas.append(rect)
            # Ugly! The pen width is passed here because resize redraws the
            # graph. When this is handled by resizing just the viewer,
            # update_selection will take care of this.
            self.addRectIndependencePearson(
                rect, currX + 2, currY + 2, width - 4, height - 4,
                (xAttr, xVal), (yAttr, yVal), actual, sum_,
                width=1 + 3 * selected)

            # Both marginals are non-zero here, so expected > 0.
            expected = float(xVal * yVal) / float(sum_)
            pearson = (actual - expected) / sqrt(expected)
            rect.setToolTip(tooltip_template % (
                self.attrX, getHtmlCompatibleString(xAttr), xVal,
                100.0 * float(xVal) / float(sum_),
                self.attrY, getHtmlCompatibleString(yAttr), yVal,
                100.0 * float(yVal) / float(sum_),
                expected, 100.0 * float(xVal * yVal) / float(sum_ * sum_),
                actual, 100.0 * float(actual) / float(sum_),
                chisquare, pearson))

            currY += height
            # Y value labels are drawn only next to the first drawn column
            if currX == xOff:
                CanvasText(self.canvas, "", xOff, currY - height / 2,
                           Qt.AlignRight | Qt.AlignVCenter,
                           html_text=getHtmlCompatibleString(y_values[j]))

        xl = CanvasText(self.canvas, "", currX + width / 2, yOff + sqareSize,
                        Qt.AlignHCenter | Qt.AlignTop,
                        html_text=getHtmlCompatibleString(x_values[i]))
        max_xlabel_h = max(int(xl.boundingRect().height()), max_xlabel_h)
        currX += width

    # show attribute names
    CanvasText(self.canvas, self.attrY, 0, yOff + sqareSize / 2,
               Qt.AlignLeft | Qt.AlignVCenter, bold=1, vertical=True)
    CanvasText(self.canvas, self.attrX, xOff + sqareSize / 2,
               yOff + sqareSize + max_xlabel_h,
               Qt.AlignHCenter | Qt.AlignTop, bold=1)
def fit_storage(self, table):
    """Build a ``BayesStorageClassifier`` from the contingencies of ``table``.

    :param table: data table; its ``domain`` must expose ``class_var``.
    :return: ``BayesStorageClassifier`` constructed from all contingencies
        of the table, the class frequencies and the table's domain.
    """
    domain = table.domain
    all_contingencies = contingency.get_contingencies(table)
    # Only the column variable is given; the diagonal of the result is used
    # as the class frequencies (presumably a class-by-class table -- confirm
    # against get_contingency's default row variable).
    class_table = contingency.get_contingency(table, domain.class_var)
    class_frequencies = np.diag(class_table)
    return BayesStorageClassifier(
        all_contingencies, class_frequencies, domain)
# Benchmark fragment: times the pairwise index computation on the continuous
# attributes (p_index) against the contingency-based variant on the discrete
# attributes (p_index_ct), then plots both timings against N.
# NOTE(review): the original indentation was lost; if this fragment sits
# inside an enclosing loop or function, re-indent it accordingly.
print(cattrs)
print(dattrs)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() can jump when the system clock changes.
t0 = time.perf_counter()
for i in range(len(cattrs)):
    for j in range(i):
        print(p_index(cdata, cattrs[i], cattrs[j]))
t1 = time.perf_counter() - t0
print('t1', t1)

t0 = time.perf_counter()
for i in range(len(dattrs)):
    for j in range(i):
        ct = np.array(
            contingency.get_contingency(ddata, dattrs[j], dattrs[i]))
        print(p_index_ct(ct))
t2 = time.perf_counter() - t0
print('t2', t2)

# record this run
Ns.append(N)
times1.append(t1)
times2.append(t2)

# plot both timing series against the number of examples
fig, ax = plt.subplots()
ax.plot(Ns, times1, '-', Ns, times2, '-')
ax.set_xlabel('Number of examples')
ax.set_ylabel('Time in seconds')
ax.set_title('Rank Scatterplots - {}'.format(dataset))
plt.xlim(min(Ns), max(Ns))
def updateGraph(self, *args):
    """Clear the canvas and redraw the X-by-Y independence diagram.

    Nothing is drawn when there is no data, when either attribute is
    unset, when ``getConditionalData()`` yields no rows, or when the view
    is too small.  Otherwise one rectangle is created for every pair of X
    and Y values whose marginal counts are both non-zero.  Rectangle width
    and height are proportional to the marginal count of the X and Y
    value, and the tooltip reports expected vs. actual counts, the overall
    chi-square and the cell's standardized Pearson residual.

    :param args: unused.
    """
    for item in self.canvas.items():
        self.canvas.removeItem(item)  # remove all canvas items
    if not self.data:
        return
    if not self.attrX or not self.attrY:
        return

    data = self.getConditionalData()
    if not data or len(data) == 0:
        return

    def marginal_counts(cont):
        # Total of every row of a contingency.  A row that cannot be
        # iterated (TypeError) counts as 0 instead of aborting the redraw.
        totals = []
        for entry in cont:
            try:
                totals.append(sum(entry))
            except TypeError:
                totals.append(0)
        return totals

    x_var = data.domain[self.attrX]
    y_var = data.domain[self.attrY]
    x_values, y_values = x_var.values, y_var.values
    n_examples = len(data)

    # marginal counts of the x and y attribute values
    valsX = marginal_counts(get_contingency(data, self.attrX, self.attrX))
    valsY = marginal_counts(get_contingency(data, self.attrY, self.attrY))

    contXY = self.getConditionalDistributions(data, [x_var, y_var])

    # Per-cell statistics, keyed by (x index, y index) so that value names
    # containing '-' cannot collide.
    probs = {}
    for i, x_count in enumerate(valsX):
        for j, y_count in enumerate(valsY):
            try:
                actual = contXY['%s-%s' % (x_values[i], y_values[j])]
            except KeyError:
                # value combination missing from the distribution
                actual = 0
            probs[i, j] = ((x_values[i], x_count),
                           (y_values[j], y_count),
                           actual, n_examples)

    # get text width of Y attribute name
    text = OWCanvasText(self.canvas, y_var.name, x=0, y=0, bold=1, show=0,
                        vertical=True)
    xOff = int(text.boundingRect().height() + 40)
    yOff = 50
    sqareSize = min(self.canvasView.width() - xOff - 35,
                    self.canvasView.height() - yOff - 30)
    if sqareSize < 0:
        return  # canvas is too small to draw rectangles
    self.canvasView.setSceneRect(0, 0, self.canvasView.width(),
                                 self.canvasView.height())

    # print graph name
    if self.attrCondition == "(None)":
        name = "<b>P(%s, %s) ≠ P(%s)×P(%s)</b>" % (
            self.attrX, self.attrY, self.attrX, self.attrY)
    else:
        name = "<b>P(%s, %s | %s = %s) ≠ P(%s | %s = %s)×P(%s | %s = %s)</b>" % (
            self.attrX, self.attrY, self.attrCondition,
            getHtmlCompatibleString(self.attrConditionValue),
            self.attrX, self.attrCondition,
            getHtmlCompatibleString(self.attrConditionValue),
            self.attrY, self.attrCondition,
            getHtmlCompatibleString(self.attrConditionValue))
    OWCanvasText(self.canvas, "", xOff + sqareSize / 2, 20, Qt.AlignCenter,
                 htmlText=name)
    OWCanvasText(self.canvas, "N = " + str(n_examples),
                 xOff + sqareSize / 2, 38, Qt.AlignCenter, bold=0)

    ######################
    # compute chi-square
    chisquare = 0.0
    for i in range(len(valsX)):
        for j in range(len(valsY)):
            (_, xVal), (_, yVal), actual, sum_ = probs[i, j]
            expected = float(xVal * yVal) / float(sum_)
            if expected == 0:
                continue
            chisquare += (actual - expected) * (actual - expected) / expected

    tooltip_template = (
        "<b>X Attribute: %s</b><br>Value: <b>%s</b><br>"
        "Number of examples (p(x)): <b>%d (%.2f%%)</b><hr> "
        "<b>Y Attribute: %s</b><br>Value: <b>%s</b><br>"
        "Number of examples (p(y)): <b>%d (%.2f%%)</b><hr> "
        "<b>Number Of Examples (Probabilities):</b><br>"
        "Expected (p(x)p(y)): <b>%.1f (%.2f%%)</b><br>"
        "Actual (p(x,y)): <b>%d (%.2f%%)</b> "
        "<hr><b>Statistics:</b><br>Chi-square: <b>%.2f</b><br>"
        "Standardized Pearson residual: <b>%.2f</b>")

    ######################
    # draw rectangles
    currX = xOff
    max_ylabel_w = 0
    normX, normY = sum(valsX), sum(valsY)
    for i, x_count in enumerate(valsX):
        if x_count == 0:
            continue
        currY = yOff
        width = int(float(sqareSize * x_count) / float(normX))

        # iterate backwards: this way we sort y values correctly
        for j in range(len(valsY) - 1, -1, -1):
            if valsY[j] == 0:
                continue
            (xAttr, xVal), (yAttr, yVal), actual, sum_ = probs[i, j]
            height = int(float(sqareSize * valsY[j]) / float(normY))

            # create rectangle
            rect = OWCanvasRectangle(self.canvas, currX + 2, currY + 2,
                                     width - 4, height - 4, z=-10)
            self.addRectIndependencePearson(
                rect, currX + 2, currY + 2, width - 4, height - 4,
                (xAttr, xVal), (yAttr, yVal), actual, sum_)

            # Both marginals are non-zero here, so expected > 0.
            expected = float(xVal * yVal) / float(sum_)
            pearson = (actual - expected) / sqrt(expected)
            rect.setToolTip(tooltip_template % (
                self.attrX, getHtmlCompatibleString(xAttr), xVal,
                100.0 * float(xVal) / float(sum_),
                self.attrY, getHtmlCompatibleString(yAttr), yVal,
                100.0 * float(yVal) / float(sum_),
                expected, 100.0 * float(xVal * yVal) / float(sum_ * sum_),
                actual, 100.0 * float(actual) / float(sum_),
                chisquare, pearson))

            currY += height
            # Y value labels are drawn only next to the first drawn column
            if currX == xOff:
                xl = OWCanvasText(
                    self.canvas, "", xOff - 10, currY - height / 2,
                    Qt.AlignRight | Qt.AlignVCenter,
                    htmlText=getHtmlCompatibleString(y_values[j]))
                max_ylabel_w = max(int(xl.boundingRect().width()),
                                   max_ylabel_w)

        OWCanvasText(self.canvas, "", currX + width / 2,
                     yOff + sqareSize + 5, Qt.AlignCenter,
                     htmlText=getHtmlCompatibleString(x_values[i]))
        currX += width

    # show attribute names
    OWCanvasText(self.canvas, self.attrY, max(xOff - 20 - max_ylabel_w, 20),
                 yOff + sqareSize / 2, Qt.AlignRight | Qt.AlignVCenter,
                 bold=1, vertical=True)
    OWCanvasText(self.canvas, self.attrX, xOff + sqareSize / 2,
                 yOff + sqareSize + 15, Qt.AlignCenter, bold=1)
def draw_distributions(self):
    """Draw per-class distribution bars on the axes of discrete attributes.

    Does nothing unless ``show_distributions`` is set, data is present and
    the domain has a discrete class.  Contingencies of the discrete
    variables are computed on first use and cached in
    ``self.domain_contingencies``.  For every displayed discrete attribute
    one ``PolygonCurve`` bar is attached per (attribute value, class value)
    pair; bar width is the pair's count scaled by the largest count found
    in any cached contingency.
    """
    if not (self.show_distributions and self.data is not None
            and self.domain.has_discrete_class):
        return

    class_ = self.domain.class_var
    class_count = len(class_.values)

    # contingencies are only computed once (requires a discrete class)
    if self.domain_contingencies is None:
        self.domain_contingencies = dict(
            zip([attr for attr in self.domain if attr.is_discrete],
                get_contingencies(self.data, skipContinuous=True)))
        self.domain_contingencies[class_] = get_contingency(
            self.data, class_, class_)

    max_count = max(
        [cont.max() for cont in self.domain_contingencies.values()] or [1])
    if max_count == 0:
        # every count is zero: avoid dividing by zero, bars get zero width
        max_count = 1
    sorted_class_values = get_variable_values_sorted(class_)

    for axis_idx, attr_idx in enumerate(self.attribute_indices):
        attr = self.domain[attr_idx]
        # Only discrete attributes have a cached contingency and `.values`;
        # everything else is skipped.
        if not attr.is_discrete:
            continue

        cont = self.domain_contingencies[attr]
        attr_len = len(attr.values)
        sorted_variable_values = get_variable_values_sorted(attr)

        # create bar curves
        for j in range(attr_len):
            attribute_value = sorted_variable_values[j]
            value_count = cont[:, attribute_value]
            # vertical centre of the band reserved for this attribute value
            y_off = float(1.0 + 2.0 * j) / float(2 * attr_len)
            for i in range(class_count):
                class_value = sorted_class_values[i]

                color = QColor(*self.colors[i])
                color.setAlpha(self.alpha_value)

                width = float(
                    value_count[class_value] * 0.5) / float(max_count)
                height = 0.7 / float(class_count * attr_len)
                y_low_bottom = y_off + float(
                    class_count * height) / 2.0 - i * height

                curve = PolygonCurve(
                    QPen(color), QBrush(color),
                    xData=[axis_idx, axis_idx + width,
                           axis_idx + width, axis_idx],
                    yData=[y_low_bottom, y_low_bottom,
                           y_low_bottom - height, y_low_bottom - height],
                    tooltip=attr.name)
                curve.attach(self)
def grid_bin(data, xvar, yvar, xbins, ybins, zvar=None):
    """Cross-tabulate ``data`` on the 2D grid given by ``xbins`` x ``ybins``.

    The inner bin edges (``xbins[1:-1]``, ``ybins[1:-1]``) are used as cut
    points for discretizing ``xvar`` and ``yvar``; the outer edges bound
    the range filter applied to the data.

    :param data: data table to bin.
    :param xvar: variable binned along x.
    :param yvar: variable binned along y.
    :param xbins: bin edges for ``xvar``, outer edges included.
    :param ybins: bin edges for ``yvar``, outer edges included.
    :param zvar: optional extra variable; when ``is_discrete(zvar)`` holds,
        one contingency is computed per value and the results are stacked
        with ``np.dstack``.
    :return: ``Tree(xbins, ybins, contingencies, None)``.
    """
    x_disc = Discretizer.create_discretized_var(data, xvar, xbins[1:-1])
    y_disc = Discretizer.create_discretized_var(data, yvar, ybins[1:-1])

    x_min, x_max = xbins[0], xbins[-1]
    y_min, y_max = ybins[0], ybins[-1]

    querydomain = [x_disc, y_disc]
    if zvar is not None:
        querydomain = querydomain + [zvar]
    querydomain = Orange.data.Domain(querydomain)

    def interval_filter(var, low, high):
        # rows whose `var` lies between `low` and `high`
        return Orange.data.filter.Values([
            Orange.data.filter.FilterContinuous(
                var, max=high, min=low,
                oper=Orange.data.filter.FilterContinuous.Between)
        ])

    def value_filter(var, val):
        # rows whose discrete `var` equals `val`
        return Orange.data.filter.Values(
            [Orange.data.filter.FilterDiscrete(var, [val])])

    def filters_join(filters):
        # one Values filter holding the conditions of all `filters`
        return Orange.data.filter.Values(
            [cond for f in filters for cond in f.conditions])

    inf_bounds = np.isinf([x_min, x_max, y_min, y_max])
    if not all(inf_bounds):
        # at least one outer edge is finite: restrict data to the grid
        range_filter = filters_join([
            interval_filter(xvar, x_min, x_max),
            interval_filter(yvar, y_min, y_max)
        ])
        subset = range_filter(data)
    else:
        # all outer edges are infinite: no need to filter the data
        subset = data

    # The converted table does not depend on the class value, so build it
    # once instead of once per value filter.
    query_table = subset.from_table(querydomain, subset)

    if is_discrete(zvar):
        contingencies = [
            contingency.get_contingency(
                value_filter(zvar, val)(query_table),
                col_variable=y_disc, row_variable=x_disc)
            for val in zvar.values
        ]
        contingencies = np.dstack(contingencies)
    else:
        contingencies = contingency.get_contingency(
            query_table, col_variable=y_disc, row_variable=x_disc)

    contingencies = np.asarray(contingencies)
    return Tree(xbins, ybins, contingencies, None)